You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ap...@apache.org on 2016/06/08 21:40:42 UTC
[45/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/coordinate_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/coordinate_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/coordinate_matrix.hpp
new file mode 100644
index 0000000..2a24a4e
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/coordinate_matrix.hpp
@@ -0,0 +1,506 @@
+#ifndef VIENNACL_COORDINATE_MATRIX_HPP_
+#define VIENNACL_COORDINATE_MATRIX_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/coordinate_matrix.hpp
+    @brief Implementation of the coordinate_matrix class
+*/
+
+#include <map>
+#include <vector>
+#include <list>
+
+#include "viennacl/forwards.h"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+
+namespace viennacl
+{
+
+
+//provide copy-operation:
+/** @brief Copies a sparse matrix from the host to the OpenCL device (either GPU or multi-core CPU)
+  *
+  * For the requirements on the CPUMatrixT type, see the documentation of the function copy(CPUMatrixT, compressed_matrix<>).
+  * The nonzeros are distributed over 64 groups of roughly equal size; a group only ever ends at the end of a matrix row.
+  * @param cpu_matrix   A sparse matrix on the host.
+  * @param gpu_matrix   A coordinate_matrix from ViennaCL. Must be empty (size 0) or have the same dimensions as cpu_matrix.
+  */
+template<typename CPUMatrixT, typename NumericT, unsigned int AlignmentV>
+void copy(const CPUMatrixT & cpu_matrix,
+          coordinate_matrix<NumericT, AlignmentV> & gpu_matrix )
+{
+  assert( (gpu_matrix.size1() == 0 || viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+  assert( (gpu_matrix.size2() == 0 || viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+  vcl_size_t group_num = 64;
+
+  // Step 1: Determine nonzeros:
+  if ( cpu_matrix.size1() > 0 && cpu_matrix.size2() > 0 )
+  {
+    vcl_size_t num_entries = 0;
+    for (typename CPUMatrixT::const_iterator1 row_it = cpu_matrix.begin1(); row_it != cpu_matrix.end1(); ++row_it)
+      for (typename CPUMatrixT::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
+        ++num_entries;
+
+    // Step 2: Set up matrix data:
+    gpu_matrix.nonzeros_ = num_entries;
+    gpu_matrix.rows_ = cpu_matrix.size1();
+    gpu_matrix.cols_ = cpu_matrix.size2();
+
+    // host-side staging buffers (index pairs are stored interleaved as row, column):
+    viennacl::backend::typesafe_host_array<unsigned int> group_boundaries(gpu_matrix.handle3(), group_num + 1);
+    viennacl::backend::typesafe_host_array<unsigned int> coord_buffer(gpu_matrix.handle12(), 2*gpu_matrix.internal_nnz());
+    std::vector<NumericT> elements(gpu_matrix.internal_nnz());
+
+    vcl_size_t data_index = 0;
+    vcl_size_t current_fraction = 0;   // index of the group currently being filled
+
+    group_boundaries.set(0, 0);
+    for (typename CPUMatrixT::const_iterator1 row_it = cpu_matrix.begin1();  row_it != cpu_matrix.end1(); ++row_it)
+    {
+      for (typename CPUMatrixT::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
+      {
+        coord_buffer.set(2*data_index, col_it.index1());
+        coord_buffer.set(2*data_index + 1, col_it.index2());
+        elements[data_index] = *col_it;
+        ++data_index;
+      }
+
+      // Split data equally over the groups: close every group whose share, floor((k+1)/group_num * num_entries), is exceeded after this row.
+      // The product has to be formed before truncating to an integer; truncating the fraction alone yields 0 for all but the last group.
+      while (current_fraction + 1 < group_num
+             && data_index > vcl_size_t(static_cast<double>(current_fraction + 1) / static_cast<double>(group_num) * static_cast<double>(num_entries)))
+        group_boundaries.set(++current_fraction, data_index);
+    }
+
+    //write end of last group:
+    group_boundaries.set(group_num, data_index);
+
+    // create the three buffers in the memory domain of gpu_matrix, initialized from the host-side data:
+    viennacl::backend::memory_create(gpu_matrix.group_boundaries_, group_boundaries.raw_size(), traits::context(gpu_matrix.group_boundaries_), group_boundaries.get());
+    viennacl::backend::memory_create(gpu_matrix.coord_buffer_,         coord_buffer.raw_size(), traits::context(gpu_matrix.coord_buffer_),     coord_buffer.get());
+    viennacl::backend::memory_create(gpu_matrix.elements_,  sizeof(NumericT)*elements.size(), traits::context(gpu_matrix.elements_),         &(elements[0]));
+  }
+}
+
+/** @brief Uploads a sparse matrix given in the std::vector< std::map < > > format (one map from column index to value per row) to a coordinate_matrix.
+  *
+  * @param cpu_matrix   A sparse matrix on the host. Its number of columns is taken to be the largest column index in use plus one.
+  * @param gpu_matrix   A coordinate_matrix from ViennaCL
+  */
+template<typename NumericT, unsigned int AlignmentV>
+void copy(const std::vector< std::map<unsigned int, NumericT> > & cpu_matrix,
+          coordinate_matrix<NumericT, AlignmentV> & gpu_matrix )
+{
+  typedef typename std::vector< std::map<unsigned int, NumericT> >::const_iterator   RowIteratorT;
+
+  vcl_size_t largest_col = 0;   // largest column index in use; the last key of each (ordered) map is the maximum of that row
+  for (RowIteratorT row = cpu_matrix.begin(); row != cpu_matrix.end(); ++row)
+    if (!row->empty() && largest_col < row->rbegin()->first)
+      largest_col = row->rbegin()->first;
+
+  viennacl::copy(tools::const_sparse_matrix_adapter<NumericT>(cpu_matrix, cpu_matrix.size(), largest_col + 1), gpu_matrix);
+}
+
+//gpu to cpu:
+/** @brief Downloads the entries of a coordinate_matrix into a sparse matrix on the host.
+  *
+  * Requirements on the CPUMatrixT type (fulfilled by e.g. boost::numeric::ublas):
+  * - its dimensions, as reported by viennacl::traits::size1()/size2(), must already match those of gpu_matrix (checked via assert)
+  * - operator(i,j)       Write new entries via the parenthesis operator
+  *
+  * @param gpu_matrix   A coordinate_matrix from ViennaCL
+  * @param cpu_matrix   A sparse matrix on the host.
+  */
+template<typename CPUMatrixT, typename NumericT, unsigned int AlignmentV>
+void copy(const coordinate_matrix<NumericT, AlignmentV> & gpu_matrix,
+          CPUMatrixT & cpu_matrix )
+{
+  assert( (viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+  assert( (viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+  // nothing to transfer for a matrix without rows or without columns
+  if ( gpu_matrix.size1() == 0 || gpu_matrix.size2() == 0 )
+    return;
+
+  const vcl_size_t entry_count = gpu_matrix.nnz();
+
+  // host-side staging buffers: interleaved (row, column) index pairs and the associated values
+  viennacl::backend::typesafe_host_array<unsigned int> index_pairs(gpu_matrix.handle12(), 2*entry_count);
+  std::vector<NumericT> values(entry_count);
+
+  viennacl::backend::memory_read(gpu_matrix.handle12(), 0, index_pairs.raw_size(), index_pairs.get());
+  viennacl::backend::memory_read(gpu_matrix.handle(),   0, sizeof(NumericT) * values.size(), &(values[0]));
+
+  // write each (row, column, value) triplet into the host matrix
+  for (vcl_size_t k = 0; k < entry_count; ++k)
+    cpu_matrix(index_pairs[2*k], index_pairs[2*k+1]) = values[k];
+}
+
+/** @brief Downloads a coordinate_matrix into the std::vector< std::map < > > host format.
+  *
+  * @param gpu_matrix   A coordinate_matrix from ViennaCL
+  * @param cpu_matrix   A sparse matrix on the host. Either empty (it is then resized to the row count of gpu_matrix) or already holding one map per row.
+  */
+template<typename NumericT, unsigned int AlignmentV>
+void copy(const coordinate_matrix<NumericT, AlignmentV> & gpu_matrix,
+          std::vector< std::map<unsigned int, NumericT> > & cpu_matrix)
+{
+  // an empty host container is sized here; any other container must match the row count already
+  if (cpu_matrix.empty())
+    cpu_matrix.resize(gpu_matrix.size1());
+  assert(cpu_matrix.size() == gpu_matrix.size1() && bool("Matrix dimension mismatch!"));
+
+  tools::sparse_matrix_adapter<NumericT> host_view(cpu_matrix, gpu_matrix.size1(), gpu_matrix.size2());
+  copy(gpu_matrix, host_view);
+}
+
+
+//////////////////////// coordinate_matrix //////////////////////////
+/** @brief A sparse matrix, where entries are stored as triplets (i,j, val), where i and j are the row and column indices and val denotes the entry.
+  *
+  * The present implementation of coordinate_matrix suffers from poor runtime efficiency. Users are advised to use compressed_matrix in the meanwhile.
+  *
+  * @tparam NumericT    The floating point type (either float or double, checked at compile time)
+  * @tparam AlignmentV     The internal memory size for the arrays, given by (size()/AlignmentV + 1) * AlignmentV. AlignmentV must be a power of two.
+  */
+template<class NumericT, unsigned int AlignmentV /* see forwards.h */ >
+class coordinate_matrix
+{
+public:
+  typedef viennacl::backend::mem_handle                                                              handle_type;
+  typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<NumericT>::ResultType>   value_type;
+  typedef vcl_size_t                                                                                 size_type;
+
+  /** @brief Default construction of a coordinate matrix. No memory is allocated */
+  coordinate_matrix() : rows_(0), cols_(0), nonzeros_(0), group_num_(64) {}
+  /** @brief Construction of an empty coordinate matrix whose buffers are bound to the supplied context. No memory is allocated */
+  explicit coordinate_matrix(viennacl::context ctx) : rows_(0), cols_(0), nonzeros_(0), group_num_(64)
+  {
+    group_boundaries_.switch_active_handle_id(ctx.memory_type());
+    coord_buffer_.switch_active_handle_id(ctx.memory_type());
+    elements_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+    if (ctx.memory_type() == OPENCL_MEMORY)
+    {
+      group_boundaries_.opencl_handle().context(ctx.opencl_context());
+      coord_buffer_.opencl_handle().context(ctx.opencl_context());
+      elements_.opencl_handle().context(ctx.opencl_context());
+    }
+#endif
+  }
+
+  /** @brief Construction of a coordinate matrix with the supplied number of rows and columns. If the number of nonzeros is positive, memory is allocated
+      *
+      * @param rows     Number of rows
+      * @param cols     Number of columns
+      * @param nonzeros Optional number of nonzeros for memory preallocation
+      * @param ctx      Optional context in which the matrix is created (one out of multiple OpenCL contexts, CUDA, host)
+      */
+  coordinate_matrix(vcl_size_t rows, vcl_size_t cols, vcl_size_t nonzeros = 0, viennacl::context ctx = viennacl::context()) :
+    rows_(rows), cols_(cols), nonzeros_(nonzeros), group_num_(64)   //group_num_ has to be set here: it determines the size of group_boundaries_ below
+  {
+    if (nonzeros > 0)
+    {
+      viennacl::backend::memory_create(group_boundaries_, viennacl::backend::typesafe_host_array<unsigned int>().element_size() * (group_num_ + 1), ctx);
+      viennacl::backend::memory_create(coord_buffer_,     viennacl::backend::typesafe_host_array<unsigned int>().element_size() * 2 * internal_nnz(), ctx);
+      viennacl::backend::memory_create(elements_,         sizeof(NumericT) * internal_nnz(), ctx);
+    }
+    else
+    {
+      group_boundaries_.switch_active_handle_id(ctx.memory_type());
+      coord_buffer_.switch_active_handle_id(ctx.memory_type());
+      elements_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+      if (ctx.memory_type() == OPENCL_MEMORY)
+      {
+        group_boundaries_.opencl_handle().context(ctx.opencl_context());
+        coord_buffer_.opencl_handle().context(ctx.opencl_context());
+        elements_.opencl_handle().context(ctx.opencl_context());
+      }
+#endif
+    }
+  }
+
+  /** @brief Construction of a coordinate matrix with the supplied number of rows and columns in the supplied context. Does not yet allocate memory.
+      *
+      * @param rows     Number of rows
+      * @param cols     Number of columns
+      * @param ctx      Context in which to create the matrix
+      */
+  explicit coordinate_matrix(vcl_size_t rows, vcl_size_t cols, viennacl::context ctx)
+    : rows_(rows), cols_(cols), nonzeros_(0), group_num_(64)
+  {
+    group_boundaries_.switch_active_handle_id(ctx.memory_type());
+    coord_buffer_.switch_active_handle_id(ctx.memory_type());
+    elements_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+    if (ctx.memory_type() == OPENCL_MEMORY)
+    {
+      group_boundaries_.opencl_handle().context(ctx.opencl_context());
+      coord_buffer_.opencl_handle().context(ctx.opencl_context());
+      elements_.opencl_handle().context(ctx.opencl_context());
+    }
+#endif
+  }
+
+
+  /** @brief Allocate memory for the supplied number of nonzeros in the matrix. Old values are preserved. Does nothing unless new_nonzeros exceeds nnz(). */
+  void reserve(vcl_size_t new_nonzeros)
+  {
+    if (new_nonzeros > nonzeros_)  //TODO: Do we need to initialize new memory with zero?
+    {
+      handle_type coord_buffer_old;
+      handle_type elements_old;
+      viennacl::backend::memory_shallow_copy(coord_buffer_, coord_buffer_old);
+      viennacl::backend::memory_shallow_copy(elements_, elements_old);
+
+      vcl_size_t internal_new_nnz = viennacl::tools::align_to_multiple<vcl_size_t>(new_nonzeros, AlignmentV);
+      viennacl::backend::typesafe_host_array<unsigned int> size_deducer(coord_buffer_);
+      viennacl::backend::memory_create(coord_buffer_, size_deducer.element_size() * 2 * internal_new_nnz, viennacl::traits::context(coord_buffer_));
+      viennacl::backend::memory_create(elements_,     sizeof(NumericT)  * internal_new_nnz,             viennacl::traits::context(elements_));
+
+      viennacl::backend::memory_copy(coord_buffer_old, coord_buffer_, 0, 0, size_deducer.element_size() * 2 * nonzeros_);
+      viennacl::backend::memory_copy(elements_old,     elements_,     0, 0, sizeof(NumericT)  * nonzeros_);
+
+      nonzeros_ = new_nonzeros;
+    }
+  }
+
+  /** @brief Resize the matrix.
+      *
+      * @param new_size1    New number of rows
+      * @param new_size2    New number of columns
+      * @param preserve     If true, entries within the new bounds are kept when the matrix shrinks. If no dimension shrinks, the stored entries are left untouched regardless of this flag.
+      */
+  void resize(vcl_size_t new_size1, vcl_size_t new_size2, bool preserve = true)
+  {
+    assert (new_size1 > 0 && new_size2 > 0);
+
+    if (new_size1 < rows_ || new_size2 < cols_) //at least one dimension shrinks: rebuild the buffers from a cropped host copy
+    {
+      std::vector<std::map<unsigned int, NumericT> > stl_sparse_matrix;
+      if (rows_ > 0)
+        stl_sparse_matrix.resize(rows_);
+
+      if (preserve && rows_ > 0)
+        viennacl::copy(*this, stl_sparse_matrix);
+
+      stl_sparse_matrix.resize(new_size1);
+
+      //std::cout << "Cropping STL matrix of size " << stl_sparse_matrix.size() << std::endl;
+      if (new_size2 < cols_ && rows_ > 0)
+      {
+        for (vcl_size_t i=0; i<stl_sparse_matrix.size(); ++i)
+        {
+          std::list<unsigned int> to_delete;
+          for (typename std::map<unsigned int, NumericT>::iterator it = stl_sparse_matrix[i].begin();
+               it != stl_sparse_matrix[i].end();
+               ++it)
+          {
+            if (it->first >= new_size2)
+              to_delete.push_back(it->first);
+          }
+
+          for (std::list<unsigned int>::iterator it = to_delete.begin(); it != to_delete.end(); ++it)
+            stl_sparse_matrix[i].erase(*it);
+        }
+        //std::cout << "Cropping done..." << std::endl;
+      }
+
+      rows_ = new_size1;
+      cols_ = new_size2;   //pass the requested dimensions explicitly below: deducing the column count from the largest index in use can undershoot new_size2 and trip the size check in copy()
+      viennacl::copy(tools::const_sparse_matrix_adapter<NumericT>(stl_sparse_matrix, new_size1, new_size2), *this);
+    }
+
+    rows_ = new_size1;
+    cols_ = new_size2;
+  }
+
+  /** @brief Resets all entries in the matrix back to zero without changing the matrix size. Resets the sparsity pattern. The buffers are recreated with minimal size (65 group boundaries, one index pair, one value). */
+  void clear()
+  {
+    viennacl::backend::typesafe_host_array<unsigned int> host_group_buffer(group_boundaries_, 65);
+    viennacl::backend::typesafe_host_array<unsigned int> host_coord_buffer(coord_buffer_, 2);
+    std::vector<NumericT> host_elements(1);
+
+    viennacl::backend::memory_create(group_boundaries_, host_group_buffer.element_size() * 65, viennacl::traits::context(group_boundaries_), host_group_buffer.get());
+    viennacl::backend::memory_create(coord_buffer_,     host_coord_buffer.element_size() * 2,   viennacl::traits::context(coord_buffer_),     host_coord_buffer.get());
+    viennacl::backend::memory_create(elements_,         sizeof(NumericT) * 1,                   viennacl::traits::context(elements_),         &(host_elements[0]));
+
+    nonzeros_ = 0;
+    group_num_ = 64;
+  }
+
+  /** @brief  Returns the number of rows */
+  vcl_size_t size1() const { return rows_; }
+  /** @brief  Returns the number of columns */
+  vcl_size_t size2() const { return cols_; }
+  /** @brief  Returns the number of nonzero entries */
+  vcl_size_t nnz() const { return nonzeros_; }
+  /** @brief  Returns the number of internal nonzero entries */
+  vcl_size_t internal_nnz() const { return viennacl::tools::align_to_multiple<vcl_size_t>(nonzeros_, AlignmentV); }
+
+  /** @brief  Returns the OpenCL handle to the (row, column) index array */
+  const handle_type & handle12() const { return coord_buffer_; }
+  /** @brief  Returns the OpenCL handle to the matrix entry array */
+  const handle_type & handle() const { return elements_; }
+  /** @brief  Returns the OpenCL handle to the group start index array */
+  const handle_type & handle3() const { return group_boundaries_; }
+  /** @brief  Returns the number of groups the nonzeros are partitioned into (64 for every constructor of this class) */
+  vcl_size_t groups() const { return group_num_; }
+
+#if defined(_MSC_VER) && _MSC_VER < 1500      //Visual Studio 2005 needs special treatment
+  template<typename CPUMatrixT>
+  friend void copy(const CPUMatrixT & cpu_matrix, coordinate_matrix & gpu_matrix );
+#else
+  template<typename CPUMatrixT, typename NumericT2, unsigned int AlignmentV2>
+  friend void copy(const CPUMatrixT & cpu_matrix, coordinate_matrix<NumericT2, AlignmentV2> & gpu_matrix );
+#endif
+
+private:
+  /** @brief Copy constructor is by now not available. */
+  coordinate_matrix(coordinate_matrix const &);
+
+  /** @brief Assignment is by now not available. */
+  coordinate_matrix & operator=(coordinate_matrix const &);
+
+
+  vcl_size_t rows_;
+  vcl_size_t cols_;
+  vcl_size_t nonzeros_;
+  vcl_size_t group_num_;
+  handle_type coord_buffer_;
+  handle_type elements_;
+  handle_type group_boundaries_;
+};
+
+
+//
+// Specify available operations:
+//
+
+/** \cond */
+
+namespace linalg
+{
+namespace detail
+{
+  // x = A * y  (A is a coordinate_matrix, x and y are vectors)
+  template<typename T, unsigned int A>
+  struct op_executor<vector_base<T>, op_assign, vector_expression<const coordinate_matrix<T, A>, const vector_base<T>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const coordinate_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+    {
+      // special case x = A * x: result and operand share one memory handle, so the product is formed in a temporary and assigned afterwards
+      if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+      {
+        viennacl::vector<T> temp(lhs);
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+        lhs = temp;
+      }
+      else
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), lhs, T(0));
+    }
+  };
+  // x += A * y  (NOTE(review): the scalar arguments of prod_impl presumably scale the product and the previous result, respectively - confirm against prod_impl)
+  template<typename T, unsigned int A>
+  struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const coordinate_matrix<T, A>, const vector_base<T>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const coordinate_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+    {
+      // special case x += A * x: result and operand share one memory handle, so the product is formed in a temporary and added afterwards
+      if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+      {
+        viennacl::vector<T> temp(lhs);
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+        lhs += temp;
+      }
+      else
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), lhs, T(1));
+    }
+  };
+  // x -= A * y
+  template<typename T, unsigned int A>
+  struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const coordinate_matrix<T, A>, const vector_base<T>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const coordinate_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+    {
+      // special case x -= A * x: result and operand share one memory handle, so the product is formed in a temporary and subtracted afterwards
+      if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+      {
+        viennacl::vector<T> temp(lhs);
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+        lhs -= temp;
+      }
+      else
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(-1), lhs, T(1));
+    }
+  };
+
+
+  // x = A * vec_op: the vector expression is first evaluated into a temporary vector, which is then multiplied by A
+  template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+  struct op_executor<vector_base<T>, op_assign, vector_expression<const coordinate_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const coordinate_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+      viennacl::linalg::prod_impl(rhs.lhs(), temp, lhs);
+    }
+  };
+
+  // x += A * vec_op: the product is computed into a separate result vector (created as a copy of x) and then added to x
+  template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+  struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const coordinate_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const coordinate_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+      viennacl::vector<T> temp_result(lhs);
+      viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+      lhs += temp_result;
+    }
+  };
+
+  // x -= A * vec_op: the product is computed into a separate result vector (created as a copy of x) and then subtracted from x
+  template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+  struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const coordinate_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const coordinate_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+      viennacl::vector<T> temp_result(lhs);
+      viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+      lhs -= temp_result;
+    }
+  };
+
+} // namespace detail
+} // namespace linalg
+
+/** \endcond */
+}
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/detail/matrix_def.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/detail/matrix_def.hpp b/native-viennaCL/src/main/cpp/viennacl/detail/matrix_def.hpp
new file mode 100644
index 0000000..c13ef01
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/detail/matrix_def.hpp
@@ -0,0 +1,270 @@
+#ifndef VIENNACL_DETAIL_MATRIX_DEF_HPP_
+#define VIENNACL_DETAIL_MATRIX_DEF_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/detail/matrix_def.hpp
+    @brief Forward declaration of dense matrix classes
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/entry_proxy.hpp"
+
+namespace viennacl
+{
+/** @brief Common base class of matrices whose entries follow from a rule instead of being stored one by one.
+  *
+  * Holds the dimensions, a single value, and a flag stating whether that value is confined to the diagonal. Examples are identity_matrix, scalar_matrix, and zero_matrix.
+  */
+template<typename NumericT>
+class implicit_matrix_base
+{
+protected:
+  typedef vcl_size_t        size_type;
+  implicit_matrix_base(size_type size1, size_type size2, NumericT value, bool diag, viennacl::context ctx)
+    : size1_(size1), size2_(size2), value_(value), diag_(diag), off_diag_(0), ctx_(ctx)
+  {}
+public:
+  typedef NumericT const & const_reference;
+  typedef NumericT cpu_value_type;
+
+  size_type size1() const { return size1_; }                 // number of rows
+  size_type size2() const { return size2_; }                 // number of columns
+  viennacl::context context() const { return ctx_; }         // context passed at construction
+  NumericT  value() const { return value_; }                 // the single stored value
+  bool diag() const { return diag_; }                        // true if the value is confined to the diagonal
+
+  /** @brief Entry (i,j): zero off the diagonal of a diagonal-type matrix, the stored value in every other case. */
+  const_reference operator()(size_type i, size_type j) const { return (diag_ && i != j) ? off_diag_ : value_; }
+protected:
+  // data members; the declaration order matches the initializer list of the constructor
+  size_type size1_;
+  size_type size2_;
+  NumericT value_;
+  bool diag_;
+  NumericT off_diag_;
+  viennacl::context ctx_;
+};
+
+//
+// Initializer types
+//
+/** @brief Represents a square identity matrix, i.e. ones on the diagonal and zeros elsewhere, without storing the individual entries. */
+template<typename NumericT>
+class identity_matrix : public implicit_matrix_base<NumericT>
+{
+public:
+  typedef NumericT const & const_reference;
+  typedef vcl_size_t         size_type;
+  identity_matrix(size_type n, viennacl::context ctx = viennacl::context())
+    : implicit_matrix_base<NumericT>(n, n, 1, true, ctx) {}   // n rows, n columns, value 1 on the diagonal only
+};
+
+
+/** @brief Represents a matrix consisting of zeros only, without storing the individual entries. */
+template<typename NumericT>
+class zero_matrix : public implicit_matrix_base<NumericT>
+{
+public:
+  typedef NumericT const & const_reference;
+  typedef vcl_size_t         size_type;
+  zero_matrix(size_type rows, size_type cols, viennacl::context ctx = viennacl::context())
+    : implicit_matrix_base<NumericT>(rows, cols, 0, false, ctx) {}   // value 0 for every entry
+};
+
+
+/** @brief Represents a matrix in which every entry equals the scalar 's', i.e. A(i,j) = s for all i,j, without storing the individual entries. */
+template<typename NumericT>
+class scalar_matrix : public implicit_matrix_base<NumericT>
+{
+public:
+  typedef NumericT const & const_reference;
+  typedef vcl_size_t         size_type;
+  scalar_matrix(size_type rows, size_type cols, const_reference val, viennacl::context ctx = viennacl::context())
+    : implicit_matrix_base<NumericT>(rows, cols, val, false, ctx) {}   // the same value for every entry
+};
+
+template<class NumericT, typename SizeT, typename DistanceT>
+class matrix_base
+{
+  typedef matrix_base<NumericT, SizeT, DistanceT>          self_type;
+public:
+
+  typedef matrix_iterator<row_iteration, self_type >   iterator1;
+  typedef matrix_iterator<col_iteration, self_type >   iterator2;
+  typedef scalar<NumericT>                                                  value_type;
+  typedef NumericT                                                          cpu_value_type;
+  typedef SizeT                                                            size_type;
+  typedef DistanceT                                                        difference_type;
+  typedef viennacl::backend::mem_handle                                       handle_type;
+
+  /** @brief The default constructor. Does not allocate any memory. */
+  explicit matrix_base(): size1_(0), size2_(0), start1_(0), start2_(0), stride1_(1), stride2_(1), internal_size1_(0), internal_size2_(0), row_major_fixed_(false), row_major_(true) {}
+
+  /** @brief The layout constructor. Does not allocate any memory. */
+  explicit matrix_base(bool is_row_major) : size1_(0), size2_(0), start1_(0), start2_(0), stride1_(1), stride2_(1), internal_size1_(0), internal_size2_(0), row_major_fixed_(true), row_major_(is_row_major) {}
+
+  /** @brief Creates the matrix with the given dimensions
+    *
+    * @param rows     Number of rows
+    * @param columns  Number of columns
+    * @param is_row_major  Boolean flag stating whether this matrix is stored row-major
+    * @param ctx      Optional context in which the matrix is created (one out of multiple OpenCL contexts, CUDA, host)
+    */
+  explicit matrix_base(size_type rows, size_type columns, bool is_row_major, viennacl::context ctx = viennacl::context());
+
+  /** @brief Constructor for creating a matrix_range or matrix_stride from some other matrix/matrix_range/matrix_stride */
+  explicit matrix_base(viennacl::backend::mem_handle & h,
+                       size_type mat_size1, size_type mat_start1, size_type mat_stride1, size_type mat_internal_size1,
+                       size_type mat_size2, size_type mat_start2, size_type mat_stride2, size_type mat_internal_size2,
+                       bool is_row_major): size1_(mat_size1), size2_(mat_size2),
+        start1_(mat_start1), start2_(mat_start2),
+        stride1_(mat_stride1), stride2_(mat_stride2),
+        internal_size1_(mat_internal_size1), internal_size2_(mat_internal_size2),
+        row_major_fixed_(true), row_major_(is_row_major),
+        elements_(h) {}
+
+
+  template<typename LHS, typename RHS, typename OP>
+  explicit matrix_base(matrix_expression<const LHS, const RHS, OP> const & proxy);
+
+  // CUDA or host memory:
+  explicit matrix_base(NumericT * ptr_to_mem, viennacl::memory_types mem_type,
+                       size_type mat_size1, size_type mat_start1, size_type mat_stride1, size_type mat_internal_size1,
+                       size_type mat_size2, size_type mat_start2, size_type mat_stride2, size_type mat_internal_size2,
+                       bool is_row_major);
+
+#ifdef VIENNACL_WITH_OPENCL
+  explicit matrix_base(cl_mem mem, size_type rows, size_type columns, bool is_row_major, viennacl::context ctx = viennacl::context());
+  explicit matrix_base(cl_mem mem, viennacl::context ctx,
+                       size_type mat_size1, size_type mat_start1, size_type mat_stride1, size_type mat_internal_size1,
+                       size_type mat_size2, size_type mat_start2, size_type mat_stride2, size_type mat_internal_size2,
+                       bool is_row_major);
+#endif
+
+  /* Copy CTOR */
+  matrix_base(const self_type & other);
+
+  /* Conversion CTOR */
+  template<typename OtherNumericT>
+  matrix_base(const matrix_base<OtherNumericT, SizeT, DistanceT> & other);
+
+  self_type & operator=(const self_type & other);
+  template<typename OtherNumericT>
+  self_type & operator=(const matrix_base<OtherNumericT, SizeT, DistanceT> & other);
+
+  /** @brief Implementation of the operation m1 = m2 @ alpha, where @ denotes either multiplication or division, and alpha is either a CPU or a GPU scalar
+    * @param proxy  An expression template proxy class. */
+  template<typename LHS, typename RHS, typename OP>
+  self_type & operator=(const matrix_expression<const LHS, const RHS, OP> & proxy);
+  // A = trans(B). Currently achieved in CPU memory
+  self_type & operator=(const matrix_expression< const self_type, const self_type, op_trans> & proxy);
+  template<typename LHS, typename RHS, typename OP>
+  self_type & operator+=(const matrix_expression<const LHS, const RHS, OP> & proxy);
+  template<typename LHS, typename RHS, typename OP>
+  self_type & operator-=(const matrix_expression<const LHS, const RHS, OP> & proxy);
+  /** @brief Assigns the supplied identity matrix to the matrix. */
+  self_type & operator = (identity_matrix<NumericT> const & m);
+  /** @brief Assigns the supplied zero matrix to the matrix. */
+  self_type & operator = (zero_matrix<NumericT> const & m);
+  /** @brief Assigns the supplied scalar matrix to the matrix. */
+  self_type & operator = (scalar_matrix<NumericT> const & m);
+  //read-write access to an element of the matrix/matrix_range/matrix_slice
+  /** @brief Read-write access to a single element of the matrix/matrix_range/matrix_slice */
+  entry_proxy<NumericT> operator()(size_type row_index, size_type col_index);
+  /** @brief Read access to a single element of the matrix/matrix_range/matrix_slice */
+  const_entry_proxy<NumericT> operator()(size_type row_index, size_type col_index) const;
+  self_type & operator += (const self_type & other);
+  self_type & operator -= (const self_type & other);
+
+  /** @brief Scales the matrix by a char (8-bit integer) */
+  self_type & operator *= (char val);
+  /** @brief Scales the matrix by a short integer */
+  self_type & operator *= (short val);
+  /** @brief Scales the matrix by an integer */
+  self_type & operator *= (int val);
+  /** @brief Scales the matrix by a long integer */
+  self_type & operator *= (long val);
+  /** @brief Scales the matrix by a single precision floating point value */
+  self_type & operator *= (float val);
+  /** @brief Scales the matrix by a double precision floating point value */
+  self_type & operator *= (double val);
+
+  /** @brief Scales the matrix by a char (8-bit integer) */
+  self_type & operator /= (char val);
+  /** @brief Scales the matrix by a short integer */
+  self_type & operator /= (short val);
+  /** @brief Scales the matrix by an integer */
+  self_type & operator /= (int val);
+  /** @brief Scales the matrix by a long integer */
+  self_type & operator /= (long val);
+  /** @brief Scales the matrix by a single precision floating point value */
+  self_type & operator /= (float val);
+  /** @brief Scales the matrix by a double precision floating point value */
+  self_type & operator /= (double val);
+
+  /** @brief Sign flip for the matrix. Emulated to be equivalent to -1.0 * matrix */
+  matrix_expression<const self_type, const NumericT, op_mult> operator-() const;
+  /** @brief Returns the number of rows */
+  size_type size1() const { return size1_;}
+  /** @brief Returns the number of columns */
+  size_type size2() const { return size2_; }
+  /** @brief Returns the start index (offset) along the first (row) dimension */
+  size_type start1() const { return start1_;}
+  /** @brief Returns the start index (offset) along the second (column) dimension */
+  size_type start2() const { return start2_; }
+  /** @brief Returns the stride along the first (row) dimension */
+  size_type stride1() const { return stride1_;}
+  /** @brief Returns the stride along the second (column) dimension */
+  size_type stride2() const { return stride2_; }
+  /** @brief Resets all entries to zero */
+  void clear();
+  /** @brief Returns the internal number of rows. Usually required for launching OpenCL kernels only */
+  size_type internal_size1() const { return internal_size1_; }
+  /** @brief Returns the internal number of columns. Usually required for launching OpenCL kernels only */
+  size_type internal_size2() const { return internal_size2_; }
+  /** @brief Returns the total amount of allocated memory in multiples of sizeof(NumericT) */
+  size_type internal_size() const { return internal_size1() * internal_size2(); }
+  /** @brief Returns the OpenCL handle, non-const-version */
+  handle_type & handle()       { return elements_; }
+  /** @brief Returns the OpenCL handle, const-version */
+  const handle_type & handle() const { return elements_; }
+  viennacl::memory_types memory_domain() const { return elements_.get_active_handle_id(); }
+  bool row_major() const { return row_major_; }
+  void switch_memory_context(viennacl::context new_ctx) { viennacl::backend::switch_memory_context<NumericT>(elements_, new_ctx); }
+
+protected:
+  void set_handle(viennacl::backend::mem_handle const & h);
+  void resize(size_type rows, size_type columns, bool preserve = true);
+private:
+  size_type size1_;
+  size_type size2_;
+  size_type start1_;
+  size_type start2_;
+  size_type stride1_;
+  size_type stride2_;
+  size_type internal_size1_;
+  size_type internal_size2_;
+  bool row_major_fixed_; //helper flag to make layout of matrix<T, row_major> A; persistent
+  bool row_major_;
+  handle_type elements_;
+}; //matrix
+
+}
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/detail/vector_def.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/detail/vector_def.hpp b/native-viennaCL/src/main/cpp/viennacl/detail/vector_def.hpp
new file mode 100644
index 0000000..4624b76
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/detail/vector_def.hpp
@@ -0,0 +1,349 @@
+#ifndef VIENNACL_DETAIL_VECTOR_DEF_HPP_
+#define VIENNACL_DETAIL_VECTOR_DEF_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file  viennacl/detail/vector_def.hpp
+    @brief Forward declarations of the implicit_vector_base, vector_base class.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/entry_proxy.hpp"
+
+namespace viennacl
+{
+
+/** @brief Base class shared by vectors whose entries are described implicitly instead of being stored one by one.
+  *
+  * An object holds a length, one value, and optionally one distinguished index.
+  * Typical examples are zero_vector or scalar_vector.
+  */
+template<typename NumericT>
+class implicit_vector_base
+{
+protected:
+  /** @brief Vector of length s that equals v at position i and zero at every other position */
+  implicit_vector_base(vcl_size_t s, vcl_size_t i, NumericT v, viennacl::context ctx)
+    : size_(s), index_(std::make_pair(true,i)), value_(v), ctx_(ctx) {}
+
+  /** @brief Vector of length s whose entries all equal v (no distinguished index) */
+  implicit_vector_base(vcl_size_t s, NumericT v, viennacl::context ctx)
+    : size_(s), index_(std::make_pair(false,0)), value_(v), ctx_(ctx) {}
+
+public:
+  typedef NumericT const & const_reference;
+  typedef NumericT cpu_value_type;
+
+  /** @brief Returns the context passed at construction */
+  viennacl::context context() const { return ctx_; }
+  /** @brief Returns the length of the vector */
+  vcl_size_t size() const { return size_; }
+  /** @brief Returns the stored value */
+  cpu_value_type  value() const { return value_; }
+  /** @brief Returns the distinguished index (only meaningful if has_index() is true) */
+  vcl_size_t index() const { return index_.second; }
+  /** @brief Returns true if a distinguished index was supplied at construction */
+  bool has_index() const { return index_.first; }
+
+  /** @brief Returns entry i: with a distinguished index, the value at that index and zero elsewhere; otherwise always the value */
+  cpu_value_type operator()(vcl_size_t i) const
+  {
+    bool const is_zero_entry = index_.first && (i != index_.second);
+    if (is_zero_entry)
+      return 0;
+    return value_;
+  }
+
+  /** @brief Same as operator()(i) */
+  cpu_value_type operator[](vcl_size_t i) const
+  {
+    return (*this)(i);
+  }
+
+protected:
+  vcl_size_t size_;
+  std::pair<bool, vcl_size_t> index_;   // first: is an index set?  second: the index
+  NumericT value_;
+  viennacl::context ctx_;
+};
+
+/** @brief Implicit vector that is 1 at one given index and 0 at all other positions. */
+template<typename NumericT>
+struct unit_vector : public implicit_vector_base<NumericT>
+{
+  /** @param s    Length of the vector
+    * @param ind  Position of the entry equal to 1; must be smaller than s (checked by assert)
+    * @param ctx  Context handed on to the base class
+    */
+  unit_vector(vcl_size_t s, vcl_size_t ind, viennacl::context ctx = viennacl::context())
+    : implicit_vector_base<NumericT>(s, ind, 1, ctx)
+  {
+    assert( (ind < s) && bool("Provided index out of range!") );
+  }
+};
+
+
+/** @brief Implicit vector whose entries all equal the same scalar, i.e. v[i] = val for all i.
+  *
+  * To be used as an initializer for viennacl::vector, vector_range, or vector_slice only.
+  */
+template<typename NumericT>
+struct scalar_vector : public implicit_vector_base<NumericT>
+{
+  /** @param s    Length of the vector
+    * @param val  Value of every entry
+    * @param ctx  Context handed on to the base class
+    */
+  scalar_vector(vcl_size_t s, NumericT val, viennacl::context ctx = viennacl::context())
+    : implicit_vector_base<NumericT>(s, val, ctx)
+  {}
+};
+
+/** @brief Implicit vector of length s whose entries are all zero (a scalar_vector with value 0). */
+template<typename NumericT>
+struct zero_vector : public scalar_vector<NumericT>
+{
+  /** @param s    Length of the vector
+    * @param ctx  Context handed on to the base class
+    */
+  zero_vector(vcl_size_t s, viennacl::context ctx = viennacl::context())
+    : scalar_vector<NumericT>(s, 0, ctx)
+  {}
+};
+
+
+/** @brief Common base class for dense vectors, vector ranges, and vector slices.
+  *
+  * Only the inline accessors are defined here; all other members are declarations.
+  *
+  * @tparam NumericT   The floating point type, either 'float' or 'double'
+  * @tparam SizeT      Type exposed as size_type (see forwards.h for the default)
+  * @tparam DistanceT  Type exposed as difference_type (see forwards.h for the default)
+  */
+template<class NumericT, typename SizeT /* see forwards.h for default type */, typename DistanceT /* see forwards.h for default type */>
+class vector_base
+{
+  typedef vector_base<NumericT, SizeT, DistanceT>         self_type;
+
+public:
+  typedef scalar<NumericT>                                value_type;
+  typedef NumericT                                        cpu_value_type;
+  typedef viennacl::backend::mem_handle                     handle_type;
+  typedef SizeT                                          size_type;
+  typedef DistanceT                                      difference_type;
+  typedef const_vector_iterator<NumericT, 1>              const_iterator;
+  typedef vector_iterator<NumericT, 1>                    iterator;
+
+  /** @brief Returns the length of the vector (cf. std::vector)  */
+  size_type size() const { return size_; }
+  /** @brief Returns the internal length of the vector, which is given by size() plus the extra memory due to padding the memory with zeros up to a multiple of 'AlignmentV' */
+  size_type internal_size() const { return internal_size_; }
+  /** @brief Returns the offset within the buffer  */
+  size_type start() const { return start_; }
+  /** @brief Returns the stride within the buffer (in multiples of sizeof(NumericT)) */
+  size_type stride() const { return stride_; }
+  /** @brief Returns true if the size is zero */
+  bool empty() const { return size_ == 0; }
+  /** @brief Returns the memory handle (const version). */
+  const handle_type & handle() const { return elements_; }
+  /** @brief Returns the memory handle (non-const version). */
+  handle_type & handle() { return elements_; }
+  /** @brief Returns the id of the currently active handle of the underlying memory handle. */
+  viennacl::memory_types memory_domain() const { return elements_.get_active_handle_id();  }
+
+  /** @brief Default constructor in order to be compatible with various containers.
+    */
+  explicit vector_base();
+
+  /** @brief An explicit constructor for wrapping an existing vector into a vector_range or vector_slice.
+     *
+     * @param h          The existing memory handle from a vector/vector_range/vector_slice
+     * @param vec_size   The length (i.e. size) of the buffer
+     * @param vec_start  The offset from the beginning of the buffer identified by 'h'
+     * @param vec_stride Increment between two elements in the original buffer (in multiples of NumericT)
+    */
+  explicit vector_base(viennacl::backend::mem_handle & h, size_type vec_size, size_type vec_start, size_type vec_stride);
+
+  /** @brief Creates a vector and allocates the necessary memory */
+  explicit vector_base(size_type vec_size, viennacl::context ctx = viennacl::context());
+
+  // Wraps an existing buffer in CUDA or host memory (selected via mem_type):
+  explicit vector_base(NumericT * ptr_to_mem, viennacl::memory_types mem_type, size_type vec_size, vcl_size_t start = 0, size_type stride = 1);
+
+#ifdef VIENNACL_WITH_OPENCL
+  /** @brief Create a vector from existing OpenCL memory
+    *
+    * Note: The provided memory must take a possible AlignmentV into account, i.e. existing_mem must be at least of size internal_size()!
+    * This is trivially the case with the default alignment, but should be considered when using vector<> with an alignment parameter not equal to 1.
+    *
+    * @param existing_mem   An OpenCL handle representing the memory
+    * @param vec_size       The size of the vector.
+    * @param start          Offset of the first entry within existing_mem (defaults to 0)
+    * @param stride         Increment between two entries within existing_mem (defaults to 1)
+    * @param ctx            The context the memory belongs to
+    */
+  explicit vector_base(cl_mem existing_mem, size_type vec_size, size_type start = 0, size_type stride = 1, viennacl::context ctx = viennacl::context());
+#endif
+
+  /** @brief Creates the vector from an expression template proxy. */
+  template<typename LHS, typename RHS, typename OP>
+  explicit vector_base(vector_expression<const LHS, const RHS, OP> const & proxy);
+
+  // Copy CTOR:
+  vector_base(const self_type & other);
+
+  // Conversion CTOR (from a vector_base with a different numeric type):
+  template<typename OtherNumericT>
+  vector_base(const vector_base<OtherNumericT> & v1);
+
+  /** @brief Assignment operator. Other vector needs to be of the same size, or this vector is not yet initialized.
+    */
+  self_type & operator=(const self_type & vec);
+  /** @brief Implementation of the operation v1 = v2 @ alpha, where @ denotes either multiplication or division, and alpha is either a CPU or a GPU scalar
+    * @param proxy  An expression template proxy class.
+    */
+  template<typename LHS, typename RHS, typename OP>
+  self_type & operator=(const vector_expression<const LHS, const RHS, OP> & proxy);
+  /** @brief Converts a vector of a different numeric type to the current numeric type */
+  template<typename OtherNumericT>
+  self_type &  operator = (const vector_base<OtherNumericT> & v1);
+  /** @brief Creates the vector from the supplied unit vector. */
+  self_type & operator = (unit_vector<NumericT> const & v);
+  /** @brief Creates the vector from the supplied zero vector. */
+  self_type & operator = (zero_vector<NumericT> const & v);
+  /** @brief Creates the vector from the supplied scalar vector. */
+  self_type & operator = (scalar_vector<NumericT> const & v);
+
+
+  ///////////////////////////// Matrix Vector interaction start ///////////////////////////////////
+  /** @brief Operator overload for v1 = A * v2, where v1, v2 are vectors and A is a dense matrix.
+    * @param proxy An expression template proxy class
+    */
+  self_type & operator=(const viennacl::vector_expression< const matrix_base<NumericT>, const vector_base<NumericT>, viennacl::op_prod> & proxy);
+
+  //transposed_matrix_proxy:
+  /** @brief Operator overload for v1 = trans(A) * v2, where v1, v2 are vectors and A is a dense matrix.
+    * @param proxy An expression template proxy class
+    */
+  self_type & operator=(const vector_expression< const matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans >,
+                        const vector_base<NumericT>,
+                        op_prod> & proxy);
+
+  ///////////////////////////// Matrix Vector interaction end ///////////////////////////////////
+
+
+  //read-write access to an element of the vector
+  /** @brief Read-write access to a single element of the vector */
+  entry_proxy<NumericT> operator()(size_type index);
+  /** @brief Read-write access to a single element of the vector */
+  entry_proxy<NumericT> operator[](size_type index);
+  /** @brief Read access to a single element of the vector */
+  const_entry_proxy<NumericT> operator()(size_type index) const;
+  /** @brief Read access to a single element of the vector */
+  const_entry_proxy<NumericT> operator[](size_type index) const;
+  /** @brief In-place addition of another vector (defined elsewhere) */
+  self_type & operator += (const self_type & vec);
+  /** @brief In-place subtraction of another vector (defined elsewhere) */
+  self_type & operator -= (const self_type & vec);
+
+  /** @brief Scales a vector (or proxy) by a char (8-bit integer) */
+  self_type & operator *= (char val);
+  /** @brief Scales a vector (or proxy) by a short integer */
+  self_type & operator *= (short val);
+  /** @brief Scales a vector (or proxy) by an integer */
+  self_type & operator *= (int val);
+  /** @brief Scales a vector (or proxy) by a long integer */
+  self_type & operator *= (long val);
+  /** @brief Scales a vector (or proxy) by a single precision floating point value */
+  self_type & operator *= (float val);
+  /** @brief Scales a vector (or proxy) by a double precision floating point value */
+  self_type & operator *= (double val);
+
+
+  /** @brief Divides a vector (or proxy) by a char (8-bit integer) */
+  self_type & operator /= (char val);
+  /** @brief Divides a vector (or proxy) by a short integer */
+  self_type & operator /= (short val);
+  /** @brief Divides a vector (or proxy) by an integer */
+  self_type & operator /= (int val);
+  /** @brief Divides a vector (or proxy) by a long integer */
+  self_type & operator /= (long val);
+  /** @brief Divides a vector (or proxy) by a single precision floating point value */
+  self_type & operator /= (float val);
+  /** @brief Divides a vector (or proxy) by a double precision floating point value */
+  self_type & operator /= (double val);
+
+  /** @brief Scales the vector by a char (8-bit integer) 'value' and returns an expression template */
+  vector_expression< const self_type, const NumericT, op_mult>
+  operator * (char value) const;
+  /** @brief Scales the vector by a short integer 'value' and returns an expression template */
+  vector_expression< const self_type, const NumericT, op_mult>
+  operator * (short value) const;
+  /** @brief Scales the vector by an integer 'value' and returns an expression template */
+  vector_expression< const self_type, const NumericT, op_mult>
+  operator * (int value) const;
+  /** @brief Scales the vector by a long integer 'value' and returns an expression template */
+  vector_expression< const self_type, const NumericT, op_mult>
+  operator * (long value) const;
+  /** @brief Scales the vector by a single precision floating point value 'value' and returns an expression template */
+  vector_expression< const self_type, const NumericT, op_mult>
+  operator * (float value) const;
+  /** @brief Scales the vector by a double precision floating point value 'value' and returns an expression template */
+  vector_expression< const self_type, const NumericT, op_mult>
+  operator * (double value) const;
+
+  /** @brief Divides the vector by a char (8-bit integer) 'value' and returns an expression template */
+  vector_expression< const self_type, const NumericT, op_div>
+  operator / (char value) const;
+  /** @brief Divides the vector by a short integer 'value' and returns an expression template */
+  vector_expression< const self_type, const NumericT, op_div>
+  operator / (short value) const;
+  /** @brief Divides the vector by an integer 'value' and returns an expression template */
+  vector_expression< const self_type, const NumericT, op_div>
+  operator / (int value) const;
+  /** @brief Divides the vector by a long integer 'value' and returns an expression template */
+  vector_expression< const self_type, const NumericT, op_div>
+  operator / (long value) const;
+  /** @brief Divides the vector by a single precision floating point value 'value' and returns an expression template */
+  vector_expression< const self_type, const NumericT, op_div>
+  operator / (float value) const;
+  /** @brief Divides the vector by a double precision floating point value 'value' and returns an expression template */
+  vector_expression< const self_type, const NumericT, op_div>
+  operator / (double value) const;
+
+  /** @brief Sign flip for the vector. Emulated to be equivalent to -1.0 * vector */
+  vector_expression<const self_type, const NumericT, op_mult> operator-() const;
+  /** @brief Returns an iterator pointing to the beginning of the vector  (STL like)*/
+  iterator begin();
+  /** @brief Returns an iterator pointing to the end of the vector (STL like)*/
+  iterator end();
+  /** @brief Returns a const-iterator pointing to the beginning of the vector (STL like)*/
+  const_iterator begin() const;
+  /** @brief Returns a const-iterator pointing to the end of the vector (STL like)*/
+  const_iterator end() const;
+  /** @brief Swaps the entries of the two vectors */
+  self_type & swap(self_type & other);
+
+  /** @brief Resets all entries to zero. Does not change the size of the vector. */
+  void clear();
+
+protected:
+
+  /** @brief Replaces the stored memory handle by assigning h to it. Size, start and stride members are not touched. */
+  void set_handle(viennacl::backend::mem_handle const & h) {  elements_ = h; }
+
+  /** @brief Swaps the handles of two vectors by swapping the OpenCL handles only, no data copy */
+  self_type & fast_swap(self_type & other);
+
+  /** @brief Pads vectors with alignment > 1 with trailing zeros if the internal size is larger than the visible size */
+  void pad();
+
+  /** @brief Switches the memory context of the vector to new_ctx (defined elsewhere). */
+  void switch_memory_context(viennacl::context new_ctx);
+
+  //TODO: Think about implementing the following public member functions
+  //void insert_element(unsigned int i, NumericT val){}
+  //void erase_element(unsigned int i){}
+
+  //enlarge or reduce allocated memory and set unused memory to zero
+  /** @brief Resizes the allocated memory for the vector. Pads the memory to be a multiple of 'AlignmentV'
+    *
+    *  @param new_size  The new size of the vector
+    *  @param preserve  If true, old entries of the vector are preserved, otherwise they may be discarded.
+    */
+  void resize(size_type new_size, bool preserve = true);
+
+  /** @brief Resizes the allocated memory for the vector. Convenience function for setting an OpenCL context in case reallocation is needed
+    *
+    *  @param new_size  The new size of the vector
+    *  @param ctx       The context within which the new memory should be allocated
+    *  @param preserve  If true, old entries of the vector are preserved, otherwise they may be discarded.
+    */
+  void resize(size_type new_size, viennacl::context ctx, bool preserve = true);
+private:
+
+  // NOTE(review): presumably the shared implementation behind both resize() overloads - confirm in the definition.
+  void resize_impl(size_type new_size, viennacl::context ctx, bool preserve = true);
+
+  size_type       size_;            // visible length, see size()
+  size_type       start_;           // offset within the buffer, see start()
+  size_type       stride_;          // stride within the buffer, see stride()
+  size_type       internal_size_;   // padded length, see internal_size()
+  handle_type elements_;            // memory handle, see handle()
+}; //vector_base
+
+/** \endcond */
+
+} // namespace viennacl
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/common.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/common.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/common.hpp
new file mode 100644
index 0000000..3b6ec76
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/common.hpp
@@ -0,0 +1,219 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_COMMON_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_COMMON_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/device_specific/builtin_database/common.hpp
+*
+* Common routines such as device lookup for the built-in device database.
+*/
+
+#include "viennacl/ocl/device_utils.hpp"
+
+#include "viennacl/scheduler/forwards.h"
+
+#include "viennacl/device_specific/forwards.h"
+
+namespace viennacl
+{
+namespace device_specific
+{
+namespace builtin_database
+{
+
+using scheduler::FLOAT_TYPE;
+using scheduler::DOUBLE_TYPE;
+using namespace viennacl::ocl;
+
+/** @brief Nested lookup table that stores parameter sets of type ParamT.
+  *
+  * Keys, from the outermost to the innermost level:
+  * vendor id, device type, device architecture family, device name, numeric type.
+  */
+template<class ParamT>
+class database_type
+{
+public:
+
+  // Every nesting level is wrapped in its own struct with a single map member 'd'
+  // instead of nesting std::map directly (a workaround for Visual Studio, per the original author).
+  struct expression_t
+  {
+    typedef std::map<scheduler::statement_node_numeric_type, ParamT> map_t;
+    map_t d;
+  };
+  struct device_name_t
+  {
+    typedef std::map<device_name_type, expression_t> map_t;
+    map_t d;
+  };
+  struct device_architecture_t
+  {
+    typedef std::map<ocl::device_architecture_family, device_name_t> map_t;
+    map_t d;
+  };
+  struct device_type_t
+  {
+    typedef std::map<device_type, device_architecture_t> map_t;
+    map_t d;
+  };
+  struct type
+  {
+    typedef std::map<vendor_id_type, device_type_t> map_t;
+    map_t d;
+  };
+  type map;
+
+  /** @brief Registers parameters p5 under the key (p0, p1, p2, p3, p4) and returns *this to allow chaining.
+    *
+    * Missing intermediate levels are created. An entry that already exists for the full key is left unchanged.
+    */
+  database_type<ParamT> & operator()(vendor_id_type p0, device_type p1, ocl::device_architecture_family p2, device_name_type p3, scheduler::statement_node_numeric_type p4, ParamT const & p5)
+  {
+    expression_t & per_device = map.d[p0].d[p1].d[p2].d[p3];
+    per_device.d.insert(std::make_pair(p4, p5));
+    return *this;
+  }
+
+  /** @brief Registers p5 for CHAR_TYPE and UCHAR_TYPE */
+  database_type<ParamT> & add_1B(vendor_id_type p0, device_type p1, ocl::device_architecture_family p2, device_name_type p3, ParamT const & p5)
+  {
+    (*this)(p0, p1, p2, p3, scheduler::CHAR_TYPE, p5);
+    (*this)(p0, p1, p2, p3, scheduler::UCHAR_TYPE, p5);
+    return *this;
+  }
+
+  /** @brief Registers p5 for SHORT_TYPE, USHORT_TYPE and HALF_TYPE */
+  database_type<ParamT> & add_2B(vendor_id_type p0, device_type p1, ocl::device_architecture_family p2, device_name_type p3, ParamT const & p5)
+  {
+    (*this)(p0, p1, p2, p3, scheduler::SHORT_TYPE, p5);
+    (*this)(p0, p1, p2, p3, scheduler::USHORT_TYPE, p5);
+    (*this)(p0, p1, p2, p3, scheduler::HALF_TYPE, p5);
+    return *this;
+  }
+
+  /** @brief Registers p5 for INT_TYPE, UINT_TYPE and FLOAT_TYPE */
+  database_type<ParamT> & add_4B(vendor_id_type p0, device_type p1, ocl::device_architecture_family p2, device_name_type p3, ParamT const & p5)
+  {
+    (*this)(p0, p1, p2, p3, scheduler::INT_TYPE, p5);
+    (*this)(p0, p1, p2, p3, scheduler::UINT_TYPE, p5);
+    (*this)(p0, p1, p2, p3, scheduler::FLOAT_TYPE, p5);
+    return *this;
+  }
+
+  /** @brief Registers p5 for LONG_TYPE, ULONG_TYPE and DOUBLE_TYPE */
+  database_type<ParamT> & add_8B(vendor_id_type p0, device_type p1, ocl::device_architecture_family p2, device_name_type p3, ParamT const & p5)
+  {
+    (*this)(p0, p1, p2, p3, scheduler::LONG_TYPE, p5);
+    (*this)(p0, p1, p2, p3, scheduler::ULONG_TYPE, p5);
+    (*this)(p0, p1, p2, p3, scheduler::DOUBLE_TYPE, p5);
+    return *this;
+  }
+
+  /** @brief Returns the parameters stored under the key (p0, p1, p2, p3, p4).
+    *
+    * Each level is resolved through viennacl::device_specific::at(); what happens for a missing key
+    * is determined by that helper, which is not defined in this file.
+    */
+  ParamT const & at(vendor_id_type p0, device_type p1, ocl::device_architecture_family p2, device_name_type p3, scheduler::statement_node_numeric_type p4) const
+  {
+    device_type_t         const & per_vendor       = viennacl::device_specific::at(map.d, p0);
+    device_architecture_t const & per_device_type  = viennacl::device_specific::at(per_vendor.d, p1);
+    device_name_t         const & per_architecture = viennacl::device_specific::at(per_device_type.d, p2);
+    expression_t          const & per_device       = viennacl::device_specific::at(per_architecture.d, p3);
+    return viennacl::device_specific::at(per_device.d, p4);
+  }
+
+
+};
+
+
+/** @brief Maps the name of an NVIDIA device onto a representative device name; every other name is returned unchanged.
+  *
+  * Only consulted when vendor_id equals viennacl::ocl::nvidia_id:
+  *  - name contains "GeForce": the first nonzero digit found from there on selects
+  *    "GeForce GTX 470" (digit '4') or "GeForce GTX 570" (digit '5'); any other digit leaves the name unchanged
+  *  - otherwise, name contains "Tesla C10", "Tesla S10", "Tesla M20" or "Tesla S20": "Tesla C2050"
+  *  - otherwise, name contains "Tesla K": "Tesla K20m"
+  *
+  * NOTE(review): the previous comment spoke of mapping the GeForce 500 series to "GTX 580", whereas the code
+  * returns "GeForce GTX 570" - confirm which name is actually present in the database.
+  *
+  * @param device_name  Device name as reported by the device
+  * @param vendor_id    Vendor id of the device
+  * @return The mapped name, or device_name if no mapping applies
+  */
+template<typename StringT>
+StringT get_mapped_device_name(StringT const & device_name, vendor_id_type vendor_id)
+{
+  if (vendor_id != viennacl::ocl::nvidia_id)
+    return device_name;
+
+  vcl_size_t pos = device_name.find("GeForce",0);
+  if (pos != std::string::npos)
+  {
+    pos = device_name.find_first_of("123456789", pos);
+    if (pos == std::string::npos)
+      return device_name;
+
+    if (device_name[pos] == '4')
+      return "GeForce GTX 470";
+    if (device_name[pos] == '5')
+      return "GeForce GTX 570";
+    return device_name; // all other series: the lookup falls back on its own
+  }
+
+  if (device_name.find("Tesla",0) == std::string::npos)
+    return device_name;
+
+  // Tesla name fragments that are all mapped to the Tesla C2050 entry
+  char const * const c2050_aliases[] = { "Tesla C10", "Tesla S10", "Tesla M20", "Tesla S20" };
+  for (vcl_size_t k = 0; k < sizeof(c2050_aliases) / sizeof(c2050_aliases[0]); ++k)
+  {
+    if (device_name.find(c2050_aliases[k],0) != std::string::npos)
+      return "Tesla C2050";
+  }
+
+  if (device_name.find("Tesla K",0) != std::string::npos) // all Kepler-based Teslas
+    return "Tesla K20m";
+
+  return device_name;
+}
+
+/** @brief Get the profile for a device and a descriptor
+*
+* There are built-in defaults for CPUs, Accelerators, GPUs.
+*
+* The database is searched level by level (vendor id, device type, architecture family, device name, numeric type):
+*  - vendor not found, or device type not found for that vendor: the default registered under
+*    (ocl::unknown_id, device type, ocl::unknown, "") is returned
+*  - architecture not found: the registered architecture with the smallest absolute distance to the
+*    device's architecture family is used (the first one wins on ties)
+*  - device name not found (after get_mapped_device_name()): the first device registered for the architecture is used
+*  - numeric type not found: the device type default is returned
+*
+* @tparam NumericT  Numeric type; its scheduler type id selects the innermost entry
+* @tparam ParamT    Parameter (profile) type stored in the database
+* @param database   The database to search
+* @param device     The OpenCL device for which a profile is requested
+* @return Reference to the selected profile
+*/
+template<class NumericT, class ParamT>
+inline ParamT const & get_parameters(database_type<ParamT> const & database, viennacl::ocl::device const & device)
+{
+  scheduler::statement_node_numeric_type numeric_type = scheduler::statement_node_numeric_type(scheduler::result_of::numeric_type_id<NumericT>::value);
+
+  device_type dev_type = device.type() & device_type(0xFE); // chop off 'default' characterization
+  vendor_id_type vendor_id = device.vendor_id();
+  ocl::device_architecture_family device_architecture = device.architecture_family();
+  std::string const & device_name = device.name();
+
+
+  /*-Vendor ID-*/
+  //  std::cout << "Looking up vendor ID..." << std::endl;
+  typename database_type<ParamT>::type::map_t::const_iterator vendor_it = database.map.d.find(vendor_id);
+  //Vendor not recognized =>  device type default
+  if (vendor_it==database.map.d.end())
+    return database.at(ocl::unknown_id, dev_type, ocl::unknown, "", numeric_type);
+
+  /*-Device Type-*/
+  //  std::cout << "Looking up device type..." << std::endl;
+  typename database_type<ParamT>::device_type_t::map_t::const_iterator device_type_it = vendor_it->second.d.find(dev_type);
+  //Device type not recognized for this vendor => device type default
+  if (device_type_it==vendor_it->second.d.end())
+    return database.at(ocl::unknown_id, dev_type, ocl::unknown, "", numeric_type);
+
+  /*-Device Architecture-*/
+  //  std::cout << "Looking up device architecture..." << std::endl;
+  typename database_type<ParamT>::device_architecture_t::map_t::const_iterator architecture_it = device_type_it->second.d.find(device_architecture);
+  //Architecture not found. We try to find the closest architecture available.
+  if (architecture_it==device_type_it->second.d.end())
+  {
+    // Assumes at least one architecture is registered for this device type, which holds when the
+    // database is filled through database_type::operator() (it inserts all nesting levels at once).
+    typename database_type<ParamT>::device_architecture_t::map_t::const_iterator current_it = device_type_it->second.d.begin();
+    architecture_it = current_it;
+    // The initial distance must be an absolute value as well: a signed (possibly negative) seed
+    // could never be undercut by the non-negative distances computed below, which would pin the
+    // result to the first registered architecture even if a closer one exists.
+    int closest_arch = std::abs(static_cast<int>(current_it->first) - static_cast<int>(device_architecture));
+    for (++current_it; current_it!=device_type_it->second.d.end(); ++current_it)
+    {
+      int arch_diff = std::abs(static_cast<int>(current_it->first) - static_cast<int>(device_architecture));
+      if (arch_diff < closest_arch)
+      {
+        architecture_it = current_it;
+        closest_arch = arch_diff;
+      }
+    }
+  }
+
+  /*-Device Name-*/
+  std::string mapped_device_name = get_mapped_device_name(device_name, vendor_id);
+
+  typename database_type<ParamT>::device_name_t::map_t::const_iterator device_name_it = architecture_it->second.d.find(mapped_device_name);
+  //Name not found. We just take the first device for the architecture
+  if (device_name_it==architecture_it->second.d.end())
+  {
+    device_name_it = architecture_it->second.d.begin();
+  }
+
+  //  std::cout << "Looking up expression name.." << std::endl;
+  /*-Expression-*/
+  typename database_type<ParamT>::expression_t::map_t::const_iterator expression_it = device_name_it->second.d.find(numeric_type);
+  //Expression not found => device type default
+  if (expression_it==device_name_it->second.d.end())
+    return database.at(ocl::unknown_id, dev_type, ocl::unknown, "", numeric_type);
+
+  //  std::cout << "Device found in the database! Getting profile..." << std::endl;
+  //Everything okay. Return specific profile//
+  return expression_it->second;
+}
+
+
+}
+}
+}
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/accelerator/fallback.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/accelerator/fallback.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/accelerator/fallback.hpp
new file mode 100644
index 0000000..5eede89
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/accelerator/fallback.hpp
@@ -0,0 +1,85 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_ACCELERATOR_FALLBACK_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_ACCELERATOR_FALLBACK_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace accelerator{
+namespace fallback{
+// Fallback matrix-product profiles for accelerator devices, registered under (unknown_id, CL_DEVICE_TYPE_ACCELERATOR, unknown, "").
+// The unnamed char_to_type<'N'|'T'> tags only select the overload; all eight overloads below register the same parameter tuple.
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_4B(unknown_id, CL_DEVICE_TYPE_ACCELERATOR, unknown, "", matrix_product_template::parameters_type(1,16,32,16,1,1,1,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_4B(unknown_id, CL_DEVICE_TYPE_ACCELERATOR, unknown, "", matrix_product_template::parameters_type(1,16,32,16,1,1,1,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_4B(unknown_id, CL_DEVICE_TYPE_ACCELERATOR, unknown, "", matrix_product_template::parameters_type(1,16,32,16,1,1,1,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_4B(unknown_id, CL_DEVICE_TYPE_ACCELERATOR, unknown, "", matrix_product_template::parameters_type(1,16,32,16,1,1,1,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+// add_8B counterparts of the four overloads above (presumably the 8-byte scalar table - TODO confirm in common.hpp); same tuple again.
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+ db.add_8B(unknown_id, CL_DEVICE_TYPE_ACCELERATOR, unknown, "", matrix_product_template::parameters_type(1,16,32,16,1,1,1,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_8B(unknown_id, CL_DEVICE_TYPE_ACCELERATOR, unknown, "", matrix_product_template::parameters_type(1,16,32,16,1,1,1,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_8B(unknown_id, CL_DEVICE_TYPE_ACCELERATOR, unknown, "", matrix_product_template::parameters_type(1,16,32,16,1,1,1,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_8B(unknown_id, CL_DEVICE_TYPE_ACCELERATOR, unknown, "", matrix_product_template::parameters_type(1,16,32,16,1,1,1,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+
+}
+}
+}
+}
+}
+}
+
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/cpu/fallback.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/cpu/fallback.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/cpu/fallback.hpp
new file mode 100644
index 0000000..ffaa9db
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/cpu/fallback.hpp
@@ -0,0 +1,84 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_CPU_FALLBACK_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_CPU_FALLBACK_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace cpu{
+namespace fallback{
+// Fallback matrix-product profiles for CPU devices, registered under (unknown_id, CL_DEVICE_TYPE_CPU, unknown, ""); the unnamed tags only select the overload.
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_4B(unknown_id, CL_DEVICE_TYPE_CPU, unknown, "", matrix_product_template::parameters_type(1,8,8,1,4,4,4,FETCH_FROM_GLOBAL_STRIDED, FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_4B(unknown_id, CL_DEVICE_TYPE_CPU, unknown, "", matrix_product_template::parameters_type(1,8,8,1,4,4,4,FETCH_FROM_GLOBAL_STRIDED, FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_4B(unknown_id, CL_DEVICE_TYPE_CPU, unknown, "", matrix_product_template::parameters_type(1,8,8,1,4,4,4,FETCH_FROM_GLOBAL_STRIDED, FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_4B(unknown_id, CL_DEVICE_TYPE_CPU, unknown, "", matrix_product_template::parameters_type(1,8,8,1,4,4,4,FETCH_FROM_GLOBAL_STRIDED, FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+
+// add_8B counterparts (presumably the 8-byte scalar table - TODO confirm in common.hpp); every overload registers the same tuple as the add_4B ones above.
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_8B(unknown_id, CL_DEVICE_TYPE_CPU, unknown, "", matrix_product_template::parameters_type(1,8,8,1,4,4,4,FETCH_FROM_GLOBAL_STRIDED, FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_8B(unknown_id, CL_DEVICE_TYPE_CPU, unknown, "", matrix_product_template::parameters_type(1,8,8,1,4,4,4,FETCH_FROM_GLOBAL_STRIDED, FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_8B(unknown_id, CL_DEVICE_TYPE_CPU, unknown, "", matrix_product_template::parameters_type(1,8,8,1,4,4,4,FETCH_FROM_GLOBAL_STRIDED, FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_8B(unknown_id, CL_DEVICE_TYPE_CPU, unknown, "", matrix_product_template::parameters_type(1,8,8,1,4,4,4,FETCH_FROM_GLOBAL_STRIDED, FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+
+}
+}
+}
+}
+}
+}
+
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/evergreen/cedar.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/evergreen/cedar.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/evergreen/cedar.hpp
new file mode 100644
index 0000000..b0e3a1c
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/evergreen/cedar.hpp
@@ -0,0 +1,64 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_EVERGREEN_CEDAR_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_EVERGREEN_CEDAR_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace amd{
+namespace evergreen{
+namespace cedar{
+// Matrix-product profiles for the "Cedar" entry (amd_id, CL_DEVICE_TYPE_GPU, ocl::evergreen); all four tag combinations register the same tuple.
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::evergreen, "Cedar", matrix_product_template::parameters_type(1,8,8,8,4,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::evergreen, "Cedar", matrix_product_template::parameters_type(1,8,8,8,4,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::evergreen, "Cedar", matrix_product_template::parameters_type(1,8,8,8,4,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::evergreen, "Cedar", matrix_product_template::parameters_type(1,8,8,8,4,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+// Only add_4B overloads are defined in this header; it registers no add_8B entries for "Cedar".
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/evergreen/cypress.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/evergreen/cypress.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/evergreen/cypress.hpp
new file mode 100644
index 0000000..d1179b8
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/evergreen/cypress.hpp
@@ -0,0 +1,65 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_EVERGREEN_CYPRESS_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_EVERGREEN_CYPRESS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace amd{
+namespace evergreen{
+namespace cypress{
+// Matrix-product profiles for the "Cypress" entry (amd_id, CL_DEVICE_TYPE_GPU, ocl::evergreen); only add_4B overloads are defined in this header.
+// NOTE(review): the unnamed char_to_type tags presumably encode 'N'/'T' (non-transposed/transposed) for the two product operands - confirm at the call site.
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::evergreen, "Cypress", matrix_product_template::parameters_type(1,8,16,32,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,16));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::evergreen, "Cypress", matrix_product_template::parameters_type(1,8,16,16,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,16));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::evergreen, "Cypress", matrix_product_template::parameters_type(4,32,4,8,4,1,4,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::evergreen, "Cypress", matrix_product_template::parameters_type(1,8,16,16,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,8));
+}
+
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/barts.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/barts.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/barts.hpp
new file mode 100644
index 0000000..2805a5c
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/barts.hpp
@@ -0,0 +1,64 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_NORTHERN_ISLANDS_BARTS_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_NORTHERN_ISLANDS_BARTS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace amd{
+namespace northern_islands{
+namespace barts{
+// Matrix-product profiles for the "Barts" entry (amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands); the unnamed char_to_type tags only select the overload.
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Barts", matrix_product_template::parameters_type(1,2,2,128,2,2,1,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Barts", matrix_product_template::parameters_type(1,8,8,16,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,4,32));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Barts", matrix_product_template::parameters_type(1,2,1,64,2,1,2,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Barts", matrix_product_template::parameters_type(1,8,8,8,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+// Only add_4B overloads are defined in this header; it registers no add_8B entries for "Barts".
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/devastator.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/devastator.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/devastator.hpp
new file mode 100644
index 0000000..018839e
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/devastator.hpp
@@ -0,0 +1,64 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_NORTHERN_ISLANDS_DEVASTATOR_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_NORTHERN_ISLANDS_DEVASTATOR_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace amd{
+namespace northern_islands{
+namespace devastator{
+// Matrix-product profiles for the "Devastator" entry (amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands); the unnamed char_to_type tags only select the overload.
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Devastator", matrix_product_template::parameters_type(1,8,16,8,2,1,2,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Devastator", matrix_product_template::parameters_type(1,16,16,8,2,1,2,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Devastator", matrix_product_template::parameters_type(2,64,16,4,2,1,2,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Devastator", matrix_product_template::parameters_type(1,16,16,8,1,2,2,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,16));
+}
+// Only add_4B overloads are defined in this header; it registers no add_8B entries for "Devastator".
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/scrapper.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/scrapper.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/scrapper.hpp
new file mode 100644
index 0000000..9e1db25
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/scrapper.hpp
@@ -0,0 +1,64 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_NORTHERN_ISLANDS_SCRAPPER_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_NORTHERN_ISLANDS_SCRAPPER_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace amd{
+namespace northern_islands{
+namespace scrapper{
+// Matrix-product profiles for the "Scrapper" entry (amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands); the unnamed char_to_type tags only select the overload.
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Scrapper", matrix_product_template::parameters_type(1,8,16,32,2,1,2,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,16));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Scrapper", matrix_product_template::parameters_type(1,8,16,8,2,2,1,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Scrapper", matrix_product_template::parameters_type(2,32,2,4,2,1,2,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Scrapper", matrix_product_template::parameters_type(1,16,16,8,2,1,2,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,16));
+}
+// Only add_4B overloads are defined in this header; it registers no add_8B entries for "Scrapper".
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif