Posted to commits@mahout.apache.org by ap...@apache.org on 2016/06/08 21:39:58 UTC
[01/51] [partial] mahout git commit: (nojira) add native-viennaCL
module to codebase. closes apache/mahout#241
Repository: mahout
Updated Branches:
refs/heads/master 1fca0743a -> f7c1f8026
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/qr-method-common.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/qr-method-common.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/qr-method-common.hpp
new file mode 100644
index 0000000..7250631
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/qr-method-common.hpp
@@ -0,0 +1,188 @@
+#ifndef VIENNACL_LINALG_QR_METHOD_COMMON_HPP
+#define VIENNACL_LINALG_QR_METHOD_COMMON_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include <cmath>
+
+#ifdef VIENNACL_WITH_OPENCL
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/linalg/opencl/kernels/svd.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+#include "viennacl/linalg/cuda/matrix_operations.hpp"
+#endif
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+
+//#include <boost/numeric/ublas/vector.hpp>
+//#include <boost/numeric/ublas/io.hpp>
+
+/** @file viennacl/linalg/qr-method-common.hpp
+ @brief Common routines used for the QR method and SVD. Experimental.
+*/
+
+namespace viennacl
+{
+namespace linalg
+{
+
+const std::string SVD_HOUSEHOLDER_UPDATE_QR_KERNEL = "house_update_QR";
+const std::string SVD_MATRIX_TRANSPOSE_KERNEL = "transpose_inplace";
+const std::string SVD_INVERSE_SIGNS_KERNEL = "inverse_signs";
+const std::string SVD_GIVENS_PREV_KERNEL = "givens_prev";
+const std::string SVD_FINAL_ITER_UPDATE_KERNEL = "final_iter_update";
+const std::string SVD_UPDATE_QR_COLUMN_KERNEL = "update_qr_column";
+const std::string SVD_HOUSEHOLDER_UPDATE_A_LEFT_KERNEL = "house_update_A_left";
+const std::string SVD_HOUSEHOLDER_UPDATE_A_RIGHT_KERNEL = "house_update_A_right";
+const std::string SVD_HOUSEHOLDER_UPDATE_QL_KERNEL = "house_update_QL";
+
+namespace detail
+{
+static const double EPS = 1e-10;
+static const vcl_size_t ITER_MAX = 50;
+
+template <typename SCALARTYPE>
+SCALARTYPE pythag(SCALARTYPE a, SCALARTYPE b)
+{
+ return std::sqrt(a*a + b*b);
+}
+
+template <typename SCALARTYPE>
+SCALARTYPE sign(SCALARTYPE val)
+{
+ return (val >= 0) ? SCALARTYPE(1) : SCALARTYPE(-1);
+}
+
+// DEPRECATED: Replace with viennacl::linalg::norm_2
+template <typename VectorType>
+typename VectorType::value_type norm_lcl(VectorType const & x, vcl_size_t size)
+{
+ typename VectorType::value_type x_norm = 0.0;
+ for(vcl_size_t i = 0; i < size; i++)
+ x_norm += std::pow(x[i], 2);
+ return std::sqrt(x_norm);
+}
+
+template <typename VectorType>
+void normalize(VectorType & x, vcl_size_t size)
+{
+ typename VectorType::value_type x_norm = norm_lcl(x, size);
+ for(vcl_size_t i = 0; i < size; i++)
+ x[i] /= x_norm;
+}
+
+
+
+template <typename VectorType>
+void householder_vector(VectorType & v, vcl_size_t start)
+{
+ typedef typename VectorType::value_type ScalarType;
+ ScalarType x_norm = norm_lcl(v, v.size());
+ ScalarType alpha = -sign(v[start]) * x_norm;
+ v[start] += alpha;
+ normalize(v, v.size());
+}
+
+template <typename SCALARTYPE>
+void transpose(matrix_base<SCALARTYPE> & A)
+{
+ (void)A;
+#ifdef VIENNACL_WITH_OPENCL
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ if(A.row_major())
+ {
+ viennacl::linalg::opencl::kernels::svd<SCALARTYPE, row_major>::init(ctx);
+ viennacl::ocl::kernel & kernel = viennacl::ocl::get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE, row_major>::program_name(), SVD_MATRIX_TRANSPOSE_KERNEL);
+
+ viennacl::ocl::enqueue(kernel(A,
+ static_cast<cl_uint>(A.internal_size1()),
+ static_cast<cl_uint>(A.internal_size2())
+ )
+ );
+ }
+ else
+ {
+ viennacl::linalg::opencl::kernels::svd<SCALARTYPE, column_major>::init(ctx);
+ viennacl::ocl::kernel & kernel = viennacl::ocl::get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE, column_major>::program_name(), SVD_MATRIX_TRANSPOSE_KERNEL);
+
+ viennacl::ocl::enqueue(kernel(A,
+ static_cast<cl_uint>(A.internal_size1()),
+ static_cast<cl_uint>(A.internal_size2())
+ )
+ );
+ }
+
+#endif
+}
+
+
+
+template <typename T>
+void cdiv(T xr, T xi, T yr, T yi, T& cdivr, T& cdivi)
+{
+ // Complex scalar division.
+ T r;
+ T d;
+ if (std::fabs(yr) > std::fabs(yi))
+ {
+ r = yi / yr;
+ d = yr + r * yi;
+ cdivr = (xr + r * xi) / d;
+ cdivi = (xi - r * xr) / d;
+ }
+ else
+ {
+ r = yr / yi;
+ d = yi + r * yr;
+ cdivr = (r * xr + xi) / d;
+ cdivi = (r * xi - xr) / d;
+ }
+}
+
+
+template<typename SCALARTYPE>
+void prepare_householder_vector(
+ matrix_base<SCALARTYPE>& A,
+ vector_base<SCALARTYPE>& D,
+ vcl_size_t size,
+ vcl_size_t row_start,
+ vcl_size_t col_start,
+ vcl_size_t start,
+ bool is_column
+ )
+{
+ //boost::numeric::ublas::vector<SCALARTYPE> tmp = boost::numeric::ublas::scalar_vector<SCALARTYPE>(size, 0);
+ std::vector<SCALARTYPE> tmp(size);
+ copy_vec(A, D, row_start, col_start, is_column);
+ fast_copy(D.begin(), D.begin() + vcl_ptrdiff_t(size - start), tmp.begin() + vcl_ptrdiff_t(start));
+
+ detail::householder_vector(tmp, start);
+ fast_copy(tmp, D);
+}
+
+} //detail
+}
+}
+
+#endif
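For illustration, a minimal host-only sketch of how the detail helpers in this header behave (assuming the viennacl directory of this module is on the include path and no OpenCL/CUDA backend macro is defined, so everything runs as plain C++):

#include <cstdio>
#include <vector>
#include "viennacl/linalg/qr-method-common.hpp"

int main()
{
  // pythag(a, b) computes sqrt(a*a + b*b) -> pythag(3, 4) == 5.
  double h = viennacl::linalg::detail::pythag(3.0, 4.0);

  // householder_vector() overwrites v with the normalized Householder vector that
  // reflects the original v onto a coordinate axis.
  std::vector<double> v;
  v.push_back(3.0); v.push_back(4.0); v.push_back(0.0);
  viennacl::linalg::detail::householder_vector(v, 0);

  // cdiv() is the scaled complex division used by the QR iteration:
  // (1 + 2i) / (3 + 4i) = 0.44 + 0.08i.
  double re = 0.0, im = 0.0;
  viennacl::linalg::detail::cdiv(1.0, 2.0, 3.0, 4.0, re, im);

  std::printf("pythag: %f  v[0]: %f  re: %f  im: %f\n", h, v[0], re, im);
  return 0;
}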
[46/51] [partial] mahout git commit: (nojira) add native-viennaCL
module to codebase. closes apache/mahout#241
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/compressed_compressed_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/compressed_compressed_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/compressed_compressed_matrix.hpp
new file mode 100644
index 0000000..f1719a2
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/compressed_compressed_matrix.hpp
@@ -0,0 +1,619 @@
+#ifndef VIENNACL_COMPRESSED_COMPRESSED_MATRIX_HPP_
+#define VIENNACL_COMPRESSED_COMPRESSED_MATRIX_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/compressed_compressed_matrix.hpp
+ @brief Implementation of the compressed_compressed_matrix class (CSR format with a relatively small number of nonzero rows)
+*/
+
+#include <vector>
+#include <list>
+#include <map>
+#include "viennacl/forwards.h"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/tools/entry_proxy.hpp"
+
+namespace viennacl
+{
+namespace detail
+{
+ template<typename CPUMatrixT, typename NumericT>
+ void copy_impl(const CPUMatrixT & cpu_matrix,
+ compressed_compressed_matrix<NumericT> & gpu_matrix,
+ vcl_size_t nonzero_rows,
+ vcl_size_t nonzeros)
+ {
+ assert( (gpu_matrix.size1() == 0 || viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+ assert( (gpu_matrix.size2() == 0 || viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+ viennacl::backend::typesafe_host_array<unsigned int> row_buffer(gpu_matrix.handle1(), nonzero_rows + 1);
+ viennacl::backend::typesafe_host_array<unsigned int> row_indices(gpu_matrix.handle3(), nonzero_rows);
+ viennacl::backend::typesafe_host_array<unsigned int> col_buffer(gpu_matrix.handle2(), nonzeros);
+ std::vector<NumericT> elements(nonzeros);
+
+ vcl_size_t row_index = 0;
+ vcl_size_t data_index = 0;
+
+ for (typename CPUMatrixT::const_iterator1 row_it = cpu_matrix.begin1();
+ row_it != cpu_matrix.end1();
+ ++row_it)
+ {
+ bool row_empty = true;
+
+ for (typename CPUMatrixT::const_iterator2 col_it = row_it.begin();
+ col_it != row_it.end();
+ ++col_it)
+ {
+ NumericT entry = *col_it;
+ if (entry < 0 || entry > 0) // entry != 0 without compiler warnings
+ {
+ if (row_empty)
+ {
+ assert(row_index < nonzero_rows && bool("Provided count of nonzero rows exceeded!"));
+
+ row_empty = false;
+ row_buffer.set(row_index, data_index);
+ row_indices.set(row_index, col_it.index1());
+ ++row_index;
+ }
+
+ col_buffer.set(data_index, col_it.index2());
+ elements[data_index] = entry;
+ ++data_index;
+ }
+ }
+ }
+ row_buffer.set(row_index, data_index);
+
+ gpu_matrix.set(row_buffer.get(),
+ row_indices.get(),
+ col_buffer.get(),
+ &elements[0],
+ cpu_matrix.size1(),
+ cpu_matrix.size2(),
+ nonzero_rows,
+ nonzeros);
+ }
+}
+
+//provide copy-operation:
+/** @brief Copies a sparse matrix from the host to the OpenCL device (either GPU or multi-core CPU)
+ *
+ * There are some type requirements on the CPUMatrixT type (fulfilled by e.g. boost::numeric::ublas):
+ * - .size1() returns the number of rows
+ * - .size2() returns the number of columns
+ * - const_iterator1 is a type definition for an iterator along increasing row indices
+ * - const_iterator2 is a type definition for an iterator along increasing column indices
+ * - The const_iterator1 type provides an iterator of type const_iterator2 via member functions .begin() and .end() that iterates along column indices in the current row.
+ * - The types const_iterator1 and const_iterator2 provide member functions .index1() and .index2() that return the current row and column indices respectively.
+ * - Dereferencing an object of type const_iterator2 returns the entry.
+ *
+ * @param cpu_matrix A sparse matrix on the host.
+ * @param gpu_matrix A compressed_compressed_matrix from ViennaCL
+ */
+template<typename CPUMatrixT, typename NumericT>
+void copy(const CPUMatrixT & cpu_matrix,
+ compressed_compressed_matrix<NumericT> & gpu_matrix )
+{
+ //std::cout << "copy for (" << cpu_matrix.size1() << ", " << cpu_matrix.size2() << ", " << cpu_matrix.nnz() << ")" << std::endl;
+
+ if ( cpu_matrix.size1() > 0 && cpu_matrix.size2() > 0 )
+ {
+ //determine nonzero rows and total nonzeros:
+ vcl_size_t num_entries = 0;
+ vcl_size_t nonzero_rows = 0;
+ for (typename CPUMatrixT::const_iterator1 row_it = cpu_matrix.begin1();
+ row_it != cpu_matrix.end1();
+ ++row_it)
+ {
+ bool row_empty = true;
+ for (typename CPUMatrixT::const_iterator2 col_it = row_it.begin();
+ col_it != row_it.end();
+ ++col_it)
+ {
+ NumericT val = *col_it;
+ if (val < 0 || val > 0) // val != 0 without compiler warnings
+ {
+ ++num_entries;
+
+ if (row_empty)
+ {
+ row_empty = false;
+ ++nonzero_rows;
+ }
+ }
+ }
+ }
+
+ if (num_entries == 0) //we copy an empty matrix
+ num_entries = 1;
+
+ //set up matrix entries:
+ viennacl::detail::copy_impl(cpu_matrix, gpu_matrix, nonzero_rows, num_entries);
+ }
+}
+
+
+//adapted for std::vector< std::map < > > argument:
+/** @brief Copies a sparse square matrix in the std::vector< std::map < > > format to an OpenCL device. Use viennacl::tools::sparse_matrix_adapter for non-square matrices.
+ *
+ * @param cpu_matrix A sparse square matrix on the host using STL types
+ * @param gpu_matrix A compressed_compressed_matrix from ViennaCL
+ */
+template<typename SizeT, typename NumericT>
+void copy(const std::vector< std::map<SizeT, NumericT> > & cpu_matrix,
+ compressed_compressed_matrix<NumericT> & gpu_matrix )
+{
+ vcl_size_t nonzero_rows = 0;
+ vcl_size_t nonzeros = 0;
+ vcl_size_t max_col = 0;
+ for (vcl_size_t i=0; i<cpu_matrix.size(); ++i)
+ {
+ if (cpu_matrix[i].size() > 0)
+ ++nonzero_rows;
+ nonzeros += cpu_matrix[i].size();
+ if (cpu_matrix[i].size() > 0)
+ max_col = std::max<vcl_size_t>(max_col, (cpu_matrix[i].rbegin())->first);
+ }
+
+ viennacl::detail::copy_impl(tools::const_sparse_matrix_adapter<NumericT, SizeT>(cpu_matrix, cpu_matrix.size(), max_col + 1),
+ gpu_matrix,
+ nonzero_rows,
+ nonzeros);
+}
+
+
+//
+// gpu to cpu:
+//
+/** @brief Copies a sparse matrix from the OpenCL device (either GPU or multi-core CPU) to the host.
+ *
+ * There are two type requirements on the CPUMatrixT type (fulfilled by e.g. boost::numeric::ublas):
+ * - resize(rows, cols) A resize function to bring the matrix into the correct size
+ * - operator(i,j) Write new entries via the parenthesis operator
+ *
+ * @param gpu_matrix A compressed_compressed_matrix from ViennaCL
+ * @param cpu_matrix A sparse matrix on the host.
+ */
+template<typename CPUMatrixT, typename NumericT>
+void copy(const compressed_compressed_matrix<NumericT> & gpu_matrix,
+ CPUMatrixT & cpu_matrix )
+{
+ assert( (cpu_matrix.size1() == gpu_matrix.size1()) && bool("Size mismatch") );
+ assert( (cpu_matrix.size2() == gpu_matrix.size2()) && bool("Size mismatch") );
+
+ if ( gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0 )
+ {
+ //get raw data from memory:
+ viennacl::backend::typesafe_host_array<unsigned int> row_buffer(gpu_matrix.handle1(), gpu_matrix.nnz1() + 1);
+ viennacl::backend::typesafe_host_array<unsigned int> row_indices(gpu_matrix.handle3(), gpu_matrix.nnz1());
+ viennacl::backend::typesafe_host_array<unsigned int> col_buffer(gpu_matrix.handle2(), gpu_matrix.nnz());
+ std::vector<NumericT> elements(gpu_matrix.nnz());
+
+ //std::cout << "GPU->CPU, nonzeros: " << gpu_matrix.nnz() << std::endl;
+
+ viennacl::backend::memory_read(gpu_matrix.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
+ viennacl::backend::memory_read(gpu_matrix.handle3(), 0, row_indices.raw_size(), row_indices.get());
+ viennacl::backend::memory_read(gpu_matrix.handle2(), 0, col_buffer.raw_size(), col_buffer.get());
+ viennacl::backend::memory_read(gpu_matrix.handle(), 0, sizeof(NumericT)* gpu_matrix.nnz(), &(elements[0]));
+
+ //fill the cpu_matrix:
+ vcl_size_t data_index = 0;
+ for (vcl_size_t i = 1; i < row_buffer.size(); ++i)
+ {
+ while (data_index < row_buffer[i])
+ {
+ if (col_buffer[data_index] >= gpu_matrix.size2())
+ {
+ std::cerr << "ViennaCL encountered invalid data at colbuffer[" << data_index << "]: " << col_buffer[data_index] << std::endl;
+ return;
+ }
+
+ NumericT val = elements[data_index];
+ if (val < 0 || val > 0) // val != 0 without compiler warning
+ cpu_matrix(row_indices[i-1], col_buffer[data_index]) = val;
+ ++data_index;
+ }
+ }
+ }
+}
+
+
+/** @brief Copies a sparse matrix from an OpenCL device to the host. The host type is the std::vector< std::map < > > format.
+ *
+ * @param gpu_matrix A compressed_compressed_matrix from ViennaCL
+ * @param cpu_matrix A sparse matrix on the host.
+ */
+template<typename NumericT>
+void copy(const compressed_compressed_matrix<NumericT> & gpu_matrix,
+ std::vector< std::map<unsigned int, NumericT> > & cpu_matrix)
+{
+ tools::sparse_matrix_adapter<NumericT> temp(cpu_matrix, cpu_matrix.size(), cpu_matrix.size());
+ copy(gpu_matrix, temp);
+}
+
+
+//////////////////////// compressed_compressed_matrix //////////////////////////
+/** @brief A sparse square matrix in compressed sparse rows format optimized for the case that only a few rows carry nonzero entries.
+ *
+ * The difference from the 'standard' CSR format is an additional array 'row_indices', so that the i-th set of entries in the CSR layout belongs to row row_indices[i].
+ *
+ * @tparam NumericT The floating point type (either float or double, checked at compile time)
+ */
+template<class NumericT>
+class compressed_compressed_matrix
+{
+public:
+ typedef viennacl::backend::mem_handle handle_type;
+ typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<NumericT>::ResultType> value_type;
+ typedef vcl_size_t size_type;
+
+ /** @brief Default construction of a compressed matrix. No memory is allocated */
+ compressed_compressed_matrix() : rows_(0), cols_(0), nonzero_rows_(0), nonzeros_(0) {}
+
+ /** @brief Construction of a compressed matrix with the supplied number of rows and columns. If the number of nonzeros is positive, memory is allocated
+ *
+ * @param rows Number of rows
+ * @param cols Number of columns
+ * @param nonzero_rows Optional number of nonzero rows for memory preallocation
+ * @param nonzeros Optional number of nonzeros for memory preallocation
+ * @param ctx Context in which to create the matrix. Uses the default context if omitted
+ */
+ explicit compressed_compressed_matrix(vcl_size_t rows, vcl_size_t cols, vcl_size_t nonzero_rows = 0, vcl_size_t nonzeros = 0, viennacl::context ctx = viennacl::context())
+ : rows_(rows), cols_(cols), nonzero_rows_(nonzero_rows), nonzeros_(nonzeros)
+ {
+ row_buffer_.switch_active_handle_id(ctx.memory_type());
+ row_indices_.switch_active_handle_id(ctx.memory_type());
+ col_buffer_.switch_active_handle_id(ctx.memory_type());
+ elements_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+ if (ctx.memory_type() == OPENCL_MEMORY)
+ {
+ row_buffer_.opencl_handle().context(ctx.opencl_context());
+ row_indices_.opencl_handle().context(ctx.opencl_context());
+ col_buffer_.opencl_handle().context(ctx.opencl_context());
+ elements_.opencl_handle().context(ctx.opencl_context());
+ }
+#endif
+ if (rows > 0)
+ {
+ viennacl::backend::memory_create(row_buffer_, viennacl::backend::typesafe_host_array<unsigned int>().element_size() * (rows + 1), ctx);
+ }
+ if (nonzeros > 0)
+ {
+ viennacl::backend::memory_create(col_buffer_, viennacl::backend::typesafe_host_array<unsigned int>().element_size() * nonzeros, ctx);
+ viennacl::backend::memory_create(elements_, sizeof(NumericT) * nonzeros, ctx);
+ }
+ }
+
+ /** @brief Construction of a compressed matrix with the supplied number of rows and columns. If the number of nonzeros is positive, memory is allocated
+ *
+ * @param rows Number of rows
+ * @param cols Number of columns
+ * @param ctx Context in which to create the matrix
+ */
+ explicit compressed_compressed_matrix(vcl_size_t rows, vcl_size_t cols, viennacl::context ctx)
+ : rows_(rows), cols_(cols), nonzero_rows_(0), nonzeros_(0)
+ {
+ row_buffer_.switch_active_handle_id(ctx.memory_type());
+ col_buffer_.switch_active_handle_id(ctx.memory_type());
+ elements_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+ if (ctx.memory_type() == OPENCL_MEMORY)
+ {
+ row_buffer_.opencl_handle().context(ctx.opencl_context());
+ col_buffer_.opencl_handle().context(ctx.opencl_context());
+ elements_.opencl_handle().context(ctx.opencl_context());
+ }
+#endif
+ if (rows > 0)
+ {
+ viennacl::backend::memory_create(row_buffer_, viennacl::backend::typesafe_host_array<unsigned int>().element_size() * (rows + 1), ctx);
+ }
+ }
+
+ explicit compressed_compressed_matrix(viennacl::context ctx) : rows_(0), cols_(0), nonzero_rows_(0), nonzeros_(0)
+ {
+ row_buffer_.switch_active_handle_id(ctx.memory_type());
+ row_indices_.switch_active_handle_id(ctx.memory_type());
+ col_buffer_.switch_active_handle_id(ctx.memory_type());
+ elements_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+ if (ctx.memory_type() == OPENCL_MEMORY)
+ {
+ row_buffer_.opencl_handle().context(ctx.opencl_context());
+ row_indices_.opencl_handle().context(ctx.opencl_context());
+ col_buffer_.opencl_handle().context(ctx.opencl_context());
+ elements_.opencl_handle().context(ctx.opencl_context());
+ }
+#endif
+ }
+
+
+#ifdef VIENNACL_WITH_OPENCL
+ explicit compressed_compressed_matrix(cl_mem mem_row_buffer, cl_mem mem_row_indices, cl_mem mem_col_buffer, cl_mem mem_elements,
+ vcl_size_t rows, vcl_size_t cols, vcl_size_t nonzero_rows, vcl_size_t nonzeros) :
+ rows_(rows), cols_(cols), nonzero_rows_(nonzero_rows), nonzeros_(nonzeros)
+ {
+ row_buffer_.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+ row_buffer_.opencl_handle() = mem_row_buffer;
+ row_buffer_.opencl_handle().inc(); //prevents the user-provided memory from being deleted once the matrix object is destroyed.
+ row_buffer_.raw_size(sizeof(cl_uint) * (nonzero_rows + 1));
+
+ row_indices_.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+ row_indices_.opencl_handle() = mem_row_indices;
+ row_indices_.opencl_handle().inc(); //prevents the user-provided memory from being deleted once the matrix object is destroyed.
+ row_indices_.raw_size(sizeof(cl_uint) * nonzero_rows);
+
+ col_buffer_.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+ col_buffer_.opencl_handle() = mem_col_buffer;
+ col_buffer_.opencl_handle().inc(); //prevents the user-provided memory from being deleted once the matrix object is destroyed.
+ col_buffer_.raw_size(sizeof(cl_uint) * nonzeros);
+
+ elements_.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+ elements_.opencl_handle() = mem_elements;
+ elements_.opencl_handle().inc(); //prevents the user-provided memory from being deleted once the matrix object is destroyed.
+ elements_.raw_size(sizeof(NumericT) * nonzeros);
+ }
+#endif
+
+
+ /** @brief Assigns a compressed matrix from a possibly different memory domain. */
+ compressed_compressed_matrix & operator=(compressed_compressed_matrix const & other)
+ {
+ assert( (rows_ == 0 || rows_ == other.size1()) && bool("Size mismatch") );
+ assert( (cols_ == 0 || cols_ == other.size2()) && bool("Size mismatch") );
+
+ rows_ = other.size1();
+ cols_ = other.size2();
+ nonzero_rows_ = other.nnz1();
+ nonzeros_ = other.nnz();
+
+ viennacl::backend::typesafe_memory_copy<unsigned int>(other.row_buffer_, row_buffer_);
+ viennacl::backend::typesafe_memory_copy<unsigned int>(other.row_indices_, row_indices_);
+ viennacl::backend::typesafe_memory_copy<unsigned int>(other.col_buffer_, col_buffer_);
+ viennacl::backend::typesafe_memory_copy<NumericT>(other.elements_, elements_);
+
+ return *this;
+ }
+
+
+ /** @brief Sets the row, column and value arrays of the compressed matrix
+ *
+ * @param row_jumper Pointer to an array holding the indices of the first element of each nonzero row (starting with zero). E.g. row_jumper[10] returns the index of the first entry of the 11th nonzero row. The array length is 'nonzero_rows + 1'
+ * @param row_indices Array holding the indices of the nonzero rows
+ * @param col_buffer Pointer to an array holding the column index of each entry. The array length is 'nonzeros'
+ * @param elements Pointer to an array holding the entries of the sparse matrix. The array length is 'nonzeros'
+ * @param rows Number of rows of the sparse matrix
+ * @param cols Number of columns of the sparse matrix
+ * @param nonzero_rows Number of nonzero rows
+ * @param nonzeros Total number of nonzero entries
+ */
+ void set(const void * row_jumper,
+ const void * row_indices,
+ const void * col_buffer,
+ const NumericT * elements,
+ vcl_size_t rows,
+ vcl_size_t cols,
+ vcl_size_t nonzero_rows,
+ vcl_size_t nonzeros)
+ {
+ assert( (rows > 0) && bool("Error in compressed_compressed_matrix::set(): Number of rows must be larger than zero!"));
+ assert( (cols > 0) && bool("Error in compressed_compressed_matrix::set(): Number of columns must be larger than zero!"));
+ assert( (nonzero_rows > 0) && bool("Error in compressed_compressed_matrix::set(): Number of nonzero rows must be larger than zero!"));
+ assert( (nonzeros > 0) && bool("Error in compressed_compressed_matrix::set(): Number of nonzeros must be larger than zero!"));
+ //std::cout << "Setting memory: " << cols + 1 << ", " << nonzeros << std::endl;
+
+ viennacl::backend::memory_create(row_buffer_, viennacl::backend::typesafe_host_array<unsigned int>(row_buffer_).element_size() * (nonzero_rows + 1), viennacl::traits::context(row_buffer_), row_jumper);
+ viennacl::backend::memory_create(row_indices_, viennacl::backend::typesafe_host_array<unsigned int>(row_indices_).element_size() * nonzero_rows, viennacl::traits::context(row_indices_), row_indices);
+ viennacl::backend::memory_create(col_buffer_, viennacl::backend::typesafe_host_array<unsigned int>(col_buffer_).element_size() * nonzeros, viennacl::traits::context(col_buffer_), col_buffer);
+ viennacl::backend::memory_create(elements_, sizeof(NumericT) * nonzeros, viennacl::traits::context(elements_), elements);
+
+ nonzeros_ = nonzeros;
+ nonzero_rows_ = nonzero_rows;
+ rows_ = rows;
+ cols_ = cols;
+ }
+
+ /** @brief Resets all entries in the matrix back to zero without changing the matrix size. Resets the sparsity pattern. */
+ void clear()
+ {
+ viennacl::backend::typesafe_host_array<unsigned int> host_row_buffer(row_buffer_, rows_ + 1);
+ viennacl::backend::typesafe_host_array<unsigned int> host_row_indices(row_indices_, rows_ + 1);
+ viennacl::backend::typesafe_host_array<unsigned int> host_col_buffer(col_buffer_, 1);
+ std::vector<NumericT> host_elements(1);
+
+ viennacl::backend::memory_create(row_buffer_, host_row_buffer.element_size() * (rows_ + 1), viennacl::traits::context(row_buffer_), host_row_buffer.get());
+ viennacl::backend::memory_create(row_indices_, host_row_indices.element_size() * (rows_ + 1), viennacl::traits::context(row_indices_), host_row_indices.get());
+ viennacl::backend::memory_create(col_buffer_, host_col_buffer.element_size() * 1, viennacl::traits::context(col_buffer_), host_col_buffer.get());
+ viennacl::backend::memory_create(elements_, sizeof(NumericT) * 1, viennacl::traits::context(elements_), &(host_elements[0]));
+
+ nonzeros_ = 0;
+ nonzero_rows_ = 0;
+ }
+
+ /** @brief Returns the number of rows */
+ const vcl_size_t & size1() const { return rows_; }
+ /** @brief Returns the number of columns */
+ const vcl_size_t & size2() const { return cols_; }
+ /** @brief Returns the number of nonzero rows */
+ const vcl_size_t & nnz1() const { return nonzero_rows_; }
+ /** @brief Returns the number of nonzero entries */
+ const vcl_size_t & nnz() const { return nonzeros_; }
+
+ /** @brief Returns the OpenCL handle to the row index array */
+ const handle_type & handle1() const { return row_buffer_; }
+ /** @brief Returns the OpenCL handle to the column index array */
+ const handle_type & handle2() const { return col_buffer_; }
+ /** @brief Returns the OpenCL handle to the array of nonzero row indices */
+ const handle_type & handle3() const { return row_indices_; }
+ /** @brief Returns the OpenCL handle to the matrix entry array */
+ const handle_type & handle() const { return elements_; }
+
+ /** @brief Returns the OpenCL handle to the row index array */
+ handle_type & handle1() { return row_buffer_; }
+ /** @brief Returns the OpenCL handle to the column index array */
+ handle_type & handle2() { return col_buffer_; }
+ /** @brief Returns the OpenCL handle to the array of nonzero row indices */
+ handle_type & handle3() { return row_indices_; }
+ /** @brief Returns the OpenCL handle to the matrix entry array */
+ handle_type & handle() { return elements_; }
+
+ void switch_memory_context(viennacl::context new_ctx)
+ {
+ viennacl::backend::switch_memory_context<unsigned int>(row_buffer_, new_ctx);
+ viennacl::backend::switch_memory_context<unsigned int>(row_indices_, new_ctx);
+ viennacl::backend::switch_memory_context<unsigned int>(col_buffer_, new_ctx);
+ viennacl::backend::switch_memory_context<NumericT>(elements_, new_ctx);
+ }
+
+ viennacl::memory_types memory_context() const
+ {
+ return row_buffer_.get_active_handle_id();
+ }
+
+private:
+
+ vcl_size_t rows_;
+ vcl_size_t cols_;
+ vcl_size_t nonzero_rows_;
+ vcl_size_t nonzeros_;
+ handle_type row_buffer_;
+ handle_type row_indices_;
+ handle_type col_buffer_;
+ handle_type elements_;
+};
+
+
+
+//
+// Specify available operations:
+//
+
+/** \cond */
+
+namespace linalg
+{
+namespace detail
+{
+ // x = A * y
+ template<typename T>
+ struct op_executor<vector_base<T>, op_assign, vector_expression<const compressed_compressed_matrix<T>, const vector_base<T>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const compressed_compressed_matrix<T>, const vector_base<T>, op_prod> const & rhs)
+ {
+ // check for the special case x = A * x
+ if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+ {
+ viennacl::vector<T> temp(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+ lhs = temp;
+ }
+ else
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), lhs, T(0));
+ }
+ };
+
+ template<typename T>
+ struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const compressed_compressed_matrix<T>, const vector_base<T>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const compressed_compressed_matrix<T>, const vector_base<T>, op_prod> const & rhs)
+ {
+ // check for the special case x += A * x
+ if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+ {
+ viennacl::vector<T> temp(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+ lhs += temp;
+ }
+ else
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), lhs, T(1));
+ }
+ };
+
+ template<typename T>
+ struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const compressed_compressed_matrix<T>, const vector_base<T>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const compressed_compressed_matrix<T>, const vector_base<T>, op_prod> const & rhs)
+ {
+ // check for the special case x -= A * x
+ if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+ {
+ viennacl::vector<T> temp(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+ lhs -= temp;
+ }
+ else
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(-1), lhs, T(1));
+ }
+ };
+
+
+ // x = A * vec_op
+ template<typename T, typename LHS, typename RHS, typename OP>
+ struct op_executor<vector_base<T>, op_assign, vector_expression<const compressed_compressed_matrix<T>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const compressed_compressed_matrix<T>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+ {
+ viennacl::vector<T> temp(rhs.rhs());
+ viennacl::linalg::prod_impl(rhs.lhs(), temp, lhs);
+ }
+ };
+
+ // x += A * vec_op
+ template<typename T, typename LHS, typename RHS, typename OP>
+ struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const compressed_compressed_matrix<T>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const compressed_compressed_matrix<T>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+ {
+ viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+ viennacl::vector<T> temp_result(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+ lhs += temp_result;
+ }
+ };
+
+ // x -= A * vec_op
+ template<typename T, typename LHS, typename RHS, typename OP>
+ struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const compressed_compressed_matrix<T>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const compressed_compressed_matrix<T>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+ {
+ viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+ viennacl::vector<T> temp_result(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+ lhs -= temp_result;
+ }
+ };
+
+} // namespace detail
+} // namespace linalg
+
+/** \endcond */
+}
+
+#endif
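As a usage note, the copy() overloads in this header allow a round trip through the STL format described above; below is a small sketch (assuming the viennacl directory of this module is on the include path and the default host memory backend, i.e. no OpenCL/CUDA macros defined):

#include <cstdio>
#include <map>
#include <vector>
#include "viennacl/compressed_compressed_matrix.hpp"

int main()
{
  // A 1000 x 1000 matrix in which only two rows carry nonzeros -- the case this format targets.
  std::vector< std::map<unsigned int, double> > host_A(1000);
  host_A[17][3]    = 1.5;
  host_A[17][42]   = -2.0;
  host_A[911][999] = 4.0;

  viennacl::compressed_compressed_matrix<double> device_A;
  viennacl::copy(host_A, device_A);   // host -> device (or host backend)

  // Only the two nonzero rows occupy the row pointer / row index arrays:
  std::printf("nonzero rows: %lu, nonzeros: %lu\n",
              (unsigned long)device_A.nnz1(), (unsigned long)device_A.nnz());

  // device -> host copy back into the same STL structure:
  std::vector< std::map<unsigned int, double> > host_B(1000);
  viennacl::copy(device_A, host_B);
  std::printf("A(17,42) after round trip: %f\n", host_B[17][42]);
  return 0;
}

Note that the std::vector< std::map<> > overload deduces the column count from the largest column index, which is why the sketch places an entry in column 999 so the matrix comes out square.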
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/compressed_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/compressed_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/compressed_matrix.hpp
new file mode 100644
index 0000000..e42f552
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/compressed_matrix.hpp
@@ -0,0 +1,1178 @@
+#ifndef VIENNACL_COMPRESSED_MATRIX_HPP_
+#define VIENNACL_COMPRESSED_MATRIX_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/compressed_matrix.hpp
+ @brief Implementation of the compressed_matrix class
+*/
+
+#include <vector>
+#include <list>
+#include <map>
+#include "viennacl/forwards.h"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/tools/entry_proxy.hpp"
+
+#ifdef VIENNACL_WITH_UBLAS
+#include <boost/numeric/ublas/matrix_sparse.hpp>
+#endif
+
+namespace viennacl
+{
+namespace detail
+{
+
+ /** @brief Implementation of the copy of a host-based sparse matrix to the device.
+ *
+ * See convenience copy() routines for type requirements of CPUMatrixT
+ */
+ template<typename CPUMatrixT, typename NumericT, unsigned int AlignmentV>
+ void copy_impl(const CPUMatrixT & cpu_matrix,
+ compressed_matrix<NumericT, AlignmentV> & gpu_matrix,
+ vcl_size_t nonzeros)
+ {
+ assert( (gpu_matrix.size1() == 0 || viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+ assert( (gpu_matrix.size2() == 0 || viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+ viennacl::backend::typesafe_host_array<unsigned int> row_buffer(gpu_matrix.handle1(), cpu_matrix.size1() + 1);
+ viennacl::backend::typesafe_host_array<unsigned int> col_buffer(gpu_matrix.handle2(), nonzeros);
+ std::vector<NumericT> elements(nonzeros);
+
+ vcl_size_t row_index = 0;
+ vcl_size_t data_index = 0;
+
+ for (typename CPUMatrixT::const_iterator1 row_it = cpu_matrix.begin1();
+ row_it != cpu_matrix.end1();
+ ++row_it)
+ {
+ row_buffer.set(row_index, data_index);
+ ++row_index;
+
+ for (typename CPUMatrixT::const_iterator2 col_it = row_it.begin();
+ col_it != row_it.end();
+ ++col_it)
+ {
+ col_buffer.set(data_index, col_it.index2());
+ elements[data_index] = *col_it;
+ ++data_index;
+ }
+ data_index = viennacl::tools::align_to_multiple<vcl_size_t>(data_index, AlignmentV); //take care of alignment
+ }
+ row_buffer.set(row_index, data_index);
+
+ gpu_matrix.set(row_buffer.get(),
+ col_buffer.get(),
+ &elements[0],
+ cpu_matrix.size1(),
+ cpu_matrix.size2(),
+ nonzeros);
+ }
+}
+
+//
+// host to device:
+//
+
+//provide copy-operation:
+/** @brief Copies a sparse matrix from the host to the OpenCL device (either GPU or multi-core CPU)
+ *
+ * There are some type requirements on the CPUMatrixT type (fulfilled by e.g. boost::numeric::ublas):
+ * - .size1() returns the number of rows
+ * - .size2() returns the number of columns
+ * - const_iterator1 is a type definition for an iterator along increasing row indices
+ * - const_iterator2 is a type definition for an iterator along increasing column indices
+ * - The const_iterator1 type provides an iterator of type const_iterator2 via member functions .begin() and .end() that iterates along column indices in the current row.
+ * - The types const_iterator1 and const_iterator2 provide member functions .index1() and .index2() that return the current row and column indices respectively.
+ * - Dereferencing an object of type const_iterator2 returns the entry.
+ *
+ * @param cpu_matrix A sparse matrix on the host.
+ * @param gpu_matrix A compressed_matrix from ViennaCL
+ */
+template<typename CPUMatrixT, typename NumericT, unsigned int AlignmentV>
+void copy(const CPUMatrixT & cpu_matrix,
+ compressed_matrix<NumericT, AlignmentV> & gpu_matrix )
+{
+ if ( cpu_matrix.size1() > 0 && cpu_matrix.size2() > 0 )
+ {
+ //determine nonzeros:
+ vcl_size_t num_entries = 0;
+ for (typename CPUMatrixT::const_iterator1 row_it = cpu_matrix.begin1();
+ row_it != cpu_matrix.end1();
+ ++row_it)
+ {
+ vcl_size_t entries_per_row = 0;
+ for (typename CPUMatrixT::const_iterator2 col_it = row_it.begin();
+ col_it != row_it.end();
+ ++col_it)
+ {
+ ++entries_per_row;
+ }
+ num_entries += viennacl::tools::align_to_multiple<vcl_size_t>(entries_per_row, AlignmentV);
+ }
+
+ if (num_entries == 0) //we copy an empty matrix
+ num_entries = 1;
+
+ //set up matrix entries:
+ viennacl::detail::copy_impl(cpu_matrix, gpu_matrix, num_entries);
+ }
+}
+
+
+//adapted for std::vector< std::map < > > argument:
+/** @brief Copies a sparse square matrix in the std::vector< std::map < > > format to an OpenCL device. Use viennacl::tools::sparse_matrix_adapter for non-square matrices.
+ *
+ * @param cpu_matrix A sparse square matrix on the host using STL types
+ * @param gpu_matrix A compressed_matrix from ViennaCL
+ */
+template<typename SizeT, typename NumericT, unsigned int AlignmentV>
+void copy(const std::vector< std::map<SizeT, NumericT> > & cpu_matrix,
+ compressed_matrix<NumericT, AlignmentV> & gpu_matrix )
+{
+ vcl_size_t nonzeros = 0;
+ vcl_size_t max_col = 0;
+ for (vcl_size_t i=0; i<cpu_matrix.size(); ++i)
+ {
+ if (cpu_matrix[i].size() > 0)
+ nonzeros += ((cpu_matrix[i].size() - 1) / AlignmentV + 1) * AlignmentV;
+ if (cpu_matrix[i].size() > 0)
+ max_col = std::max<vcl_size_t>(max_col, (cpu_matrix[i].rbegin())->first);
+ }
+
+ viennacl::detail::copy_impl(tools::const_sparse_matrix_adapter<NumericT, SizeT>(cpu_matrix, cpu_matrix.size(), max_col + 1),
+ gpu_matrix,
+ nonzeros);
+}
+
+#ifdef VIENNACL_WITH_UBLAS
+/** @brief Convenience routine for copying a sparse uBLAS matrix to a ViennaCL matrix.
+ *
+ * Optimization which copies the data directly from the internal uBLAS buffers.
+ */
+template<typename ScalarType, typename F, vcl_size_t IB, typename IA, typename TA>
+void copy(const boost::numeric::ublas::compressed_matrix<ScalarType, F, IB, IA, TA> & ublas_matrix,
+ viennacl::compressed_matrix<ScalarType, 1> & gpu_matrix)
+{
+ assert( (gpu_matrix.size1() == 0 || viennacl::traits::size1(ublas_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+ assert( (gpu_matrix.size2() == 0 || viennacl::traits::size2(ublas_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+ //we just need to copy the CSR arrays:
+ viennacl::backend::typesafe_host_array<unsigned int> row_buffer(gpu_matrix.handle1(), ublas_matrix.size1() + 1);
+ for (vcl_size_t i=0; i<=ublas_matrix.size1(); ++i)
+ row_buffer.set(i, ublas_matrix.index1_data()[i]);
+
+ viennacl::backend::typesafe_host_array<unsigned int> col_buffer(gpu_matrix.handle2(), ublas_matrix.nnz());
+ for (vcl_size_t i=0; i<ublas_matrix.nnz(); ++i)
+ col_buffer.set(i, ublas_matrix.index2_data()[i]);
+
+ gpu_matrix.set(row_buffer.get(),
+ col_buffer.get(),
+ &(ublas_matrix.value_data()[0]),
+ ublas_matrix.size1(),
+ ublas_matrix.size2(),
+ ublas_matrix.nnz());
+
+}
+#endif
+
+#ifdef VIENNACL_WITH_ARMADILLO
+/** @brief Convenience routine for copying a sparse Armadillo matrix to a ViennaCL matrix.
+ *
+ * Since Armadillo stores the matrix in column-major (CSC) format, while ViennaCL uses a row-major (CSR) format, the sparsity data has to be reordered.
+ * This is done fairly efficiently by working directly on the raw index arrays, rather than (slowly) building an STL matrix.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void copy(arma::SpMat<NumericT> const & arma_matrix,
+ viennacl::compressed_matrix<NumericT, AlignmentV> & vcl_matrix)
+{
+ assert( (vcl_matrix.size1() == 0 || static_cast<vcl_size_t>(arma_matrix.n_rows) == vcl_matrix.size1()) && bool("Size mismatch") );
+ assert( (vcl_matrix.size2() == 0 || static_cast<vcl_size_t>(arma_matrix.n_cols) == vcl_matrix.size2()) && bool("Size mismatch") );
+
+ viennacl::backend::typesafe_host_array<unsigned int> row_buffer(vcl_matrix.handle1(), arma_matrix.n_rows + 1);
+ viennacl::backend::typesafe_host_array<unsigned int> col_buffer(vcl_matrix.handle2(), arma_matrix.n_nonzero);
+ viennacl::backend::typesafe_host_array<NumericT > value_buffer(vcl_matrix.handle(), arma_matrix.n_nonzero);
+
+ // Step 1: Count number of nonzeros in each row
+ for (vcl_size_t col=0; col < static_cast<vcl_size_t>(arma_matrix.n_cols); ++col)
+ {
+ vcl_size_t col_begin = static_cast<vcl_size_t>(arma_matrix.col_ptrs[col]);
+ vcl_size_t col_end = static_cast<vcl_size_t>(arma_matrix.col_ptrs[col+1]);
+ for (vcl_size_t i = col_begin; i < col_end; ++i)
+ {
+ unsigned int row = arma_matrix.row_indices[i];
+ row_buffer.set(row, row_buffer[row] + 1);
+ }
+ }
+
+ // Step 2: Exclusive scan on row_buffer to obtain offsets
+ unsigned int offset = 0;
+ for (vcl_size_t i=0; i<row_buffer.size(); ++i)
+ {
+ unsigned int tmp = row_buffer[i];
+ row_buffer.set(i, offset);
+ offset += tmp;
+ }
+
+ // Step 3: Fill data
+ std::vector<unsigned int> row_offsets(arma_matrix.n_rows);
+ for (vcl_size_t col=0; col < static_cast<vcl_size_t>(arma_matrix.n_cols); ++col)
+ {
+ vcl_size_t col_begin = static_cast<vcl_size_t>(arma_matrix.col_ptrs[col]);
+ vcl_size_t col_end = static_cast<vcl_size_t>(arma_matrix.col_ptrs[col+1]);
+ for (vcl_size_t i = col_begin; i < col_end; ++i)
+ {
+ unsigned int row = arma_matrix.row_indices[i];
+ col_buffer.set(row_buffer[row] + row_offsets[row], col);
+ value_buffer.set(row_buffer[row] + row_offsets[row], arma_matrix.values[i]);
+ row_offsets[row] += 1;
+ }
+ }
+
+ vcl_matrix.set(row_buffer.get(), col_buffer.get(), reinterpret_cast<NumericT*>(value_buffer.get()),
+ arma_matrix.n_rows, arma_matrix.n_cols, arma_matrix.n_nonzero);
+}
+#endif
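// Illustrative sketch of the Step 1-3 pattern above (count per row, exclusive scan, fill),
// written on plain std::vector buffers so the CSC -> CSR conversion can be followed
// without Armadillo or ViennaCL types; all identifiers below are local to this example.
#include <cstdio>
#include <cstddef>
#include <vector>

int main()
{
  // The 2 x 3 matrix [[1, 0, 2], [0, 3, 0]] in column-major (CSC) form:
  std::vector<std::size_t>  col_ptrs;  // column start offsets, length cols + 1
  std::vector<unsigned int> row_idx;   // row index of each stored value
  std::vector<double>       values;
  col_ptrs.push_back(0); col_ptrs.push_back(1); col_ptrs.push_back(2); col_ptrs.push_back(3);
  row_idx.push_back(0);  row_idx.push_back(1);  row_idx.push_back(0);
  values.push_back(1.0); values.push_back(3.0); values.push_back(2.0);

  std::size_t rows = 2, cols = 3, nnz = values.size();

  // Step 1: count the nonzeros in each row.
  std::vector<unsigned int> row_buffer(rows + 1, 0);
  for (std::size_t col = 0; col < cols; ++col)
    for (std::size_t i = col_ptrs[col]; i < col_ptrs[col + 1]; ++i)
      row_buffer[row_idx[i]] += 1;

  // Step 2: an exclusive scan turns the per-row counts into CSR row offsets.
  unsigned int offset = 0;
  for (std::size_t i = 0; i < row_buffer.size(); ++i)
  {
    unsigned int tmp = row_buffer[i];
    row_buffer[i] = offset;
    offset += tmp;
  }

  // Step 3: scatter column indices and values into their CSR slots.
  std::vector<unsigned int> csr_cols(nnz);
  std::vector<double>       csr_vals(nnz);
  std::vector<unsigned int> row_offsets(rows, 0);
  for (std::size_t col = 0; col < cols; ++col)
    for (std::size_t i = col_ptrs[col]; i < col_ptrs[col + 1]; ++i)
    {
      unsigned int row = row_idx[i];
      csr_cols[row_buffer[row] + row_offsets[row]] = static_cast<unsigned int>(col);
      csr_vals[row_buffer[row] + row_offsets[row]] = values[i];
      row_offsets[row] += 1;
    }

  for (std::size_t i = 0; i < nnz; ++i)
    std::printf("csr entry %lu: column %u, value %f\n", (unsigned long)i, csr_cols[i], csr_vals[i]);
  return 0;
}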
+
+#ifdef VIENNACL_WITH_EIGEN
+/** @brief Convenience routine for copying a sparse Eigen matrix to a ViennaCL matrix.
+ *
+ * Builds a temporary STL matrix. Patches for avoiding the temporary matrix welcome.
+ */
+template<typename NumericT, int flags, unsigned int AlignmentV>
+void copy(const Eigen::SparseMatrix<NumericT, flags> & eigen_matrix,
+ compressed_matrix<NumericT, AlignmentV> & gpu_matrix)
+{
+ assert( (gpu_matrix.size1() == 0 || static_cast<vcl_size_t>(eigen_matrix.rows()) == gpu_matrix.size1()) && bool("Size mismatch") );
+ assert( (gpu_matrix.size2() == 0 || static_cast<vcl_size_t>(eigen_matrix.cols()) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+ std::vector< std::map<unsigned int, NumericT> > stl_matrix(eigen_matrix.rows());
+
+ for (int k=0; k < eigen_matrix.outerSize(); ++k)
+ for (typename Eigen::SparseMatrix<NumericT, flags>::InnerIterator it(eigen_matrix, k); it; ++it)
+ stl_matrix[it.row()][it.col()] = it.value();
+
+ copy(tools::const_sparse_matrix_adapter<NumericT>(stl_matrix, eigen_matrix.rows(), eigen_matrix.cols()), gpu_matrix);
+}
+#endif
+
+
+#ifdef VIENNACL_WITH_MTL4
+/** @brief Convenience routine for copying a sparse MTL4 matrix to a ViennaCL matrix.
+ *
+ * Builds a temporary STL matrix for the copy. Patches for avoiding the temporary matrix welcome.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void copy(const mtl::compressed2D<NumericT> & cpu_matrix,
+ compressed_matrix<NumericT, AlignmentV> & gpu_matrix)
+{
+ assert( (gpu_matrix.size1() == 0 || static_cast<vcl_size_t>(cpu_matrix.num_rows()) == gpu_matrix.size1()) && bool("Size mismatch") );
+ assert( (gpu_matrix.size2() == 0 || static_cast<vcl_size_t>(cpu_matrix.num_cols()) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+ typedef mtl::compressed2D<NumericT> MatrixType;
+
+ std::vector< std::map<unsigned int, NumericT> > stl_matrix(cpu_matrix.num_rows());
+
+ using mtl::traits::range_generator;
+ using mtl::traits::range::min;
+
+ // Choose between row and column traversal
+ typedef typename min<range_generator<mtl::tag::row, MatrixType>,
+ range_generator<mtl::tag::col, MatrixType> >::type range_type;
+ range_type my_range;
+
+ // Type of outer cursor
+ typedef typename range_type::type c_type;
+ // Type of inner cursor
+ typedef typename mtl::traits::range_generator<mtl::tag::nz, c_type>::type ic_type;
+
+ // Define the property maps
+ typename mtl::traits::row<MatrixType>::type row(cpu_matrix);
+ typename mtl::traits::col<MatrixType>::type col(cpu_matrix);
+ typename mtl::traits::const_value<MatrixType>::type value(cpu_matrix);
+
+ // Now iterate over the matrix
+ for (c_type cursor(my_range.begin(cpu_matrix)), cend(my_range.end(cpu_matrix)); cursor != cend; ++cursor)
+ for (ic_type icursor(mtl::begin<mtl::tag::nz>(cursor)), icend(mtl::end<mtl::tag::nz>(cursor)); icursor != icend; ++icursor)
+ stl_matrix[row(*icursor)][col(*icursor)] = value(*icursor);
+
+ copy(tools::const_sparse_matrix_adapter<NumericT>(stl_matrix, cpu_matrix.num_rows(), cpu_matrix.num_cols()), gpu_matrix);
+}
+#endif
+
+
+
+
+
+
+
+//
+// device to host:
+//
+/** @brief Copies a sparse matrix from the OpenCL device (either GPU or multi-core CPU) to the host.
+ *
+ * There are two type requirements on the CPUMatrixT type (fulfilled by e.g. boost::numeric::ublas):
+ * - resize(rows, cols) A resize function to bring the matrix into the correct size
+ * - operator(i,j) Write new entries via the parenthesis operator
+ *
+ * @param gpu_matrix A compressed_matrix from ViennaCL
+ * @param cpu_matrix A sparse matrix on the host.
+ */
+template<typename CPUMatrixT, typename NumericT, unsigned int AlignmentV>
+void copy(const compressed_matrix<NumericT, AlignmentV> & gpu_matrix,
+ CPUMatrixT & cpu_matrix )
+{
+ assert( (viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+ assert( (viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+ if ( gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0 )
+ {
+ //get raw data from memory:
+ viennacl::backend::typesafe_host_array<unsigned int> row_buffer(gpu_matrix.handle1(), cpu_matrix.size1() + 1);
+ viennacl::backend::typesafe_host_array<unsigned int> col_buffer(gpu_matrix.handle2(), gpu_matrix.nnz());
+ std::vector<NumericT> elements(gpu_matrix.nnz());
+
+ //std::cout << "GPU->CPU, nonzeros: " << gpu_matrix.nnz() << std::endl;
+
+ viennacl::backend::memory_read(gpu_matrix.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
+ viennacl::backend::memory_read(gpu_matrix.handle2(), 0, col_buffer.raw_size(), col_buffer.get());
+ viennacl::backend::memory_read(gpu_matrix.handle(), 0, sizeof(NumericT)* gpu_matrix.nnz(), &(elements[0]));
+
+ //fill the cpu_matrix:
+ vcl_size_t data_index = 0;
+ for (vcl_size_t row = 1; row <= gpu_matrix.size1(); ++row)
+ {
+ while (data_index < row_buffer[row])
+ {
+ if (col_buffer[data_index] >= gpu_matrix.size2())
+ {
+ std::cerr << "ViennaCL encountered invalid data at colbuffer[" << data_index << "]: " << col_buffer[data_index] << std::endl;
+ return;
+ }
+
+ if (std::fabs(elements[data_index]) > static_cast<NumericT>(0))
+ cpu_matrix(row-1, static_cast<vcl_size_t>(col_buffer[data_index])) = elements[data_index];
+ ++data_index;
+ }
+ }
+ }
+}
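// Illustrative sketch of a minimal host-side matrix type that satisfies the documented
// requirements of the device-to-host copy() above (size1(), size2(), and a writable
// operator()(i, j)). Dense storage keeps the example short; all identifiers are local to
// this sketch, the viennacl directory of this module is assumed to be on the include path,
// and the default host memory backend is used.
#include <cstddef>
#include <map>
#include <vector>
#include "viennacl/compressed_matrix.hpp"

struct minimal_dense_matrix
{
  minimal_dense_matrix(std::size_t r, std::size_t c) : rows(r), cols(c), data(r * c, 0.0) {}
  std::size_t size1() const { return rows; }   // number of rows
  std::size_t size2() const { return cols; }   // number of columns
  double & operator()(std::size_t i, std::size_t j) { return data[i * cols + j]; }
  std::size_t rows, cols;
  std::vector<double> data;
};

int main()
{
  // Build a tiny 4 x 4 sparse matrix on the host (an entry in column 3 fixes the column count).
  std::vector< std::map<unsigned int, double> > host_src(4);
  host_src[1][3] = 7.0;

  viennacl::compressed_matrix<double> device_A;
  viennacl::copy(host_src, device_A);     // host -> device

  minimal_dense_matrix host_dst(4, 4);
  viennacl::copy(device_A, host_dst);     // device -> host, written via operator()(i, j)
  return host_dst(1, 3) == 7.0 ? 0 : 1;
}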
+
+
+/** @brief Copies a sparse matrix from an OpenCL device to the host. The host type is the std::vector< std::map < > > format.
+ *
+ * @param gpu_matrix A compressed_matrix from ViennaCL
+ * @param cpu_matrix A sparse matrix on the host.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void copy(const compressed_matrix<NumericT, AlignmentV> & gpu_matrix,
+ std::vector< std::map<unsigned int, NumericT> > & cpu_matrix)
+{
+ assert( (cpu_matrix.size() == gpu_matrix.size1()) && bool("Size mismatch") );
+
+ tools::sparse_matrix_adapter<NumericT> temp(cpu_matrix, gpu_matrix.size1(), gpu_matrix.size2());
+ copy(gpu_matrix, temp);
+}
+
+#ifdef VIENNACL_WITH_UBLAS
+/** @brief Convenience routine for copying a ViennaCL sparse matrix back to a sparse uBLAS matrix
+ *
+ * Directly populates the internal buffer of the uBLAS matrix, thus avoiding a temporary STL matrix.
+ */
+template<typename ScalarType, unsigned int AlignmentV, typename F, vcl_size_t IB, typename IA, typename TA>
+void copy(viennacl::compressed_matrix<ScalarType, AlignmentV> const & gpu_matrix,
+ boost::numeric::ublas::compressed_matrix<ScalarType, F, IB, IA, TA> & ublas_matrix)
+{
+ assert( (viennacl::traits::size1(ublas_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+ assert( (viennacl::traits::size2(ublas_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+ viennacl::backend::typesafe_host_array<unsigned int> row_buffer(gpu_matrix.handle1(), gpu_matrix.size1() + 1);
+ viennacl::backend::typesafe_host_array<unsigned int> col_buffer(gpu_matrix.handle2(), gpu_matrix.nnz());
+
+ viennacl::backend::memory_read(gpu_matrix.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
+ viennacl::backend::memory_read(gpu_matrix.handle2(), 0, col_buffer.raw_size(), col_buffer.get());
+
+ ublas_matrix.clear();
+ ublas_matrix.reserve(gpu_matrix.nnz());
+
+ ublas_matrix.set_filled(gpu_matrix.size1() + 1, gpu_matrix.nnz());
+
+ for (vcl_size_t i=0; i<ublas_matrix.size1() + 1; ++i)
+ ublas_matrix.index1_data()[i] = row_buffer[i];
+
+ for (vcl_size_t i=0; i<ublas_matrix.nnz(); ++i)
+ ublas_matrix.index2_data()[i] = col_buffer[i];
+
+ viennacl::backend::memory_read(gpu_matrix.handle(), 0, sizeof(ScalarType) * gpu_matrix.nnz(), &(ublas_matrix.value_data()[0]));
+
+}
+#endif
+
+#ifdef VIENNACL_WITH_ARMADILLO
+/** @brief Convenience routine for copying a ViennaCL sparse matrix back to a sparse Armadillo matrix.
+ *
+ * Performance notice: Inserting the row-major data from the ViennaCL matrix to the column-major Armadillo-matrix is likely to be slow.
+ * However, since this operation is unlikely to be performance-critical, further optimizations are postponed.
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void copy(viennacl::compressed_matrix<NumericT, AlignmentV> & vcl_matrix,
+ arma::SpMat<NumericT> & arma_matrix)
+{
+ assert( (static_cast<vcl_size_t>(arma_matrix.n_rows) == vcl_matrix.size1()) && bool("Size mismatch") );
+ assert( (static_cast<vcl_size_t>(arma_matrix.n_cols) == vcl_matrix.size2()) && bool("Size mismatch") );
+
+ if ( vcl_matrix.size1() > 0 && vcl_matrix.size2() > 0 )
+ {
+ //get raw data from memory:
+ viennacl::backend::typesafe_host_array<unsigned int> row_buffer(vcl_matrix.handle1(), vcl_matrix.size1() + 1);
+ viennacl::backend::typesafe_host_array<unsigned int> col_buffer(vcl_matrix.handle2(), vcl_matrix.nnz());
+ viennacl::backend::typesafe_host_array<NumericT> elements (vcl_matrix.handle(), vcl_matrix.nnz());
+
+ viennacl::backend::memory_read(vcl_matrix.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
+ viennacl::backend::memory_read(vcl_matrix.handle2(), 0, col_buffer.raw_size(), col_buffer.get());
+ viennacl::backend::memory_read(vcl_matrix.handle(), 0, elements.raw_size(), elements.get());
+
+ arma_matrix.zeros();
+ vcl_size_t data_index = 0;
+ for (vcl_size_t row = 1; row <= vcl_matrix.size1(); ++row)
+ {
+ while (data_index < row_buffer[row])
+ {
+ assert(col_buffer[data_index] < vcl_matrix.size2() && bool("ViennaCL encountered invalid data at col_buffer"));
+ if (elements[data_index] != static_cast<NumericT>(0.0))
+ arma_matrix(row-1, col_buffer[data_index]) = elements[data_index];
+ ++data_index;
+ }
+ }
+ }
+}
+#endif
+
+#ifdef VIENNACL_WITH_EIGEN
+/** @brief Convenience routine for copying a ViennaCL sparse matrix back to a sparse Eigen matrix */
+template<typename NumericT, int flags, unsigned int AlignmentV>
+void copy(compressed_matrix<NumericT, AlignmentV> & gpu_matrix,
+ Eigen::SparseMatrix<NumericT, flags> & eigen_matrix)
+{
+ assert( (static_cast<vcl_size_t>(eigen_matrix.rows()) == gpu_matrix.size1()) && bool("Size mismatch") );
+ assert( (static_cast<vcl_size_t>(eigen_matrix.cols()) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+ if ( gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0 )
+ {
+ //get raw data from memory:
+ viennacl::backend::typesafe_host_array<unsigned int> row_buffer(gpu_matrix.handle1(), gpu_matrix.size1() + 1);
+ viennacl::backend::typesafe_host_array<unsigned int> col_buffer(gpu_matrix.handle2(), gpu_matrix.nnz());
+ std::vector<NumericT> elements(gpu_matrix.nnz());
+
+ viennacl::backend::memory_read(gpu_matrix.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
+ viennacl::backend::memory_read(gpu_matrix.handle2(), 0, col_buffer.raw_size(), col_buffer.get());
+ viennacl::backend::memory_read(gpu_matrix.handle(), 0, sizeof(NumericT)* gpu_matrix.nnz(), &(elements[0]));
+
+ eigen_matrix.setZero();
+ vcl_size_t data_index = 0;
+ for (vcl_size_t row = 1; row <= gpu_matrix.size1(); ++row)
+ {
+ while (data_index < row_buffer[row])
+ {
+ assert(col_buffer[data_index] < gpu_matrix.size2() && bool("ViennaCL encountered invalid data at col_buffer"));
+ if (elements[data_index] != static_cast<NumericT>(0.0))
+ eigen_matrix.insert(row-1, col_buffer[data_index]) = elements[data_index];
+ ++data_index;
+ }
+ }
+ }
+}
+#endif
+
+
+
+#ifdef VIENNACL_WITH_MTL4
+/** @brief Convenience routine for copying a ViennaCL sparse matrix back to a sparse MTL4 matrix */
+template<typename NumericT, unsigned int AlignmentV>
+void copy(compressed_matrix<NumericT, AlignmentV> & gpu_matrix,
+ mtl::compressed2D<NumericT> & mtl4_matrix)
+{
+ assert( (static_cast<vcl_size_t>(mtl4_matrix.num_rows()) == gpu_matrix.size1()) && bool("Size mismatch") );
+ assert( (static_cast<vcl_size_t>(mtl4_matrix.num_cols()) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+ if ( gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0 )
+ {
+
+ //get raw data from memory:
+ viennacl::backend::typesafe_host_array<unsigned int> row_buffer(gpu_matrix.handle1(), gpu_matrix.size1() + 1);
+ viennacl::backend::typesafe_host_array<unsigned int> col_buffer(gpu_matrix.handle2(), gpu_matrix.nnz());
+ std::vector<NumericT> elements(gpu_matrix.nnz());
+
+ viennacl::backend::memory_read(gpu_matrix.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
+ viennacl::backend::memory_read(gpu_matrix.handle2(), 0, col_buffer.raw_size(), col_buffer.get());
+ viennacl::backend::memory_read(gpu_matrix.handle(), 0, sizeof(NumericT)* gpu_matrix.nnz(), &(elements[0]));
+
+ //set_to_zero(mtl4_matrix);
+ //mtl4_matrix.change_dim(gpu_matrix.size1(), gpu_matrix.size2());
+
+ mtl::matrix::inserter< mtl::compressed2D<NumericT> > ins(mtl4_matrix);
+ vcl_size_t data_index = 0;
+ for (vcl_size_t row = 1; row <= gpu_matrix.size1(); ++row)
+ {
+ while (data_index < row_buffer[row])
+ {
+ assert(col_buffer[data_index] < gpu_matrix.size2() && bool("ViennaCL encountered invalid data at col_buffer"));
+ if (elements[data_index] != static_cast<NumericT>(0.0))
+ ins(row-1, col_buffer[data_index]) << typename mtl::Collection< mtl::compressed2D<NumericT> >::value_type(elements[data_index]);
+ ++data_index;
+ }
+ }
+ }
+}
+#endif
+
+
+
+
+
+//////////////////////// compressed_matrix //////////////////////////
+/** @brief A sparse square matrix in compressed sparse rows format.
+ *
+ * @tparam NumericT The floating point type (either float or double, checked at compile time)
+ * @tparam AlignmentV The internal memory size for the entries in each row is given by (size()/AlignmentV + 1) * AlignmentV. AlignmentV must be a power of two. Best values are usually 4, 8 or 16; higher values are usually a waste of memory.
+ */
+template<class NumericT, unsigned int AlignmentV /* see VCLForwards.h */>
+class compressed_matrix
+{
+public:
+ typedef viennacl::backend::mem_handle handle_type;
+ typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<NumericT>::ResultType> value_type;
+ typedef vcl_size_t size_type;
+
+ /** @brief Default construction of a compressed matrix. No memory is allocated */
+ compressed_matrix() : rows_(0), cols_(0), nonzeros_(0), row_block_num_(0) {}
+
+ /** @brief Construction of a compressed matrix with the supplied number of rows and columns. If the number of nonzeros is positive, memory is allocated
+ *
+ * @param rows Number of rows
+ * @param cols Number of columns
+ * @param nonzeros Optional number of nonzeros for memory preallocation
+ * @param ctx Optional context in which the matrix is created (one out of multiple OpenCL contexts, CUDA, host)
+ */
+ explicit compressed_matrix(vcl_size_t rows, vcl_size_t cols, vcl_size_t nonzeros = 0, viennacl::context ctx = viennacl::context())
+ : rows_(rows), cols_(cols), nonzeros_(nonzeros), row_block_num_(0)
+ {
+ row_buffer_.switch_active_handle_id(ctx.memory_type());
+ col_buffer_.switch_active_handle_id(ctx.memory_type());
+ elements_.switch_active_handle_id(ctx.memory_type());
+ row_blocks_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+ if (ctx.memory_type() == OPENCL_MEMORY)
+ {
+ row_buffer_.opencl_handle().context(ctx.opencl_context());
+ col_buffer_.opencl_handle().context(ctx.opencl_context());
+ elements_.opencl_handle().context(ctx.opencl_context());
+ row_blocks_.opencl_handle().context(ctx.opencl_context());
+ }
+#endif
+ if (rows > 0)
+ {
+ viennacl::backend::memory_create(row_buffer_, viennacl::backend::typesafe_host_array<unsigned int>().element_size() * (rows + 1), ctx);
+ viennacl::vector_base<unsigned int> init_temporary(row_buffer_, size_type(rows+1), 0, 1);
+ init_temporary = viennacl::zero_vector<unsigned int>(size_type(rows+1), ctx);
+ }
+ if (nonzeros > 0)
+ {
+ viennacl::backend::memory_create(col_buffer_, viennacl::backend::typesafe_host_array<unsigned int>().element_size() * nonzeros, ctx);
+ viennacl::backend::memory_create(elements_, sizeof(NumericT) * nonzeros, ctx);
+ }
+ }
+
+ /** @brief Construction of a compressed matrix with the supplied number of rows and columns in the given context. No memory for nonzeros is allocated.
+ *
+ * @param rows Number of rows
+ * @param cols Number of columns
+ * @param ctx Context in which to create the matrix
+ */
+ explicit compressed_matrix(vcl_size_t rows, vcl_size_t cols, viennacl::context ctx)
+ : rows_(rows), cols_(cols), nonzeros_(0), row_block_num_(0)
+ {
+ row_buffer_.switch_active_handle_id(ctx.memory_type());
+ col_buffer_.switch_active_handle_id(ctx.memory_type());
+ elements_.switch_active_handle_id(ctx.memory_type());
+ row_blocks_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+ if (ctx.memory_type() == OPENCL_MEMORY)
+ {
+ row_buffer_.opencl_handle().context(ctx.opencl_context());
+ col_buffer_.opencl_handle().context(ctx.opencl_context());
+ elements_.opencl_handle().context(ctx.opencl_context());
+ row_blocks_.opencl_handle().context(ctx.opencl_context());
+ }
+#endif
+ if (rows > 0)
+ {
+ viennacl::backend::memory_create(row_buffer_, viennacl::backend::typesafe_host_array<unsigned int>().element_size() * (rows + 1), ctx);
+ viennacl::vector_base<unsigned int> init_temporary(row_buffer_, size_type(rows+1), 0, 1);
+ init_temporary = viennacl::zero_vector<unsigned int>(size_type(rows+1), ctx);
+ }
+ }
+
+ /** @brief Creates an empty compressed_matrix, but sets the respective context information.
+ *
+ * This is useful if you want to populate e.g. a viennacl::compressed_matrix<> on the host with copy() while the default backend is OpenCL.
+ */
+ explicit compressed_matrix(viennacl::context ctx) : rows_(0), cols_(0), nonzeros_(0), row_block_num_(0)
+ {
+ row_buffer_.switch_active_handle_id(ctx.memory_type());
+ col_buffer_.switch_active_handle_id(ctx.memory_type());
+ elements_.switch_active_handle_id(ctx.memory_type());
+ row_blocks_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+ if (ctx.memory_type() == OPENCL_MEMORY)
+ {
+ row_buffer_.opencl_handle().context(ctx.opencl_context());
+ col_buffer_.opencl_handle().context(ctx.opencl_context());
+ elements_.opencl_handle().context(ctx.opencl_context());
+ row_blocks_.opencl_handle().context(ctx.opencl_context());
+ }
+#endif
+ }
+
+
+#ifdef VIENNACL_WITH_OPENCL
+ /** @brief Wraps existing OpenCL buffers holding the compressed sparse row information.
+ *
+ * @param mem_row_buffer A buffer consisting of unsigned integers (cl_uint) holding the entry points for each row (0-based indexing). (rows+1) elements, the last element being 'nonzeros'.
+ * @param mem_col_buffer A buffer consisting of unsigned integers (cl_uint) holding the column index for each nonzero entry as stored in 'mem_elements'.
+ * @param mem_elements A buffer holding the floating point numbers for nonzeros. OpenCL type of elements must match the template 'NumericT'.
+ * @param rows Number of rows in the matrix to be wrapped.
+ * @param cols Number of columns to be wrapped.
+ * @param nonzeros Number of nonzero entries in the matrix.
+ */
+ explicit compressed_matrix(cl_mem mem_row_buffer, cl_mem mem_col_buffer, cl_mem mem_elements,
+ vcl_size_t rows, vcl_size_t cols, vcl_size_t nonzeros) :
+ rows_(rows), cols_(cols), nonzeros_(nonzeros), row_block_num_(0)
+ {
+ row_buffer_.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+ row_buffer_.opencl_handle() = mem_row_buffer;
+ row_buffer_.opencl_handle().inc(); //prevents the user-provided memory from being deleted once the matrix object is destroyed.
+ row_buffer_.raw_size(sizeof(cl_uint) * (rows + 1));
+
+ col_buffer_.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+ col_buffer_.opencl_handle() = mem_col_buffer;
+ col_buffer_.opencl_handle().inc(); //prevents the user-provided memory from being deleted once the matrix object is destroyed.
+ col_buffer_.raw_size(sizeof(cl_uint) * nonzeros);
+
+ elements_.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+ elements_.opencl_handle() = mem_elements;
+ elements_.opencl_handle().inc(); //prevents the user-provided memory from being deleted once the matrix object is destroyed.
+ elements_.raw_size(sizeof(NumericT) * nonzeros);
+
+ //generate block information for CSR-adaptive:
+ generate_row_block_information();
+ }
+#endif
+
+ /** @brief Creates a compressed matrix from the product of two compressed_matrix objects (C = A * B). */
+ compressed_matrix(matrix_expression<const compressed_matrix, const compressed_matrix, op_prod> const & proxy)
+ : rows_(0), cols_(0), nonzeros_(0), row_block_num_(0)
+ {
+ viennacl::context ctx = viennacl::traits::context(proxy.lhs());
+
+ row_buffer_.switch_active_handle_id(ctx.memory_type());
+ col_buffer_.switch_active_handle_id(ctx.memory_type());
+ elements_.switch_active_handle_id(ctx.memory_type());
+ row_blocks_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+ if (ctx.memory_type() == OPENCL_MEMORY)
+ {
+ row_buffer_.opencl_handle().context(ctx.opencl_context());
+ col_buffer_.opencl_handle().context(ctx.opencl_context());
+ elements_.opencl_handle().context(ctx.opencl_context());
+ row_blocks_.opencl_handle().context(ctx.opencl_context());
+ }
+#endif
+
+ viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
+ generate_row_block_information();
+ }
+
+ /** @brief Assignment of a compressed matrix, possibly from another memory domain. */
+ compressed_matrix & operator=(compressed_matrix const & other)
+ {
+ assert( (rows_ == 0 || rows_ == other.size1()) && bool("Size mismatch") );
+ assert( (cols_ == 0 || cols_ == other.size2()) && bool("Size mismatch") );
+
+ rows_ = other.size1();
+ cols_ = other.size2();
+ nonzeros_ = other.nnz();
+ row_block_num_ = other.row_block_num_;
+
+ viennacl::backend::typesafe_memory_copy<unsigned int>(other.row_buffer_, row_buffer_);
+ viennacl::backend::typesafe_memory_copy<unsigned int>(other.col_buffer_, col_buffer_);
+ viennacl::backend::typesafe_memory_copy<unsigned int>(other.row_blocks_, row_blocks_);
+ viennacl::backend::typesafe_memory_copy<NumericT>(other.elements_, elements_);
+
+ return *this;
+ }
+
+ /** @brief Assigns the product of two compressed_matrix objects (C = A * B) to this matrix. */
+ compressed_matrix & operator=(matrix_expression<const compressed_matrix, const compressed_matrix, op_prod> const & proxy)
+ {
+ assert( (rows_ == 0 || rows_ == proxy.lhs().size1()) && bool("Size mismatch") );
+ assert( (cols_ == 0 || cols_ == proxy.rhs().size2()) && bool("Size mismatch") );
+
+ viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), *this);
+ generate_row_block_information();
+
+ return *this;
+ }
+
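+ // A minimal sketch of how this assignment is typically used (hypothetical
+ // objects A and B; assumes viennacl/linalg/prod.hpp is included):
+ //
+ //   viennacl::compressed_matrix<double> A, B;   // assumed to be populated elsewhere
+ //   viennacl::compressed_matrix<double> C;
+ //   C = viennacl::linalg::prod(A, B);           // sparse-sparse product, C = A * B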
+
+ /** @brief Sets the row, column and value arrays of the compressed matrix
+ *
+ * Type of row_jumper and col_buffer is 'unsigned int' for CUDA and OpenMP (host) backend, but *must* be cl_uint for OpenCL.
+ * The reason is that 'unsigned int' might have a different bit representation on the host than 'unsigned int' on the OpenCL device.
+ * cl_uint is guaranteed to have the correct bit representation for OpenCL devices.
+ *
+ * @param row_jumper Pointer to an array holding the indices of the first element of each row (starting with zero). E.g. row_jumper[10] returns the index of the first entry of the 11th row. The array length is 'rows + 1'
+ * @param col_buffer Pointer to an array holding the column index of each entry. The array length is 'nonzeros'
+ * @param elements Pointer to an array holding the entries of the sparse matrix. The array length is 'nonzeros'
+ * @param rows Number of rows of the sparse matrix
+ * @param cols Number of columns of the sparse matrix
+ * @param nonzeros Number of nonzeros
+ */
+ void set(const void * row_jumper,
+ const void * col_buffer,
+ const NumericT * elements,
+ vcl_size_t rows,
+ vcl_size_t cols,
+ vcl_size_t nonzeros)
+ {
+ assert( (rows > 0) && bool("Error in compressed_matrix::set(): Number of rows must be larger than zero!"));
+ assert( (cols > 0) && bool("Error in compressed_matrix::set(): Number of columns must be larger than zero!"));
+ assert( (nonzeros > 0) && bool("Error in compressed_matrix::set(): Number of nonzeros must be larger than zero!"));
+ //std::cout << "Setting memory: " << cols + 1 << ", " << nonzeros << std::endl;
+
+ //row_buffer_.switch_active_handle_id(viennacl::backend::OPENCL_MEMORY);
+ viennacl::backend::memory_create(row_buffer_, viennacl::backend::typesafe_host_array<unsigned int>(row_buffer_).element_size() * (rows + 1), viennacl::traits::context(row_buffer_), row_jumper);
+
+ //col_buffer_.switch_active_handle_id(viennacl::backend::OPENCL_MEMORY);
+ viennacl::backend::memory_create(col_buffer_, viennacl::backend::typesafe_host_array<unsigned int>(col_buffer_).element_size() * nonzeros, viennacl::traits::context(col_buffer_), col_buffer);
+
+ //elements_.switch_active_handle_id(viennacl::backend::OPENCL_MEMORY);
+ viennacl::backend::memory_create(elements_, sizeof(NumericT) * nonzeros, viennacl::traits::context(elements_), elements);
+
+ nonzeros_ = nonzeros;
+ rows_ = rows;
+ cols_ = cols;
+
+ //generate block information for CSR-adaptive:
+ generate_row_block_information();
+ }
+
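+ // A minimal sketch of the set() interface above (hypothetical host arrays;
+ // host/OpenMP backend assumed - for OpenCL the index arrays must be cl_uint).
+ // For the 3x3 matrix
+ //   [1 0 2]
+ //   [0 3 0]
+ //   [4 0 5]
+ // the CSR arrays and the call would look like:
+ //
+ //   unsigned int row_jumper[4] = {0, 2, 3, 5};     // rows + 1 entries
+ //   unsigned int columns[5]    = {0, 2, 1, 0, 2};  // column index of each nonzero
+ //   double       values[5]     = {1.0, 2.0, 3.0, 4.0, 5.0};
+ //
+ //   viennacl::compressed_matrix<double> mat(3, 3, viennacl::context(viennacl::MAIN_MEMORY));
+ //   mat.set(row_jumper, columns, values, 3, 3, 5);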
+ /** @brief Allocate memory for the supplied number of nonzeros in the matrix. Old values are preserved. */
+ void reserve(vcl_size_t new_nonzeros, bool preserve = true)
+ {
+ if (new_nonzeros > nonzeros_)
+ {
+ if (preserve)
+ {
+ handle_type col_buffer_old;
+ handle_type elements_old;
+ viennacl::backend::memory_shallow_copy(col_buffer_, col_buffer_old);
+ viennacl::backend::memory_shallow_copy(elements_, elements_old);
+
+ viennacl::backend::typesafe_host_array<unsigned int> size_deducer(col_buffer_);
+ viennacl::backend::memory_create(col_buffer_, size_deducer.element_size() * new_nonzeros, viennacl::traits::context(col_buffer_));
+ viennacl::backend::memory_create(elements_, sizeof(NumericT) * new_nonzeros, viennacl::traits::context(elements_));
+
+ viennacl::backend::memory_copy(col_buffer_old, col_buffer_, 0, 0, size_deducer.element_size() * nonzeros_);
+ viennacl::backend::memory_copy(elements_old, elements_, 0, 0, sizeof(NumericT)* nonzeros_);
+ }
+ else
+ {
+ viennacl::backend::typesafe_host_array<unsigned int> size_deducer(col_buffer_);
+ viennacl::backend::memory_create(col_buffer_, size_deducer.element_size() * new_nonzeros, viennacl::traits::context(col_buffer_));
+ viennacl::backend::memory_create(elements_, sizeof(NumericT) * new_nonzeros, viennacl::traits::context(elements_));
+ }
+
+ nonzeros_ = new_nonzeros;
+ }
+ }
+
+ /** @brief Resize the matrix.
+ *
+ * @param new_size1 New number of rows
+ * @param new_size2 New number of columns
+ * @param preserve If true, the old values are preserved (entries outside the new size are discarded); if false, the resized matrix is left empty.
+ */
+ void resize(vcl_size_t new_size1, vcl_size_t new_size2, bool preserve = true)
+ {
+ assert(new_size1 > 0 && new_size2 > 0 && bool("Cannot resize to zero size!"));
+
+ if (new_size1 != rows_ || new_size2 != cols_)
+ {
+ if (!preserve)
+ {
+ viennacl::backend::typesafe_host_array<unsigned int> host_row_buffer(row_buffer_, new_size1 + 1);
+ viennacl::backend::memory_create(row_buffer_, viennacl::backend::typesafe_host_array<unsigned int>().element_size() * (new_size1 + 1), viennacl::traits::context(row_buffer_), host_row_buffer.get());
+ // faster version without initializing memory:
+ //viennacl::backend::memory_create(row_buffer_, viennacl::backend::typesafe_host_array<unsigned int>().element_size() * (new_size1 + 1), viennacl::traits::context(row_buffer_));
+ nonzeros_ = 0;
+ }
+ else
+ {
+ std::vector<std::map<unsigned int, NumericT> > stl_sparse_matrix;
+ if (rows_ > 0)
+ {
+ stl_sparse_matrix.resize(rows_);
+ viennacl::copy(*this, stl_sparse_matrix);
+ } else {
+ stl_sparse_matrix.resize(new_size1);
+ stl_sparse_matrix[0][0] = 0; //enforces nonzero array sizes if matrix was initially empty
+ }
+
+ stl_sparse_matrix.resize(new_size1);
+
+ //discard entries with column index larger than new_size2
+ if (new_size2 < cols_ && rows_ > 0)
+ {
+ for (vcl_size_t i=0; i<stl_sparse_matrix.size(); ++i)
+ {
+ std::list<unsigned int> to_delete;
+ for (typename std::map<unsigned int, NumericT>::iterator it = stl_sparse_matrix[i].begin();
+ it != stl_sparse_matrix[i].end();
+ ++it)
+ {
+ if (it->first >= new_size2)
+ to_delete.push_back(it->first);
+ }
+
+ for (std::list<unsigned int>::iterator it = to_delete.begin(); it != to_delete.end(); ++it)
+ stl_sparse_matrix[i].erase(*it);
+ }
+ }
+
+ viennacl::tools::sparse_matrix_adapter<NumericT> adapted_matrix(stl_sparse_matrix, new_size1, new_size2);
+ rows_ = new_size1;
+ cols_ = new_size2;
+ viennacl::copy(adapted_matrix, *this);
+ }
+
+ rows_ = new_size1;
+ cols_ = new_size2;
+ }
+ }
+
+ /** @brief Resets all entries in the matrix back to zero without changing the matrix size. Resets the sparsity pattern. */
+ void clear()
+ {
+ viennacl::backend::typesafe_host_array<unsigned int> host_row_buffer(row_buffer_, rows_ + 1);
+ viennacl::backend::typesafe_host_array<unsigned int> host_col_buffer(col_buffer_, 1);
+ std::vector<NumericT> host_elements(1);
+
+ viennacl::backend::memory_create(row_buffer_, host_row_buffer.element_size() * (rows_ + 1), viennacl::traits::context(row_buffer_), host_row_buffer.get());
+ viennacl::backend::memory_create(col_buffer_, host_col_buffer.element_size() * 1, viennacl::traits::context(col_buffer_), host_col_buffer.get());
+ viennacl::backend::memory_create(elements_, sizeof(NumericT) * 1, viennacl::traits::context(elements_), &(host_elements[0]));
+
+ nonzeros_ = 0;
+ }
+
+ /** @brief Returns a reference to the (i,j)-th entry of the sparse matrix. If (i,j) does not exist (zero), it is inserted (slow!) */
+ entry_proxy<NumericT> operator()(vcl_size_t i, vcl_size_t j)
+ {
+ assert( (i < rows_) && (j < cols_) && bool("compressed_matrix access out of bounds!"));
+
+ vcl_size_t index = element_index(i, j);
+
+ // check for element in sparsity pattern
+ if (index < nonzeros_)
+ return entry_proxy<NumericT>(index, elements_);
+
+ // Element not found. Copying required. Very slow, but direct entry manipulation is painful anyway...
+ std::vector< std::map<unsigned int, NumericT> > cpu_backup(rows_);
+ tools::sparse_matrix_adapter<NumericT> adapted_cpu_backup(cpu_backup, rows_, cols_);
+ viennacl::copy(*this, adapted_cpu_backup);
+ cpu_backup[i][static_cast<unsigned int>(j)] = 0.0;
+ viennacl::copy(adapted_cpu_backup, *this);
+
+ index = element_index(i, j);
+
+ assert(index < nonzeros_);
+
+ return entry_proxy<NumericT>(index, elements_);
+ }
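+
+ // A minimal sketch of direct entry manipulation (hypothetical objects; assumes
+ // the std::vector< std::map<...> > copy() overload provided elsewhere in this file):
+ //
+ //   std::vector< std::map<unsigned int, double> > host_mat(4);
+ //   host_mat[0][0] = 1.0; host_mat[2][3] = 2.0;
+ //   viennacl::compressed_matrix<double> M(4, 4);
+ //   viennacl::copy(host_mat, M);
+ //   M(2, 3) = 2.5;   // entry already in the sparsity pattern: cheap update
+ //   M(1, 1) = 4.0;   // new entry: triggers the slow host round-trip described above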
+
+ /** @brief Returns the number of rows */
+ const vcl_size_t & size1() const { return rows_; }
+ /** @brief Returns the number of columns */
+ const vcl_size_t & size2() const { return cols_; }
+ /** @brief Returns the number of nonzero entries */
+ const vcl_size_t & nnz() const { return nonzeros_; }
+ /** @brief Returns the internal number of row blocks for an adaptive SpMV */
+ const vcl_size_t & blocks1() const { return row_block_num_; }
+
+ /** @brief Returns the memory handle to the row index array */
+ const handle_type & handle1() const { return row_buffer_; }
+ /** @brief Returns the memory handle to the column index array */
+ const handle_type & handle2() const { return col_buffer_; }
+ /** @brief Returns the memory handle to the row block array */
+ const handle_type & handle3() const { return row_blocks_; }
+ /** @brief Returns the memory handle to the matrix entry array */
+ const handle_type & handle() const { return elements_; }
+
+ /** @brief Returns the memory handle to the row index array */
+ handle_type & handle1() { return row_buffer_; }
+ /** @brief Returns the memory handle to the column index array */
+ handle_type & handle2() { return col_buffer_; }
+ /** @brief Returns the memory handle to the row block array */
+ handle_type & handle3() { return row_blocks_; }
+ /** @brief Returns the memory handle to the matrix entry array */
+ handle_type & handle() { return elements_; }
+
+ /** @brief Switches the memory context of the matrix.
+ *
+ * Allows e.g. a migration of the full matrix from OpenCL memory to host memory, for instance in order to compute a preconditioner on the host.
+ */
+ void switch_memory_context(viennacl::context new_ctx)
+ {
+ viennacl::backend::switch_memory_context<unsigned int>(row_buffer_, new_ctx);
+ viennacl::backend::switch_memory_context<unsigned int>(col_buffer_, new_ctx);
+ viennacl::backend::switch_memory_context<unsigned int>(row_blocks_, new_ctx);
+ viennacl::backend::switch_memory_context<NumericT>(elements_, new_ctx);
+ }
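+
+ // A minimal sketch of the context switch described above (hypothetical matrix A):
+ //
+ //   viennacl::compressed_matrix<double> A;   // e.g. currently residing in OpenCL memory
+ //   A.switch_memory_context(viennacl::context(viennacl::MAIN_MEMORY));   // migrate to host RAM
+ //   // ... set up a preconditioner on the host ...
+ //   A.switch_memory_context(viennacl::context(viennacl::OPENCL_MEMORY)); // move back if desired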
+
+ /** @brief Returns the current memory context to determine whether the matrix is set up for OpenMP, OpenCL, or CUDA. */
+ viennacl::memory_types memory_context() const
+ {
+ return row_buffer_.get_active_handle_id();
+ }
+
+private:
+
+ /** @brief Helper function for accessing the element (i,j) of the matrix. */
+ vcl_size_t element_index(vcl_size_t i, vcl_size_t j)
+ {
+ //read row indices
+ viennacl::backend::typesafe_host_array<unsigned int> row_indices(row_buffer_, 2);
+ viennacl::backend::memory_read(row_buffer_, row_indices.element_size()*i, row_indices.element_size()*2, row_indices.get());
+
+ //get column indices for row i:
+ viennacl::backend::typesafe_host_array<unsigned int> col_indices(col_buffer_, row_indices[1] - row_indices[0]);
+ viennacl::backend::memory_read(col_buffer_, col_indices.element_size()*row_indices[0], row_indices.element_size()*col_indices.size(), col_indices.get());
+
+ for (vcl_size_t k=0; k<col_indices.size(); ++k)
+ {
+ if (col_indices[k] == j)
+ return row_indices[0] + k;
+ }
+
+ // if not found, return index past the end of the matrix (cf. matrix.end() in the spirit of the STL)
+ return nonzeros_;
+ }
+
+public:
+ /** @brief Builds the row block information needed for fast sparse matrix-vector multiplications.
+ *
+ * Required when manually populating the memory buffers with values. Not necessary when using viennacl::copy() or .set()
+ */
+ void generate_row_block_information()
+ {
+ viennacl::backend::typesafe_host_array<unsigned int> row_buffer(row_buffer_, rows_ + 1);
+ viennacl::backend::memory_read(row_buffer_, 0, row_buffer.raw_size(), row_buffer.get());
+
+ viennacl::backend::typesafe_host_array<unsigned int> row_blocks(row_buffer_, rows_ + 1);
+
+ vcl_size_t num_entries_in_current_batch = 0;
+
+ const vcl_size_t shared_mem_size = 1024; // number of column indices and floating point values loaded to shared memory
+
+ row_block_num_ = 0;
+ row_blocks.set(0, 0);
+ for (vcl_size_t i=0; i<rows_; ++i)
+ {
+ vcl_size_t entries_in_row = vcl_size_t(row_buffer[i+1]) - vcl_size_t(row_buffer[i]);
+ num_entries_in_current_batch += entries_in_row;
+
+ if (num_entries_in_current_batch > shared_mem_size)
+ {
+ vcl_size_t rows_in_batch = i - row_blocks[row_block_num_];
+ if (rows_in_batch > 0) // at least one full row is in the batch. Use current row in next batch.
+ row_blocks.set(++row_block_num_, i--);
+ else // row is larger than buffer in shared memory
+ row_blocks.set(++row_block_num_, i+1);
+ num_entries_in_current_batch = 0;
+ }
+ }
+ if (num_entries_in_current_batch > 0)
+ row_blocks.set(++row_block_num_, rows_);
+
+ if (row_block_num_ > 0) //matrix might be empty...
+ viennacl::backend::memory_create(row_blocks_,
+ row_blocks.element_size() * (row_block_num_ + 1),
+ viennacl::traits::context(row_buffer_), row_blocks.get());
+
+ }
+
+private:
+ // /** @brief Copy constructor is currently not available. */
+ //compressed_matrix(compressed_matrix const &);
+
+private:
+
+ vcl_size_t rows_;
+ vcl_size_t cols_;
+ vcl_size_t nonzeros_;
+ vcl_size_t row_block_num_;
+ handle_type row_buffer_;
+ handle_type row_blocks_;
+ handle_type col_buffer_;
+ handle_type elements_;
+};
+
+/** @brief Output stream support for compressed_matrix. The output format is the same as that of MATLAB, Octave, and SciPy
+ *
+ * @param os STL output stream
+ * @param A The compressed matrix to be printed.
+*/
+template<typename NumericT, unsigned int AlignmentV>
+std::ostream & operator<<(std::ostream & os, compressed_matrix<NumericT, AlignmentV> const & A)
+{
+ std::vector<std::map<unsigned int, NumericT> > tmp(A.size1());
+ viennacl::copy(A, tmp);
+ os << "compressed_matrix of size (" << A.size1() << ", " << A.size2() << ") with " << A.nnz() << " nonzeros:" << std::endl;
+
+ for (vcl_size_t i=0; i<A.size1(); ++i)
+ {
+ for (typename std::map<unsigned int, NumericT>::const_iterator it = tmp[i].begin(); it != tmp[i].end(); ++it)
+ os << " (" << i << ", " << it->first << ")\t" << it->second << std::endl;
+ }
+ return os;
+}
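+
+// A minimal sketch of the stream operator above (hypothetical matrix A):
+//
+//   viennacl::compressed_matrix<float> A;   // assumed to be populated
+//   std::cout << A << std::endl;            // prints size, nnz, and one "(row, col)  value" line per entry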
+
+//
+// Specify available operations:
+//
+
+/** \cond */
+
+namespace linalg
+{
+namespace detail
+{
+ // x = A * y
+ template<typename T, unsigned int A>
+ struct op_executor<vector_base<T>, op_assign, vector_expression<const compressed_matrix<T, A>, const vector_base<T>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const compressed_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+ {
+ // check for the special case x = A * x
+ if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+ {
+ viennacl::vector<T> temp(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+ lhs = temp;
+ }
+ else
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), lhs, T(0));
+ }
+ };
+
+ template<typename T, unsigned int A>
+ struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const compressed_matrix<T, A>, const vector_base<T>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const compressed_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+ {
+ // check for the special case x += A * x
+ if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+ {
+ viennacl::vector<T> temp(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+ lhs += temp;
+ }
+ else
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), lhs, T(1));
+ }
+ };
+
+ template<typename T, unsigned int A>
+ struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const compressed_matrix<T, A>, const vector_base<T>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const compressed_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+ {
+ // check for the special case x -= A * x
+ if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+ {
+ viennacl::vector<T> temp(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+ lhs -= temp;
+ }
+ else
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(-1), lhs, T(1));
+ }
+ };
+
+
+ // x = A * vec_op
+ template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+ struct op_executor<vector_base<T>, op_assign, vector_expression<const compressed_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const compressed_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+ {
+ viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+ viennacl::linalg::prod_impl(rhs.lhs(), temp, lhs);
+ }
+ };
+
+ // x = A * vec_op
+ template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+ struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const compressed_matrix<T, A>, vector_expression<const LHS, const RHS, OP>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const compressed_matrix<T, A>, vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+ {
+ viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+ viennacl::vector<T> temp_result(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+ lhs += temp_result;
+ }
+ };
+
+ // x = A * vec_op
+ template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+ struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const compressed_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const compressed_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+ {
+ viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+ viennacl::vector<T> temp_result(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+ lhs -= temp_result;
+ }
+ };
+
+} // namespace detail
+} // namespace linalg
+ /** \endcond */
+}
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/context.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/context.hpp b/native-viennaCL/src/main/cpp/viennacl/context.hpp
new file mode 100644
index 0000000..ed00c39
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/context.hpp
@@ -0,0 +1,88 @@
+#ifndef VIENNACL_CONTEXT_HPP_
+#define VIENNACL_CONTEXT_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/context.hpp
+ @brief Implementation of an OpenCL-like context, which serves as a unification of {OpenMP, CUDA, OpenCL} at the user API.
+*/
+
+#include <vector>
+#include <stddef.h>
+#include <assert.h>
+#include "viennacl/forwards.h"
+#include "viennacl/ocl/forwards.h"
+#include "viennacl/backend/mem_handle.hpp"
+
+namespace viennacl
+{
+/** @brief Represents a generic 'context' similar to an OpenCL context, but is backend-agnostic and thus also suitable for CUDA and OpenMP
+ *
+ * Context objects are used to distinguish between different memory domains. One context may refer to an OpenCL device, another context may refer to a CUDA device, and a third context to main RAM.
+ * Thus, operations are only defined on objects residing on the same context.
+ */
+class context
+{
+public:
+ context() : mem_type_(viennacl::backend::default_memory_type())
+ {
+#ifdef VIENNACL_WITH_OPENCL
+ if (mem_type_ == OPENCL_MEMORY)
+ ocl_context_ptr_ = &viennacl::ocl::current_context();
+ else
+ ocl_context_ptr_ = NULL;
+#endif
+ }
+
+ explicit context(viennacl::memory_types mtype) : mem_type_(mtype)
+ {
+ if (mem_type_ == MEMORY_NOT_INITIALIZED)
+ mem_type_ = viennacl::backend::default_memory_type();
+#ifdef VIENNACL_WITH_OPENCL
+ if (mem_type_ == OPENCL_MEMORY)
+ ocl_context_ptr_ = &viennacl::ocl::current_context();
+ else
+ ocl_context_ptr_ = NULL;
+#endif
+ }
+
+#ifdef VIENNACL_WITH_OPENCL
+ context(viennacl::ocl::context const & ctx) : mem_type_(OPENCL_MEMORY), ocl_context_ptr_(&ctx) {}
+
+ viennacl::ocl::context const & opencl_context() const
+ {
+ assert(mem_type_ == OPENCL_MEMORY && bool("Context type is not OpenCL"));
+ return *ocl_context_ptr_;
+ }
+#endif
+
+ // TODO: Add CUDA and OpenMP contexts
+
+ viennacl::memory_types memory_type() const { return mem_type_; }
+
+private:
+ viennacl::memory_types mem_type_;
+#ifdef VIENNACL_WITH_OPENCL
+ viennacl::ocl::context const * ocl_context_ptr_;
+#endif
+};
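+
+// A minimal sketch of how contexts select the memory domain for new objects
+// (hypothetical object names):
+//
+//   viennacl::context host_ctx(viennacl::MAIN_MEMORY);           // plain host (OpenMP) memory
+//   viennacl::vector<double> x(100, host_ctx);                   // vector allocated in host RAM
+//   viennacl::compressed_matrix<double> A(100, 100, host_ctx);   // sparse matrix in the same domain
+//
+// Operations such as prod(A, x) are only defined if both operands reside in the same context.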
+
+
+}
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/amg.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/amg.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/amg.hpp
new file mode 100644
index 0000000..0b81203
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/amg.hpp
@@ -0,0 +1,398 @@
+#ifndef VIENNACL_LINALG_AMG_HPP_
+#define VIENNACL_LINALG_AMG_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/amg.hpp
+ @brief Main include file for algebraic multigrid (AMG) preconditioners. Experimental.
+
+ Implementation contributed by Markus Wagner
+*/
+
+#include <vector>
+#include <cmath>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/compressed_matrix.hpp"
+
+#include "viennacl/linalg/detail/amg/amg_base.hpp"
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+#include "viennacl/linalg/amg_operations.hpp"
+#include "viennacl/tools/timer.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/lu.hpp"
+
+#include <map>
+
+#ifdef VIENNACL_WITH_OPENMP
+ #include <omp.h>
+#endif
+
+#define VIENNACL_AMG_MAX_LEVELS 20
+
+namespace viennacl
+{
+namespace linalg
+{
+
+class amg_coarse_problem_too_large_exception : public std::runtime_error
+{
+public:
+ amg_coarse_problem_too_large_exception(std::string const & msg, vcl_size_t num_points) : std::runtime_error(msg), c_points_(num_points) {}
+
+ /** @brief Returns the number of coarse points for which no further coarsening could be applied */
+ vcl_size_t coarse_points() const { return c_points_; }
+
+private:
+ vcl_size_t c_points_;
+};
+
+
+namespace detail
+{
+ /** @brief Sparse Galerkin product: Calculates A_coarse = trans(P)*A_fine*P = R*A_fine*P
+ *
+ * @param A_fine Operator matrix on fine grid (quadratic)
+ * @param P Prolongation/Interpolation matrix
+ * @param R Restriction matrix
+ * @param A_coarse Result matrix on coarse grid (Galerkin operator)
+ */
+ template<typename NumericT>
+ void amg_galerkin_prod(compressed_matrix<NumericT> & A_fine,
+ compressed_matrix<NumericT> & P,
+ compressed_matrix<NumericT> & R, //P^T
+ compressed_matrix<NumericT> & A_coarse)
+ {
+
+ compressed_matrix<NumericT> A_fine_times_P(viennacl::traits::context(A_fine));
+
+ // transpose P in memory (no known way of efficiently multiplying P^T * B for CSR-matrices P and B):
+ viennacl::linalg::detail::amg::amg_transpose(P, R);
+
+ // compute Galerkin product using a temporary for the result of A_fine * P
+ A_fine_times_P = viennacl::linalg::prod(A_fine, P);
+ A_coarse = viennacl::linalg::prod(R, A_fine_times_P);
+
+ }
+
+
+ /** @brief Setup AMG preconditioner
+ *
+ * @param list_of_A Operator matrices on all levels
+ * @param list_of_P Prolongation/Interpolation operators on all levels
+ * @param list_of_R Restriction operators on all levels
+ * @param list_of_amg_level_context Auxiliary data structures for managing the grid hierarchy (coarse nodes, etc.)
+ * @param tag AMG preconditioner tag
+ */
+ template<typename NumericT, typename AMGContextListT>
+ vcl_size_t amg_setup(std::vector<compressed_matrix<NumericT> > & list_of_A,
+ std::vector<compressed_matrix<NumericT> > & list_of_P,
+ std::vector<compressed_matrix<NumericT> > & list_of_R,
+ AMGContextListT & list_of_amg_level_context,
+ amg_tag & tag)
+ {
+ // Set number of iterations. If automatic coarse grid construction is chosen (0), then set a maximum size and stop during the process.
+ vcl_size_t iterations = tag.get_coarse_levels();
+ if (iterations == 0)
+ iterations = VIENNACL_AMG_MAX_LEVELS;
+
+ for (vcl_size_t i=0; i<iterations; ++i)
+ {
+ list_of_amg_level_context[i].switch_context(tag.get_setup_context());
+ list_of_amg_level_context[i].resize(list_of_A[i].size1(), list_of_A[i].nnz());
+
+ // Construct C and F points on coarse level (i is fine level, i+1 coarse level).
+ detail::amg::amg_coarse(list_of_A[i], list_of_amg_level_context[i], tag);
+
+ // Calculate number of C and F points on level i.
+ unsigned int c_points = list_of_amg_level_context[i].num_coarse_;
+ unsigned int f_points = static_cast<unsigned int>(list_of_A[i].size1()) - c_points;
+
+ if (f_points == 0 && c_points > tag.get_coarsening_cutoff())
+ {
+ std::stringstream ss;
+ ss << "No further coarsening possible (" << c_points << " coarse points). Consider changing the strong connection threshold or increasing the coarsening cutoff." << std::endl;
+ throw amg_coarse_problem_too_large_exception(ss.str(), c_points);
+ }
+
+ // Stop routine when the maximal coarse level is found (no C or F point). Coarsest level is level i.
+ if (c_points == 0 || f_points == 0)
+ break;
+
+ // Construct interpolation matrix for level i.
+ detail::amg::amg_interpol(list_of_A[i], list_of_P[i], list_of_amg_level_context[i], tag);
+
+ // Compute coarse grid operator (A[i+1] = R * A[i] * P) with R = trans(P).
+ amg_galerkin_prod(list_of_A[i], list_of_P[i], list_of_R[i], list_of_A[i+1]);
+
+ // send matrices to target context:
+ list_of_A[i].switch_memory_context(tag.get_target_context());
+ list_of_P[i].switch_memory_context(tag.get_target_context());
+ list_of_R[i].switch_memory_context(tag.get_target_context());
+
+ // If the limit of coarse points is reached, stop. Coarsest level is level i+1.
+ if (tag.get_coarse_levels() == 0 && c_points <= tag.get_coarsening_cutoff())
+ return i+1;
+ }
+
+ return iterations;
+ }
+
+
+ /** @brief Initialize AMG preconditioner
+ *
+ * @param mat System matrix
+ * @param list_of_A Operator matrices on all levels
+ * @param list_of_P Prolongation/Interpolation operators on all levels
+ * @param list_of_R Restriction operators on all levels
+ * @param list_of_amg_level_context Auxiliary data structures for managing the grid hierarchy (coarse nodes, etc.)
+ * @param tag AMG preconditioner tag
+ */
+ template<typename MatrixT, typename InternalT1, typename InternalT2>
+ void amg_init(MatrixT const & mat, InternalT1 & list_of_A, InternalT1 & list_of_P, InternalT1 & list_of_R, InternalT2 & list_of_amg_level_context, amg_tag & tag)
+ {
+ typedef typename InternalT1::value_type SparseMatrixType;
+
+ vcl_size_t num_levels = (tag.get_coarse_levels() > 0) ? tag.get_coarse_levels() : VIENNACL_AMG_MAX_LEVELS;
+
+ list_of_A.resize(num_levels+1, SparseMatrixType(tag.get_setup_context()));
+ list_of_P.resize(num_levels, SparseMatrixType(tag.get_setup_context()));
+ list_of_R.resize(num_levels, SparseMatrixType(tag.get_setup_context()));
+ list_of_amg_level_context.resize(num_levels);
+
+ // Insert operator matrix as operator for finest level.
+ //SparseMatrixType A0(mat);
+ //A.insert_element(0, A0);
+ list_of_A[0].switch_memory_context(viennacl::traits::context(mat));
+ list_of_A[0] = mat;
+ list_of_A[0].switch_memory_context(tag.get_setup_context());
+ }
+
+ /** @brief Setup data structures for precondition phase for later use on the GPU
+ *
+ * @param result Result vector on all levels
+ * @param result_backup Copy of result vector on all levels
+ * @param rhs RHS vector on all levels
+ * @param residual Residual vector on all levels
+ * @param A Operator matrices on all levels from the setup phase
+ * @param coarse_levels Number of coarse levels for which the data structures should be set up.
+ * @param tag AMG preconditioner tag
+ */
+ template<typename InternalVectorT, typename SparseMatrixT>
+ void amg_setup_apply(InternalVectorT & result,
+ InternalVectorT & result_backup,
+ InternalVectorT & rhs,
+ InternalVectorT & residual,
+ SparseMatrixT const & A,
+ vcl_size_t coarse_levels,
+ amg_tag const & tag)
+ {
+ typedef typename InternalVectorT::value_type VectorType;
+
+ result.resize(coarse_levels + 1);
+ result_backup.resize(coarse_levels + 1);
+ rhs.resize(coarse_levels + 1);
+ residual.resize(coarse_levels);
+
+ for (vcl_size_t level=0; level <= coarse_levels; ++level)
+ {
+ result[level] = VectorType(A[level].size1(), tag.get_target_context());
+ result_backup[level] = VectorType(A[level].size1(), tag.get_target_context());
+ rhs[level] = VectorType(A[level].size1(), tag.get_target_context());
+ }
+ for (vcl_size_t level=0; level < coarse_levels; ++level)
+ {
+ residual[level] = VectorType(A[level].size1(), tag.get_target_context());
+ }
+ }
+
+
+ /** @brief Pre-computes the LU factorization of the coarsest-level operator for the direct coarse solve.
+ *
+ * Speeds up precondition phase as this is computed only once overall instead of once per iteration.
+ *
+ * @param op Operator matrix for direct solve
+ * @param A Operator matrix on coarsest level
+ * @param tag AMG preconditioner tag
+ */
+ template<typename NumericT, typename SparseMatrixT>
+ void amg_lu(viennacl::matrix<NumericT> & op,
+ SparseMatrixT const & A,
+ amg_tag const & tag)
+ {
+ op.switch_memory_context(tag.get_setup_context());
+ op.resize(A.size1(), A.size2(), false);
+ viennacl::linalg::detail::amg::assign_to_dense(A, op);
+
+ viennacl::linalg::lu_factorize(op);
+ op.switch_memory_context(tag.get_target_context());
+ }
+
+}
+
+/** @brief AMG preconditioner class, can be supplied to solve()-routines
+*/
+template<typename MatrixT>
+class amg_precond;
+
+
+/** @brief AMG preconditioner class, can be supplied to solve()-routines.
+*
+* Specialization for compressed_matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+class amg_precond< compressed_matrix<NumericT, AlignmentV> >
+{
+ typedef viennacl::compressed_matrix<NumericT, AlignmentV> SparseMatrixType;
+ typedef viennacl::vector<NumericT> VectorType;
+ typedef detail::amg::amg_level_context AMGContextType;
+
+public:
+
+ amg_precond() {}
+
+ /** @brief The constructor. Builds data structures.
+ *
+ * @param mat System matrix
+ * @param tag The AMG tag
+ */
+ amg_precond(compressed_matrix<NumericT, AlignmentV> const & mat,
+ amg_tag const & tag)
+ {
+ tag_ = tag;
+
+ // Initialize data structures.
+ detail::amg_init(mat, A_list_, P_list_, R_list_, amg_context_list_, tag_);
+ }
+
+ /** @brief Start setup phase for this class and copy data structures.
+ */
+ void setup()
+ {
+ // Start setup phase.
+ vcl_size_t num_coarse_levels = detail::amg_setup(A_list_, P_list_, R_list_, amg_context_list_, tag_);
+
+ // Setup precondition phase (Data structures).
+ detail::amg_setup_apply(result_list_, result_backup_list_, rhs_list_, residual_list_, A_list_, num_coarse_levels, tag_);
+
+ // LU factorization for direct solve.
+ detail::amg_lu(coarsest_op_, A_list_[num_coarse_levels], tag_);
+ }
+
+
+ /** @brief Applies the preconditioner to a vector (precondition operation)
+ *
+ * @param vec The vector to which the preconditioner is applied
+ */
+ template<typename VectorT>
+ void apply(VectorT & vec) const
+ {
+ vcl_size_t level;
+
+ // Precondition operation (Yang, p.3).
+ rhs_list_[0] = vec;
+
+ // Part 1: Restrict down to coarsest level
+ for (level=0; level < residual_list_.size(); level++)
+ {
+ result_list_[level].clear();
+
+ // Apply Smoother presmooth_ times.
+ viennacl::linalg::detail::amg::smooth_jacobi(static_cast<unsigned int>(tag_.get_presmooth_steps()),
+ A_list_[level],
+ result_list_[level],
+ result_backup_list_[level],
+ rhs_list_[level],
+ static_cast<NumericT>(tag_.get_jacobi_weight()));
+
+ // Compute residual.
+ //residual[level] = rhs_[level] - viennacl::linalg::prod(A_[level], result_[level]);
+ residual_list_[level] = viennacl::linalg::prod(A_list_[level], result_list_[level]);
+ residual_list_[level] = rhs_list_[level] - residual_list_[level];
+
+ // Restrict to coarse level. Result is RHS of coarse level equation.
+ //residual_coarse[level] = viennacl::linalg::prod(R[level],residual[level]);
+ rhs_list_[level+1] = viennacl::linalg::prod(R_list_[level], residual_list_[level]);
+ }
+
+ // Part 2: On the coarsest level, solve the coarse-level equation with a direct solve (on the CPU)
+ result_list_[level] = rhs_list_[level];
+ viennacl::linalg::lu_substitute(coarsest_op_, result_list_[level]);
+
+ // Part 3: Prolongation to finest level
+ for (int level2 = static_cast<int>(residual_list_.size()-1); level2 >= 0; level2--)
+ {
+ level = static_cast<vcl_size_t>(level2);
+
+ // Interpolate error to fine level and correct solution.
+ result_backup_list_[level] = viennacl::linalg::prod(P_list_[level], result_list_[level+1]);
+ result_list_[level] += result_backup_list_[level];
+
+ // Apply Smoother postsmooth_ times.
+ viennacl::linalg::detail::amg::smooth_jacobi(static_cast<unsigned int>(tag_.get_postsmooth_steps()),
+ A_list_[level],
+ result_list_[level],
+ result_backup_list_[level],
+ rhs_list_[level],
+ static_cast<NumericT>(tag_.get_jacobi_weight()));
+ }
+ vec = result_list_[0];
+ }
+
+ /** @brief Returns the total number of multigrid levels in the hierarchy including the finest level. */
+ vcl_size_t levels() const { return residual_list_.size(); }
+
+
+ /** @brief Returns the problem/operator size at the respective multigrid level
+ *
+ * @param level Index of the multigrid level. 0 is the finest level, levels() - 1 is the coarsest level.
+ */
+ vcl_size_t size(vcl_size_t level) const
+ {
+ assert(level < levels() && bool("Level index out of bounds!"));
+ return residual_list_[level].size();
+ }
+
+ /** @brief Returns the associated preconditioner tag containing the configuration for the multigrid preconditioner. */
+ amg_tag const & tag() const { return tag_; }
+
+private:
+ std::vector<SparseMatrixType> A_list_;
+ std::vector<SparseMatrixType> P_list_;
+ std::vector<SparseMatrixType> R_list_;
+ std::vector<AMGContextType> amg_context_list_;
+
+ viennacl::matrix<NumericT> coarsest_op_;
+
+ mutable std::vector<VectorType> result_list_;
+ mutable std::vector<VectorType> result_backup_list_;
+ mutable std::vector<VectorType> rhs_list_;
+ mutable std::vector<VectorType> residual_list_;
+
+ amg_tag tag_;
+};
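+
+// A minimal usage sketch for the preconditioner above (hedged: the cg.hpp include,
+// the default amg_tag settings, and the object names A and b are assumptions):
+//
+//   #include "viennacl/linalg/cg.hpp"
+//
+//   viennacl::compressed_matrix<double> A;   // assumed: assembled system matrix
+//   viennacl::vector<double> b;              // assumed: right-hand side
+//
+//   viennacl::linalg::amg_tag amg_config;    // default coarsening/smoothing configuration
+//   viennacl::linalg::amg_precond< viennacl::compressed_matrix<double> > precond(A, amg_config);
+//   try {
+//     precond.setup();   // builds the multigrid hierarchy (may throw if coarsening stalls)
+//   } catch (viennacl::linalg::amg_coarse_problem_too_large_exception const & e) {
+//     // adjust the tag (e.g. the coarsening cutoff) and retry
+//   }
+//   viennacl::vector<double> x = viennacl::linalg::solve(A, b, viennacl::linalg::cg_tag(), precond);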
+
+}
+}
+
+
+
+#endif
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/amg_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/amg_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/amg_operations.hpp
new file mode 100644
index 0000000..9c7f79f
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/amg_operations.hpp
@@ -0,0 +1,238 @@
+#ifndef VIENNACL_LINALG_AMG_OPERATIONS_HPP_
+#define VIENNACL_LINALG_AMG_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/amg_operations.hpp
+ @brief Implementations of operations for algebraic multigrid
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/detail/amg/amg_base.hpp"
+#include "viennacl/linalg/host_based/amg_operations.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+ #include "viennacl/linalg/opencl/amg_operations.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+ #include "viennacl/linalg/cuda/amg_operations.hpp"
+#endif
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace amg
+{
+
+template<typename NumericT, typename AMGContextT>
+void amg_influence(compressed_matrix<NumericT> const & A, AMGContextT & amg_context, amg_tag & tag)
+{
+ switch (viennacl::traits::handle(A).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::amg::amg_influence(A, amg_context, tag);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::amg::amg_influence(A, amg_context, tag);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::amg::amg_influence(A, amg_context, tag);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+
+template<typename NumericT, typename AMGContextT>
+void amg_coarse(compressed_matrix<NumericT> const & A, AMGContextT & amg_context, amg_tag & tag)
+{
+ switch (viennacl::traits::handle(A).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::amg::amg_coarse(A, amg_context, tag);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::amg::amg_coarse(A, amg_context, tag);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::amg::amg_coarse(A, amg_context, tag);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+
+template<typename NumericT, typename AMGContextT>
+void amg_interpol(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & P,
+ AMGContextT & amg_context,
+ amg_tag & tag)
+{
+ switch (viennacl::traits::handle(A).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::amg::amg_interpol(A, P, amg_context, tag);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::amg::amg_interpol(A, P, amg_context, tag);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::amg::amg_interpol(A, P, amg_context, tag);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+
+template<typename NumericT>
+void amg_transpose(compressed_matrix<NumericT> & A,
+ compressed_matrix<NumericT> & B)
+{
+ viennacl::context orig_ctx = viennacl::traits::context(A);
+ viennacl::context cpu_ctx(viennacl::MAIN_MEMORY);
+ (void)orig_ctx;
+ (void)cpu_ctx;
+
+ switch (viennacl::traits::handle(A).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::amg::amg_transpose(A, B);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ A.switch_memory_context(cpu_ctx);
+ B.switch_memory_context(cpu_ctx);
+ viennacl::linalg::host_based::amg::amg_transpose(A, B);
+ A.switch_memory_context(orig_ctx);
+ B.switch_memory_context(orig_ctx);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ A.switch_memory_context(cpu_ctx);
+ B.switch_memory_context(cpu_ctx);
+ viennacl::linalg::host_based::amg::amg_transpose(A, B);
+ A.switch_memory_context(orig_ctx);
+ B.switch_memory_context(orig_ctx);
+ //viennacl::linalg::cuda::amg_transpose(A, B);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+/** Assign sparse matrix A to dense matrix B */
+template<typename SparseMatrixType, typename NumericT>
+typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
+assign_to_dense(SparseMatrixType const & A,
+ viennacl::matrix_base<NumericT> & B)
+{
+ assert( (A.size1() == B.size1()) && bool("Size check failed for assignment to dense matrix: size1(A) != size1(B)"));
+ assert( (A.size2() == B.size2()) && bool("Size check failed for assignment to dense matrix: size2(A) != size2(B)"));
+
+ switch (viennacl::traits::handle(A).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::amg::assign_to_dense(A, B);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::amg::assign_to_dense(A, B);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::amg::assign_to_dense(A, B);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+template<typename NumericT>
+void smooth_jacobi(unsigned int iterations,
+ compressed_matrix<NumericT> const & A,
+ vector<NumericT> & x,
+ vector<NumericT> & x_backup,
+ vector<NumericT> const & rhs_smooth,
+ NumericT weight)
+{
+ switch (viennacl::traits::handle(A).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::amg::smooth_jacobi(iterations, A, x, x_backup, rhs_smooth, weight);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::amg::smooth_jacobi(iterations, A, x, x_backup, rhs_smooth, weight);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::amg::smooth_jacobi(iterations, A, x, x_backup, rhs_smooth, weight);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+} //namespace amg
+} //namespace detail
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/bicgstab.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/bicgstab.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/bicgstab.hpp
new file mode 100644
index 0000000..57bc89a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/bicgstab.hpp
@@ -0,0 +1,598 @@
+#ifndef VIENNACL_LINALG_BICGSTAB_HPP_
+#define VIENNACL_LINALG_BICGSTAB_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/bicgstab.hpp
+ @brief The stabilized bi-conjugate gradient method is implemented here
+*/
+
+#include <vector>
+#include <cmath>
+#include <numeric>
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/traits/clear.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/context.hpp"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/linalg/iterative_operations.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief A tag for the stabilized Bi-conjugate gradient solver. Used for supplying solver parameters and for dispatching the solve() function
+*/
+class bicgstab_tag
+{
+public:
+ /** @brief The constructor
+ *
+ * @param tol Relative tolerance for the residual (solver quits if ||r|| < tol * ||r_initial||)
+ * @param max_iters The maximum number of iterations
+ * @param max_iters_before_restart The maximum number of iterations before BiCGStab is reinitialized (to avoid accumulation of round-off errors)
+ */
+ bicgstab_tag(double tol = 1e-8, vcl_size_t max_iters = 400, vcl_size_t max_iters_before_restart = 200)
+ : tol_(tol), abs_tol_(0), iterations_(max_iters), iterations_before_restart_(max_iters_before_restart) {}
+
+ /** @brief Returns the relative tolerance */
+ double tolerance() const { return tol_; }
+
+ /** @brief Returns the absolute tolerance */
+ double abs_tolerance() const { return abs_tol_; }
+ /** @brief Sets the absolute tolerance */
+ void abs_tolerance(double new_tol) { if (new_tol >= 0) abs_tol_ = new_tol; }
+
+ /** @brief Returns the maximum number of iterations */
+ vcl_size_t max_iterations() const { return iterations_; }
+ /** @brief Returns the maximum number of iterations before a restart*/
+ vcl_size_t max_iterations_before_restart() const { return iterations_before_restart_; }
+
+ /** @brief Returns the number of solver iterations */
+ vcl_size_t iters() const { return iters_taken_; }
+ void iters(vcl_size_t i) const { iters_taken_ = i; }
+
+ /** @brief Returns the estimated relative error at the end of the solver run */
+ double error() const { return last_error_; }
+ /** @brief Sets the estimated relative error at the end of the solver run */
+ void error(double e) const { last_error_ = e; }
+
+private:
+ double tol_;
+ double abs_tol_;
+ vcl_size_t iterations_;
+ vcl_size_t iterations_before_restart_;
+
+ //return values from solver
+ mutable vcl_size_t iters_taken_;
+ mutable double last_error_;
+};
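+
+// A minimal sketch of how this tag is used with the solve() overloads defined later
+// in this file (hypothetical objects A and rhs):
+//
+//   viennacl::compressed_matrix<double> A;   // assumed: assembled system matrix
+//   viennacl::vector<double> rhs;            // assumed: right-hand side
+//
+//   viennacl::linalg::bicgstab_tag config(1e-8, 400, 200);  // rel. tolerance, max. iterations, restart interval
+//   viennacl::vector<double> x = viennacl::linalg::solve(A, rhs, config);
+//
+//   std::cout << "Iterations: " << config.iters() << ", est. rel. error: " << config.error() << std::endl;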
+
+
+
+namespace detail
+{
+ /** @brief Implementation of a pipelined stabilized Bi-conjugate gradient solver */
+ template<typename MatrixT, typename NumericT>
+ viennacl::vector<NumericT> pipelined_solve(MatrixT const & A, //MatrixType const & A,
+ viennacl::vector_base<NumericT> const & rhs,
+ bicgstab_tag const & tag,
+ viennacl::linalg::no_precond,
+ bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+ void *monitor_data = NULL)
+ {
+ viennacl::vector<NumericT> result = viennacl::zero_vector<NumericT>(rhs.size(), viennacl::traits::context(rhs));
+
+ viennacl::vector<NumericT> residual = rhs;
+ viennacl::vector<NumericT> p = rhs;
+ viennacl::vector<NumericT> r0star = rhs;
+ viennacl::vector<NumericT> Ap = rhs;
+ viennacl::vector<NumericT> s = rhs;
+ viennacl::vector<NumericT> As = rhs;
+
+ // Layout of temporary buffer:
+ // chunk 0: <residual, r_0^*>
+ // chunk 1: <As, As>
+ // chunk 2: <As, s>
+ // chunk 3: <Ap, r_0^*>
+ // chunk 4: <As, r_0^*>
+ // chunk 5: <s, s>
+ vcl_size_t buffer_size_per_vector = 256;
+ vcl_size_t num_buffer_chunks = 6;
+ viennacl::vector<NumericT> inner_prod_buffer = viennacl::zero_vector<NumericT>(num_buffer_chunks*buffer_size_per_vector, viennacl::traits::context(rhs)); // temporary buffer
+ std::vector<NumericT> host_inner_prod_buffer(inner_prod_buffer.size());
+
+ NumericT norm_rhs_host = viennacl::linalg::norm_2(residual);
+ NumericT beta;
+ NumericT alpha;
+ NumericT omega;
+ NumericT residual_norm = norm_rhs_host;
+ inner_prod_buffer[0] = norm_rhs_host * norm_rhs_host;
+
+ NumericT r_dot_r0 = 0;
+ NumericT As_dot_As = 0;
+ NumericT As_dot_s = 0;
+ NumericT Ap_dot_r0 = 0;
+ NumericT As_dot_r0 = 0;
+ NumericT s_dot_s = 0;
+
+ if (norm_rhs_host <= tag.abs_tolerance()) //solution is zero if RHS norm is zero
+ return result;
+
+ for (vcl_size_t i = 0; i < tag.max_iterations(); ++i)
+ {
+ tag.iters(i+1);
+ // Ap = A*p_j
+ // Ap_dot_r0 = <Ap, r_0^*>
+ viennacl::linalg::pipelined_bicgstab_prod(A, p, Ap, r0star,
+ inner_prod_buffer, buffer_size_per_vector, 3*buffer_size_per_vector);
+
+ //////// first (weak) synchronization point ////
+
+ ///// method 1: compute alpha on host:
+ //
+ //// we only need the second chunk of the buffer for computing Ap_dot_r0:
+ //viennacl::fast_copy(inner_prod_buffer.begin(), inner_prod_buffer.end(), host_inner_prod_buffer.begin());
+ //Ap_dot_r0 = std::accumulate(host_inner_prod_buffer.begin() + buffer_size_per_vector, host_inner_prod_buffer.begin() + 2 * buffer_size_per_vector, ScalarType(0));
+
+ //alpha = residual_dot_r0 / Ap_dot_r0;
+
+ //// s_j = r_j - alpha_j q_j
+ //s = residual - alpha * Ap;
+
+ ///// method 2: compute alpha on device:
+ // s = r - alpha * Ap
+ // <s, s> first stage
+ // dump alpha at end of inner_prod_buffer
+ viennacl::linalg::pipelined_bicgstab_update_s(s, residual, Ap,
+ inner_prod_buffer, buffer_size_per_vector, 5*buffer_size_per_vector);
+
+ // As = A*s_j
+ // As_dot_As = <As, As>
+ // As_dot_s = <As, s>
+ // As_dot_r0 = <As, r_0^*>
+ viennacl::linalg::pipelined_bicgstab_prod(A, s, As, r0star,
+ inner_prod_buffer, buffer_size_per_vector, 4*buffer_size_per_vector);
+
+ //////// second (strong) synchronization point ////
+
+ viennacl::fast_copy(inner_prod_buffer.begin(), inner_prod_buffer.end(), host_inner_prod_buffer.begin());
+
+ typedef typename std::vector<NumericT>::difference_type difference_type;
+
+ r_dot_r0 = std::accumulate(host_inner_prod_buffer.begin(), host_inner_prod_buffer.begin() + difference_type( buffer_size_per_vector), NumericT(0));
+ As_dot_As = std::accumulate(host_inner_prod_buffer.begin() + difference_type( buffer_size_per_vector), host_inner_prod_buffer.begin() + difference_type(2 * buffer_size_per_vector), NumericT(0));
+ As_dot_s = std::accumulate(host_inner_prod_buffer.begin() + difference_type(2 * buffer_size_per_vector), host_inner_prod_buffer.begin() + difference_type(3 * buffer_size_per_vector), NumericT(0));
+ Ap_dot_r0 = std::accumulate(host_inner_prod_buffer.begin() + difference_type(3 * buffer_size_per_vector), host_inner_prod_buffer.begin() + difference_type(4 * buffer_size_per_vector), NumericT(0));
+ As_dot_r0 = std::accumulate(host_inner_prod_buffer.begin() + difference_type(4 * buffer_size_per_vector), host_inner_prod_buffer.begin() + difference_type(5 * buffer_size_per_vector), NumericT(0));
+ s_dot_s = std::accumulate(host_inner_prod_buffer.begin() + difference_type(5 * buffer_size_per_vector), host_inner_prod_buffer.begin() + difference_type(6 * buffer_size_per_vector), NumericT(0));
+
+ alpha = r_dot_r0 / Ap_dot_r0;
+ beta = - As_dot_r0 / Ap_dot_r0;
+ omega = As_dot_s / As_dot_As;
+
+ residual_norm = std::sqrt(s_dot_s - NumericT(2.0) * omega * As_dot_s + omega * omega * As_dot_As);
+ if (monitor && monitor(result, std::fabs(residual_norm / norm_rhs_host), monitor_data))
+ break;
+ if (std::fabs(residual_norm / norm_rhs_host) < tag.tolerance() || residual_norm < tag.abs_tolerance())
+ break;
+
+ // x_{j+1} = x_j + alpha * p_j + omega * s_j
+ // r_{j+1} = s_j - omega * t_j
+ // p_{j+1} = r_{j+1} + beta * (p_j - omega * q_j)
+ // and compute first stage of r_dot_r0 = <r_{j+1}, r_0^*> for use in next iteration
+ viennacl::linalg::pipelined_bicgstab_vector_update(result, alpha, p, omega, s,
+ residual, As,
+ beta, Ap,
+ r0star, inner_prod_buffer, buffer_size_per_vector);
+ }
+
+ //store last error estimate:
+ tag.error(residual_norm / norm_rhs_host);
+
+ return result;
+ }
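
The six inner products needed per iteration are accumulated in one packed device buffer of num_buffer_chunks * buffer_size_per_vector entries and reduced chunk by chunk on the host at the strong synchronization point. A ViennaCL-independent sketch of that reduction pattern (chunk size and data are illustrative):

    #include <numeric>
    #include <vector>
    #include <cstddef>

    // Reduce chunk `c` of a packed partial-result buffer, mirroring the
    // std::accumulate calls above (one chunk per inner product).
    double reduce_chunk(std::vector<double> const & buf, std::size_t chunk_size, std::size_t c)
    {
      typedef std::vector<double>::difference_type diff_t;
      return std::accumulate(buf.begin() + static_cast<diff_t>(c * chunk_size),
                             buf.begin() + static_cast<diff_t>((c + 1) * chunk_size),
                             0.0);
    }
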
+
+ /** @brief Overload of the pipelined BiCGStab implementation for the ViennaCL sparse matrix types */
+ template<typename NumericT>
+ viennacl::vector<NumericT> solve_impl(viennacl::compressed_matrix<NumericT> const & A,
+ viennacl::vector<NumericT> const & rhs,
+ bicgstab_tag const & tag,
+ viennacl::linalg::no_precond,
+ bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+ void *monitor_data = NULL)
+ {
+ return pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+ }
+
+
+ /** @brief Overload of the pipelined BiCGStab implementation for the ViennaCL sparse matrix types */
+ template<typename NumericT>
+ viennacl::vector<NumericT> solve_impl(viennacl::coordinate_matrix<NumericT> const & A,
+ viennacl::vector<NumericT> const & rhs,
+ bicgstab_tag const & tag,
+ viennacl::linalg::no_precond,
+ bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+ void *monitor_data = NULL)
+ {
+ return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+ }
+
+
+
+ /** @brief Overload of the pipelined BiCGStab implementation for the ViennaCL sparse matrix types */
+ template<typename NumericT>
+ viennacl::vector<NumericT> solve_impl(viennacl::ell_matrix<NumericT> const & A,
+ viennacl::vector<NumericT> const & rhs,
+ bicgstab_tag const & tag,
+ viennacl::linalg::no_precond,
+ bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+ void *monitor_data = NULL)
+ {
+ return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+ }
+
+
+
+ /** @brief Overload of the pipelined BiCGStab implementation for the ViennaCL sparse matrix types */
+ template<typename NumericT>
+ viennacl::vector<NumericT> solve_impl(viennacl::sliced_ell_matrix<NumericT> const & A,
+ viennacl::vector<NumericT> const & rhs,
+ bicgstab_tag const & tag,
+ viennacl::linalg::no_precond,
+ bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+ void *monitor_data = NULL)
+ {
+ return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+ }
+
+
+ /** @brief Overload of the pipelined BiCGStab implementation for the ViennaCL sparse matrix types */
+ template<typename NumericT>
+ viennacl::vector<NumericT> solve_impl(viennacl::hyb_matrix<NumericT> const & A,
+ viennacl::vector<NumericT> const & rhs,
+ bicgstab_tag const & tag,
+ viennacl::linalg::no_precond,
+ bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+ void *monitor_data = NULL)
+ {
+ return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+ }
+
+
+ /** @brief Implementation of the unpreconditioned stabilized Bi-conjugate gradient solver
+ *
+ * Following the description in "Iterative Methods for Sparse Linear Systems" by Y. Saad
+ *
+ * @param matrix The system matrix
+ * @param rhs The load vector
+ * @param tag Solver configuration tag
+ * @param monitor A callback routine which is called in each solver iteration
+ * @param monitor_data Data pointer to be passed to the callback routine to pass on user-specific data
+ * @return The result vector
+ */
+ template<typename MatrixT, typename VectorT>
+ VectorT solve_impl(MatrixT const & matrix,
+ VectorT const & rhs,
+ bicgstab_tag const & tag,
+ viennacl::linalg::no_precond,
+ bool (*monitor)(VectorT const &, typename viennacl::result_of::cpu_value_type<typename viennacl::result_of::value_type<VectorT>::type>::type, void*) = NULL,
+ void *monitor_data = NULL)
+ {
+ typedef typename viennacl::result_of::value_type<VectorT>::type NumericType;
+ typedef typename viennacl::result_of::cpu_value_type<NumericType>::type CPU_NumericType;
+ VectorT result = rhs;
+ viennacl::traits::clear(result);
+
+ VectorT residual = rhs;
+ VectorT p = rhs;
+ VectorT r0star = rhs;
+ VectorT tmp0 = rhs;
+ VectorT tmp1 = rhs;
+ VectorT s = rhs;
+
+ CPU_NumericType norm_rhs_host = viennacl::linalg::norm_2(residual);
+ CPU_NumericType ip_rr0star = norm_rhs_host * norm_rhs_host;
+ CPU_NumericType beta;
+ CPU_NumericType alpha;
+ CPU_NumericType omega;
+ //ScalarType inner_prod_temp; //temporary variable for inner product computation
+ CPU_NumericType new_ip_rr0star = 0;
+ CPU_NumericType residual_norm = norm_rhs_host;
+
+ if (norm_rhs_host <= tag.abs_tolerance()) //solution is zero if RHS norm is zero
+ return result;
+
+ bool restart_flag = true;
+ vcl_size_t last_restart = 0;
+ for (vcl_size_t i = 0; i < tag.max_iterations(); ++i)
+ {
+ if (restart_flag)
+ {
+ residual = viennacl::linalg::prod(matrix, result);
+ residual = rhs - residual;
+ p = residual;
+ r0star = residual;
+ ip_rr0star = viennacl::linalg::norm_2(residual);
+ ip_rr0star *= ip_rr0star;
+ restart_flag = false;
+ last_restart = i;
+ }
+
+ tag.iters(i+1);
+ tmp0 = viennacl::linalg::prod(matrix, p);
+ alpha = ip_rr0star / viennacl::linalg::inner_prod(tmp0, r0star);
+
+ s = residual - alpha*tmp0;
+
+ tmp1 = viennacl::linalg::prod(matrix, s);
+ CPU_NumericType norm_tmp1 = viennacl::linalg::norm_2(tmp1);
+ omega = viennacl::linalg::inner_prod(tmp1, s) / (norm_tmp1 * norm_tmp1);
+
+ result += alpha * p + omega * s;
+ residual = s - omega * tmp1;
+
+ new_ip_rr0star = viennacl::linalg::inner_prod(residual, r0star);
+ residual_norm = viennacl::linalg::norm_2(residual);
+ if (monitor && monitor(result, std::fabs(residual_norm / norm_rhs_host), monitor_data))
+ break;
+ if (std::fabs(residual_norm / norm_rhs_host) < tag.tolerance() || residual_norm < tag.abs_tolerance())
+ break;
+
+ beta = new_ip_rr0star / ip_rr0star * alpha/omega;
+ ip_rr0star = new_ip_rr0star;
+
+ if ( (ip_rr0star <= 0 && ip_rr0star >= 0)
+ || (omega <= 0 && omega >= 0)
+ || (i - last_restart > tag.max_iterations_before_restart())
+ ) //search direction degenerate. A restart might help
+ restart_flag = true;
+
+ // Execution of
+ // p = residual + beta * (p - omega*tmp0);
+ // without introducing temporary vectors:
+ p -= omega * tmp0;
+ p = residual + beta * p;
+ }
+
+ //store last error estimate:
+ tag.error(residual_norm / norm_rhs_host);
+
+ return result;
+ }
+
+
+ /** @brief Implementation of the preconditioned stabilized Bi-conjugate gradient solver
+ *
+ * Following the description of the unpreconditioned case in "Iterative Methods for Sparse Linear Systems" by Y. Saad
+ *
+ * @param matrix The system matrix
+ * @param rhs The load vector
+ * @param tag Solver configuration tag
+ * @param precond A preconditioner. Precondition operation is done via member function apply()
+ * @param monitor A callback routine which is called in each solver iteration
+ * @param monitor_data Data pointer to be passed to the callback routine to pass on user-specific data
+ * @return The result vector
+ */
+ template<typename MatrixT, typename VectorT, typename PreconditionerT>
+ VectorT solve_impl(MatrixT const & matrix,
+ VectorT const & rhs,
+ bicgstab_tag const & tag,
+ PreconditionerT const & precond,
+ bool (*monitor)(VectorT const &, typename viennacl::result_of::cpu_value_type<typename viennacl::result_of::value_type<VectorT>::type>::type, void*) = NULL,
+ void *monitor_data = NULL)
+ {
+ typedef typename viennacl::result_of::value_type<VectorT>::type NumericType;
+ typedef typename viennacl::result_of::cpu_value_type<NumericType>::type CPU_NumericType;
+ VectorT result = rhs;
+ viennacl::traits::clear(result);
+
+ VectorT residual = rhs;
+ VectorT r0star = residual; //can be chosen arbitrarily in fact
+ VectorT tmp0 = rhs;
+ VectorT tmp1 = rhs;
+ VectorT s = rhs;
+
+ VectorT p = residual;
+
+ CPU_NumericType ip_rr0star = viennacl::linalg::norm_2(residual);
+ CPU_NumericType norm_rhs_host = viennacl::linalg::norm_2(residual);
+ CPU_NumericType beta;
+ CPU_NumericType alpha;
+ CPU_NumericType omega;
+ CPU_NumericType new_ip_rr0star = 0;
+ CPU_NumericType residual_norm = norm_rhs_host;
+
+ if (norm_rhs_host <= tag.abs_tolerance()) //solution is zero if RHS norm is zero
+ return result;
+
+ bool restart_flag = true;
+ vcl_size_t last_restart = 0;
+ for (unsigned int i = 0; i < tag.max_iterations(); ++i)
+ {
+ if (restart_flag)
+ {
+ residual = viennacl::linalg::prod(matrix, result);
+ residual = rhs - residual;
+ precond.apply(residual);
+ p = residual;
+ r0star = residual;
+ ip_rr0star = viennacl::linalg::norm_2(residual);
+ ip_rr0star *= ip_rr0star;
+ restart_flag = false;
+ last_restart = i;
+ }
+
+ tag.iters(i+1);
+ tmp0 = viennacl::linalg::prod(matrix, p);
+ precond.apply(tmp0);
+ alpha = ip_rr0star / viennacl::linalg::inner_prod(tmp0, r0star);
+
+ s = residual - alpha*tmp0;
+
+ tmp1 = viennacl::linalg::prod(matrix, s);
+ precond.apply(tmp1);
+ CPU_NumericType norm_tmp1 = viennacl::linalg::norm_2(tmp1);
+ omega = viennacl::linalg::inner_prod(tmp1, s) / (norm_tmp1 * norm_tmp1);
+
+ result += alpha * p + omega * s;
+ residual = s - omega * tmp1;
+
+ residual_norm = viennacl::linalg::norm_2(residual);
+ if (monitor && monitor(result, std::fabs(residual_norm / norm_rhs_host), monitor_data))
+ break;
+ if (residual_norm / norm_rhs_host < tag.tolerance() || residual_norm < tag.abs_tolerance())
+ break;
+
+ new_ip_rr0star = viennacl::linalg::inner_prod(residual, r0star);
+
+ beta = new_ip_rr0star / ip_rr0star * alpha/omega;
+ ip_rr0star = new_ip_rr0star;
+
+ if ( (ip_rr0star >= 0 && ip_rr0star <= 0) || (omega >=0 && omega <= 0) || i - last_restart > tag.max_iterations_before_restart()) //search direction degenerate. A restart might help
+ restart_flag = true;
+
+ // Execution of
+ // p = residual + beta * (p - omega*tmp0);
+ // without introducing temporary vectors:
+ p -= omega * tmp0;
+ p = residual + beta * p;
+
+ //std::cout << "Rel. Residual in current step: " << std::sqrt(std::fabs(viennacl::linalg::inner_prod(residual, residual) / norm_rhs_host)) << std::endl;
+ }
+
+ //store last error estimate:
+ tag.error(residual_norm / norm_rhs_host);
+
+ return result;
+ }
+
+}
+
+
+
+template<typename MatrixT, typename VectorT, typename PreconditionerT>
+VectorT solve(MatrixT const & matrix, VectorT const & rhs, bicgstab_tag const & tag, PreconditionerT const & precond)
+{
+ return detail::solve_impl(matrix, rhs, tag, precond);
+}
+
+
+/** @brief Convenience overload for calling the preconditioned BiCGStab solver using types from the C++ STL.
+ *
+ * A std::vector<std::map<T, U> > matrix is convenient for e.g. finite element assembly.
+ * It is not the fastest option for setting up a system, but often it is fast enough - particularly for just trying things out.
+ */
+template<typename IndexT, typename NumericT, typename PreconditionerT>
+std::vector<NumericT> solve(std::vector< std::map<IndexT, NumericT> > const & A, std::vector<NumericT> const & rhs, bicgstab_tag const & tag, PreconditionerT const & precond)
+{
+ viennacl::compressed_matrix<NumericT> vcl_A;
+ viennacl::copy(A, vcl_A);
+
+ viennacl::vector<NumericT> vcl_rhs(rhs.size());
+ viennacl::copy(rhs, vcl_rhs);
+
+ viennacl::vector<NumericT> vcl_result = solve(vcl_A, vcl_rhs, tag, precond);
+
+ std::vector<NumericT> result(vcl_result.size());
+ viennacl::copy(vcl_result, result);
+ return result;
+}
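
A minimal usage sketch for the convenience overload above; the 3x3 system and the tolerance are made up purely for illustration:

    #include <map>
    #include <vector>
    #include "viennacl/linalg/bicgstab.hpp"

    // Assemble a tiny tridiagonal system as std::vector<std::map<> > and solve
    // it with unpreconditioned BiCGStab (values are illustrative only).
    std::vector<double> stl_bicgstab_example()
    {
      std::vector<std::map<unsigned int, double> > A(3);
      A[0][0] = 4.0; A[0][1] = 1.0;
      A[1][0] = 1.0; A[1][1] = 3.0; A[1][2] = 1.0;
      A[2][1] = 1.0; A[2][2] = 2.0;
      std::vector<double> b(3, 1.0);

      viennacl::linalg::bicgstab_tag tag(1e-10, 200);
      return viennacl::linalg::solve(A, b, tag, viennacl::linalg::no_precond());
    }
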
+
+/** @brief Entry point for the unpreconditioned BiCGStab method.
+ *
+ * @param matrix The system matrix
+ * @param rhs Right hand side vector (load vector)
+ * @param tag A BiCGStab tag providing relative tolerances, etc.
+ */
+template<typename MatrixT, typename VectorT>
+VectorT solve(MatrixT const & matrix, VectorT const & rhs, bicgstab_tag const & tag)
+{
+ return solve(matrix, rhs, tag, viennacl::linalg::no_precond());
+}
+
+
+
+template<typename VectorT>
+class bicgstab_solver
+{
+public:
+ typedef typename viennacl::result_of::cpu_value_type<VectorT>::type numeric_type;
+
+ bicgstab_solver(bicgstab_tag const & tag) : tag_(tag), monitor_callback_(NULL), user_data_(NULL) {}
+
+ template<typename MatrixT, typename PreconditionerT>
+ VectorT operator()(MatrixT const & A, VectorT const & b, PreconditionerT const & precond) const
+ {
+ if (viennacl::traits::size(init_guess_) > 0) // take initial guess into account
+ {
+ VectorT mod_rhs = viennacl::linalg::prod(A, init_guess_);
+ mod_rhs = b - mod_rhs;
+ VectorT y = detail::solve_impl(A, mod_rhs, tag_, precond, monitor_callback_, user_data_);
+ return init_guess_ + y;
+ }
+ return detail::solve_impl(A, b, tag_, precond, monitor_callback_, user_data_);
+ }
+
+
+ template<typename MatrixT>
+ VectorT operator()(MatrixT const & A, VectorT const & b) const
+ {
+ return operator()(A, b, viennacl::linalg::no_precond());
+ }
+
+ /** @brief Specifies an initial guess for the iterative solver.
+ *
+ * An iterative solver for Ax = b with initial guess x_0 is equivalent to an iterative solver for Ay = b' := b - Ax_0, where x = x_0 + y.
+ */
+ void set_initial_guess(VectorT const & x) { init_guess_ = x; }
+
+ /** @brief Sets a monitor function pointer to be called in each iteration. Set to NULL to run without monitor.
+ *
+ * The monitor function is called with the current guess for the result as first argument and the current relative residual estimate as second argument.
+ * The third argument is a pointer to user-defined data, through which additional information can be passed.
+ * This pointer needs to be set with set_monitor_data. If not set, NULL is passed.
+ * If the monitor function returns true, the solver terminates (regardless of convergence or divergence).
+ */
+ void set_monitor(bool (*monitor_fun)(VectorT const &, numeric_type, void *), void *user_data)
+ {
+ monitor_callback_ = monitor_fun;
+ user_data_ = user_data;
+ }
+
+ /** @brief Returns the solver tag containing basic configuration such as tolerances, etc. */
+ bicgstab_tag const & tag() const { return tag_; }
+
+private:
+ bicgstab_tag tag_;
+ VectorT init_guess_;
+ bool (*monitor_callback_)(VectorT const &, numeric_type, void *);
+ void *user_data_;
+};
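
A minimal sketch of using the solver object above with an initial guess; matrix and vector contents are assumed to be provided by the caller:

    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/bicgstab.hpp"

    // Reuse a configured solver object, supply an initial guess x0, and read
    // back the iteration count and error estimate afterwards.
    viennacl::vector<double> solve_with_guess(viennacl::compressed_matrix<double> const & A,
                                              viennacl::vector<double> const & b,
                                              viennacl::vector<double> const & x0)
    {
      viennacl::linalg::bicgstab_solver<viennacl::vector<double> > solver(
          viennacl::linalg::bicgstab_tag(1e-8, 400, 200));
      solver.set_initial_guess(x0);
      viennacl::vector<double> x = solver(A, b);   // unpreconditioned run
      // solver.tag().iters() and solver.tag().error() now hold the run statistics.
      return x;
    }
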
+
+
+}
+}
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/bisect.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/bisect.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/bisect.hpp
new file mode 100644
index 0000000..a2daf5e
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/bisect.hpp
@@ -0,0 +1,179 @@
+#ifndef VIENNACL_LINALG_BISECT_HPP_
+#define VIENNACL_LINALG_BISECT_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/bisect.hpp
+* @brief Implementation of the algorithm for finding eigenvalues of a tridiagonal matrix.
+*
+* Contributed by Guenther Mader and Astrid Rupp.
+*/
+
+#include <vector>
+#include <cmath>
+#include <limits>
+#include <cstddef>
+#include "viennacl/meta/result_of.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+
+namespace detail
+{
+ /**
+ * @brief overloaded function for copying vectors
+ */
+ template<typename NumericT, typename OtherVectorT>
+ void copy_vec_to_vec(viennacl::vector<NumericT> const & src, OtherVectorT & dest)
+ {
+ viennacl::copy(src, dest);
+ }
+
+ template<typename OtherVectorT, typename NumericT>
+ void copy_vec_to_vec(OtherVectorT const & src, viennacl::vector<NumericT> & dest)
+ {
+ viennacl::copy(src, dest);
+ }
+
+ template<typename VectorT1, typename VectorT2>
+ void copy_vec_to_vec(VectorT1 const & src, VectorT2 & dest)
+ {
+ for (vcl_size_t i=0; i<src.size(); ++i)
+ dest[i] = src[i];
+ }
+
+} //namespace detail
+
+/**
+* @brief Implementation of the bisect-algorithm for the calculation of the eigenvalues of a tridiagonal matrix. Experimental - interface might change.
+*
+* Refer to "Calculation of the Eigenvalues of a Symmetric Tridiagonal Matrix by the Method of Bisection" in the Handbook Series Linear Algebra, contributed by Barth, Martin, and Wilkinson.
+* http://www.maths.ed.ac.uk/~aar/papers/bamawi.pdf
+*
+* @param alphas Elements of the main diagonal
+* @param betas Elements of the secondary diagonal
+* @return Returns the eigenvalues of the tridiagonal matrix defined by alphas and betas
+*/
+template<typename VectorT>
+std::vector<
+ typename viennacl::result_of::cpu_value_type<typename VectorT::value_type>::type
+ >
+bisect(VectorT const & alphas, VectorT const & betas)
+{
+ typedef typename viennacl::result_of::value_type<VectorT>::type NumericType;
+ typedef typename viennacl::result_of::cpu_value_type<NumericType>::type CPU_NumericType;
+
+ vcl_size_t size = betas.size();
+ std::vector<CPU_NumericType> x_temp(size);
+
+
+ std::vector<CPU_NumericType> beta_bisect;
+ std::vector<CPU_NumericType> wu;
+
+ double rel_error = std::numeric_limits<CPU_NumericType>::epsilon();
+ beta_bisect.push_back(0);
+
+ for (vcl_size_t i = 1; i < size; i++)
+ beta_bisect.push_back(betas[i] * betas[i]);
+
+ double xmin = alphas[size - 1] - std::fabs(betas[size - 1]);
+ double xmax = alphas[size - 1] + std::fabs(betas[size - 1]);
+
+ for (vcl_size_t i = 0; i < size - 1; i++)
+ {
+ double h = std::fabs(betas[i]) + std::fabs(betas[i + 1]);
+ if (alphas[i] + h > xmax)
+ xmax = alphas[i] + h;
+ if (alphas[i] - h < xmin)
+ xmin = alphas[i] - h;
+ }
+
+
+ double eps1 = 1e-6;
+ /*double eps2 = (xmin + xmax > 0) ? (rel_error * xmax) : (-rel_error * xmin);
+ if (eps1 <= 0)
+ eps1 = eps2;
+ else
+ eps2 = 0.5 * eps1 + 7.0 * eps2; */
+
+ double x0 = xmax;
+
+ for (vcl_size_t i = 0; i < size; i++)
+ {
+ x_temp[i] = xmax;
+ wu.push_back(xmin);
+ }
+
+ for (long k = static_cast<long>(size) - 1; k >= 0; --k)
+ {
+ double xu = xmin;
+ for (long i = k; i >= 0; --i)
+ {
+ if (xu < wu[vcl_size_t(k-i)])
+ {
+ xu = wu[vcl_size_t(i)];
+ break;
+ }
+ }
+
+ if (x0 > x_temp[vcl_size_t(k)])
+ x0 = x_temp[vcl_size_t(k)];
+
+ double x1 = (xu + x0) / 2.0;
+ while (x0 - xu > 2.0 * rel_error * (std::fabs(xu) + std::fabs(x0)) + eps1)
+ {
+ vcl_size_t a = 0;
+ double q = 1;
+ for (vcl_size_t i = 0; i < size; i++)
+ {
+ if (q > 0 || q < 0)
+ q = alphas[i] - x1 - beta_bisect[i] / q;
+ else
+ q = alphas[i] - x1 - std::fabs(betas[i] / rel_error);
+
+ if (q < 0)
+ a++;
+ }
+
+ if (a <= static_cast<vcl_size_t>(k))
+ {
+ xu = x1;
+ if (a < 1)
+ wu[0] = x1;
+ else
+ {
+ wu[a] = x1;
+ if (x_temp[a - 1] > x1)
+ x_temp[a - 1] = x1;
+ }
+ }
+ else
+ x0 = x1;
+
+ x1 = (xu + x0) / 2.0;
+ }
+ x_temp[vcl_size_t(k)] = x1;
+ }
+ return x_temp;
+}
+
+} // end namespace linalg
+} // end namespace viennacl
+#endif
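
A minimal usage sketch for the routine above; the 3x3 tridiagonal matrix is illustrative, and betas[0] is set to zero to match the indexing used in the squared-off-diagonal loop:

    #include <vector>
    #include "viennacl/linalg/bisect.hpp"

    // Eigenvalues of a small symmetric tridiagonal matrix with main diagonal
    // `alphas` and off-diagonal `betas` (values are illustrative only).
    std::vector<double> tridiag_eigenvalues_example()
    {
      std::vector<double> alphas(3, 2.0);   // main diagonal
      std::vector<double> betas(3, -1.0);   // secondary diagonal
      betas[0] = 0.0;
      return viennacl::linalg::bisect(alphas, betas);
    }
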
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/bisect_gpu.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/bisect_gpu.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/bisect_gpu.hpp
new file mode 100644
index 0000000..6918b14
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/bisect_gpu.hpp
@@ -0,0 +1,173 @@
+#ifndef VIENNACL_LINALG_BISECT_GPU
+#define VIENNACL_LINALG_BISECT_GPU
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/bisect_gpu.hpp
+ @brief Implementation of a bisection algorithm for eigenvalues
+
+ Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+ the creation of derivative works is allowed by including the following statement:
+ "This software contains source code provided by NVIDIA Corporation."
+*/
+// includes, system
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <float.h>   // FLT_MAX is used for the initial Gerschgorin bounds below
+
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+
+// includes, project
+#include "viennacl/linalg/detail/bisect/structs.hpp"
+#include "viennacl/linalg/detail/bisect/gerschgorin.hpp"
+#include "viennacl/linalg/detail/bisect/bisect_large.hpp"
+#include "viennacl/linalg/detail/bisect/bisect_small.hpp"
+
+
+namespace viennacl
+{
+namespace linalg
+{
+///////////////////////////////////////////////////////////////////////////
+//! @brief bisect The bisection algorithm computes the eigenvalues
+//! of a symmetric tridiagonal matrix.
+//! @param diagonal diagonal elements of the matrix
+//! @param superdiagonal superdiagonal elements of the matrix
+//! @param eigenvalues Vector with the eigenvalues in ascending order
+//! @return Returns false if any errors occurred
+///
+//! overloaded function template: std::vectors as parameters
+template<typename NumericT>
+bool
+bisect(const std::vector<NumericT> & diagonal, const std::vector<NumericT> & superdiagonal, std::vector<NumericT> & eigenvalues)
+{
+ assert(diagonal.size() == superdiagonal.size() &&
+ diagonal.size() == eigenvalues.size() &&
+ bool("Input vectors do not have the same sizes!"));
+ bool bResult = false;
+ // desired precision of eigenvalues
+ NumericT precision = static_cast<NumericT>(0.00001);
+ const unsigned int mat_size = static_cast<unsigned int>(diagonal.size());
+
+ // set up input
+ viennacl::linalg::detail::InputData<NumericT> input(diagonal, superdiagonal, mat_size);
+
+ NumericT lg = FLT_MAX;
+ NumericT ug = -FLT_MAX;
+ // compute Gerschgorin interval
+ viennacl::linalg::detail::computeGerschgorin(input.std_a, input.std_b, mat_size, lg, ug);
+
+ // decide whether the algorithm for small or for large matrices will be started
+ if (mat_size <= VIENNACL_BISECT_MAX_SMALL_MATRIX)
+ {
+ // initialize memory for result
+ viennacl::linalg::detail::ResultDataSmall<NumericT> result(mat_size);
+
+ // run the kernel
+ viennacl::linalg::detail::computeEigenvaluesSmallMatrix(input, result, mat_size, lg, ug, precision);
+
+ // get the result from the device and do some sanity checks,
+ viennacl::linalg::detail::processResultSmallMatrix(result, mat_size);
+ eigenvalues = result.std_eigenvalues;
+ bResult = true;
+ }
+
+ else
+ {
+ // initialize memory for result
+ viennacl::linalg::detail::ResultDataLarge<NumericT> result(mat_size);
+
+ // run the kernel
+ viennacl::linalg::detail::computeEigenvaluesLargeMatrix(input, result, mat_size, lg, ug, precision);
+
+ // get the result from the device and do some sanity checks
+ bResult = viennacl::linalg::detail::processResultDataLargeMatrix(result, mat_size);
+
+ eigenvalues = result.std_eigenvalues;
+ }
+ return bResult;
+}
+
+
+///////////////////////////////////////////////////////////////////////////
+//! @brief bisect The bisection algorithm computes the eigenvalues
+//! of a symmetric tridiagonal matrix.
+//! @param diagonal diagonal elements of the matrix
+//! @param superdiagonal superdiagonal elements of the matrix
+//! @param eigenvalues Vector with the eigenvalues in ascending order
+//! @return Returns false if any errors occurred
+///
+//! overloaded function template: viennacl::vectors as parameters
+template<typename NumericT>
+bool
+bisect(const viennacl::vector<NumericT> & diagonal, const viennacl::vector<NumericT> & superdiagonal, viennacl::vector<NumericT> & eigenvalues)
+{
+ assert(diagonal.size() == superdiagonal.size() &&
+ diagonal.size() == eigenvalues.size() &&
+ bool("Input vectors do not have the same sizes!"));
+ bool bResult = false;
+ // desired precision of eigenvalues
+ NumericT precision = static_cast<NumericT>(0.00001);
+ const unsigned int mat_size = static_cast<unsigned int>(diagonal.size());
+
+ // set up input
+ viennacl::linalg::detail::InputData<NumericT> input(diagonal, superdiagonal, mat_size);
+
+ NumericT lg = FLT_MAX;
+ NumericT ug = -FLT_MAX;
+ // compute Gerschgorin interval
+ viennacl::linalg::detail::computeGerschgorin(input.std_a, input.std_b, mat_size, lg, ug);
+
+ // decide whether the algorithm for small or for large matrices will be started
+ if (mat_size <= VIENNACL_BISECT_MAX_SMALL_MATRIX)
+ {
+ // initialize memory for result
+ viennacl::linalg::detail::ResultDataSmall<NumericT> result(mat_size);
+
+ // run the kernel
+ viennacl::linalg::detail::computeEigenvaluesSmallMatrix(input, result, mat_size, lg, ug, precision);
+
+ // get the result from the device and do some sanity checks,
+ viennacl::linalg::detail::processResultSmallMatrix(result, mat_size);
+ copy(result.std_eigenvalues, eigenvalues);
+ bResult = true;
+ }
+
+ else
+ {
+ // initialize memory for result
+ viennacl::linalg::detail::ResultDataLarge<NumericT> result(mat_size);
+
+ // run the kernel
+ viennacl::linalg::detail::computeEigenvaluesLargeMatrix(input, result, mat_size, lg, ug, precision);
+
+ // get the result from the device and do some sanity checks
+ bResult = viennacl::linalg::detail::processResultDataLargeMatrix(result, mat_size);
+
+ copy(result.std_eigenvalues, eigenvalues);
+ }
+ return bResult;
+}
+} // namespace linalg
+} // namespace viennacl
+#endif
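
A minimal usage sketch for the std::vector overload above; all three vectors must have the same size, and the eigenvalues vector must be pre-sized (see the assert):

    #include <vector>
    #include "viennacl/linalg/bisect_gpu.hpp"

    // Wrapper around the overload above; returns false if the device-side
    // computation reported an error.
    bool gpu_bisect_example(std::vector<float> const & diagonal,
                            std::vector<float> const & superdiagonal,
                            std::vector<float> & eigenvalues)
    {
      return viennacl::linalg::bisect(diagonal, superdiagonal, eigenvalues);
    }
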
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cg.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cg.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cg.hpp
new file mode 100644
index 0000000..93aae81
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cg.hpp
@@ -0,0 +1,440 @@
+#ifndef VIENNACL_LINALG_CG_HPP_
+#define VIENNACL_LINALG_CG_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cg.hpp
+ @brief The conjugate gradient method is implemented here
+*/
+
+#include <vector>
+#include <map>
+#include <cmath>
+#include <numeric>
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/ilu.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/traits/clear.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/linalg/iterative_operations.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief A tag for the conjugate gradient method. Used for supplying solver parameters and for dispatching the solve() function.
+*/
+class cg_tag
+{
+public:
+ /** @brief The constructor
+ *
+ * @param tol Relative tolerance for the residual (solver quits if ||r|| < tol * ||r_initial||)
+ * @param max_iterations The maximum number of iterations
+ */
+ cg_tag(double tol = 1e-8, unsigned int max_iterations = 300) : tol_(tol), abs_tol_(0), iterations_(max_iterations) {}
+
+ /** @brief Returns the relative tolerance */
+ double tolerance() const { return tol_; }
+
+ /** @brief Returns the absolute tolerance */
+ double abs_tolerance() const { return abs_tol_; }
+ /** @brief Sets the absolute tolerance */
+ void abs_tolerance(double new_tol) { if (new_tol >= 0) abs_tol_ = new_tol; }
+
+ /** @brief Returns the maximum number of iterations */
+ unsigned int max_iterations() const { return iterations_; }
+
+ /** @brief Returns the number of solver iterations */
+ unsigned int iters() const { return iters_taken_; }
+ void iters(unsigned int i) const { iters_taken_ = i; }
+
+ /** @brief Returns the estimated relative error at the end of the solver run */
+ double error() const { return last_error_; }
+ /** @brief Sets the estimated relative error at the end of the solver run */
+ void error(double e) const { last_error_ = e; }
+
+
+private:
+ double tol_;
+ double abs_tol_;
+ unsigned int iterations_;
+
+ //return values from solver
+ mutable unsigned int iters_taken_;
+ mutable double last_error_;
+};
+
+namespace detail
+{
+
+ /** @brief handles the no_precond case at minimal overhead */
+ template<typename VectorT, typename PreconditionerT>
+ class z_handler{
+ public:
+ z_handler(VectorT & residual) : z_(residual){ }
+ VectorT & get() { return z_; }
+ private:
+ VectorT z_;
+ };
+
+ template<typename VectorT>
+ class z_handler<VectorT, viennacl::linalg::no_precond>{
+ public:
+ z_handler(VectorT & residual) : presidual_(&residual){ }
+ VectorT & get() { return *presidual_; }
+ private:
+ VectorT * presidual_;
+ };
+
+}
+
+namespace detail
+{
+
+ /** @brief Implementation of a pipelined conjugate gradient algorithm (no preconditioner), specialized for ViennaCL types.
+ *
+ * Pipelined version from A. T. Chronopoulos and C. W. Gear, J. Comput. Appl. Math. 25(2), 153–168 (1989)
+ *
+ * @param A The system matrix
+ * @param rhs The load vector
+ * @param tag Solver configuration tag
+ * @param monitor A callback routine which is called in each solver iteration
+ * @param monitor_data Data pointer to be passed to the callback routine to pass on user-specific data
+ * @return The result vector
+ */
+ //template<typename MatrixType, typename ScalarType>
+ template<typename MatrixT, typename NumericT>
+ viennacl::vector<NumericT> pipelined_solve(MatrixT const & A, //MatrixType const & A,
+ viennacl::vector<NumericT> const & rhs,
+ cg_tag const & tag,
+ viennacl::linalg::no_precond,
+ bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+ void *monitor_data = NULL)
+ {
+ typedef typename viennacl::vector<NumericT>::difference_type difference_type;
+
+ viennacl::vector<NumericT> result(rhs);
+ viennacl::traits::clear(result);
+
+ viennacl::vector<NumericT> residual(rhs);
+ viennacl::vector<NumericT> p(rhs);
+ viennacl::vector<NumericT> Ap = viennacl::linalg::prod(A, p);
+ viennacl::vector<NumericT> inner_prod_buffer = viennacl::zero_vector<NumericT>(3*256, viennacl::traits::context(rhs)); // temporary buffer
+ std::vector<NumericT> host_inner_prod_buffer(inner_prod_buffer.size());
+ vcl_size_t buffer_size_per_vector = inner_prod_buffer.size() / 3;
+ difference_type buffer_offset_per_vector = static_cast<difference_type>(buffer_size_per_vector);
+
+ NumericT norm_rhs_squared = viennacl::linalg::norm_2(residual); norm_rhs_squared *= norm_rhs_squared;
+
+ if (norm_rhs_squared <= tag.abs_tolerance() * tag.abs_tolerance()) //check for early convergence of A*x = 0
+ return result;
+
+ NumericT inner_prod_rr = norm_rhs_squared;
+ NumericT alpha = inner_prod_rr / viennacl::linalg::inner_prod(p, Ap);
+ NumericT beta = viennacl::linalg::norm_2(Ap); beta = (alpha * alpha * beta * beta - inner_prod_rr) / inner_prod_rr;
+ NumericT inner_prod_ApAp = 0;
+ NumericT inner_prod_pAp = 0;
+
+ for (unsigned int i = 0; i < tag.max_iterations(); ++i)
+ {
+ tag.iters(i+1);
+
+ viennacl::linalg::pipelined_cg_vector_update(result, alpha, p, residual, Ap, beta, inner_prod_buffer);
+ viennacl::linalg::pipelined_cg_prod(A, p, Ap, inner_prod_buffer);
+
+ // bring back the partial results to the host:
+ viennacl::fast_copy(inner_prod_buffer.begin(), inner_prod_buffer.end(), host_inner_prod_buffer.begin());
+
+ inner_prod_rr = std::accumulate(host_inner_prod_buffer.begin(), host_inner_prod_buffer.begin() + buffer_offset_per_vector, NumericT(0));
+ inner_prod_ApAp = std::accumulate(host_inner_prod_buffer.begin() + buffer_offset_per_vector, host_inner_prod_buffer.begin() + 2 * buffer_offset_per_vector, NumericT(0));
+ inner_prod_pAp = std::accumulate(host_inner_prod_buffer.begin() + 2 * buffer_offset_per_vector, host_inner_prod_buffer.begin() + 3 * buffer_offset_per_vector, NumericT(0));
+
+ if (monitor && monitor(result, std::sqrt(std::fabs(inner_prod_rr / norm_rhs_squared)), monitor_data))
+ break;
+ if (std::fabs(inner_prod_rr / norm_rhs_squared) < tag.tolerance() * tag.tolerance() || std::fabs(inner_prod_rr) < tag.abs_tolerance() * tag.abs_tolerance()) //squared norms involved here
+ break;
+
+ alpha = inner_prod_rr / inner_prod_pAp;
+ beta = (alpha*alpha*inner_prod_ApAp - inner_prod_rr) / inner_prod_rr;
+ }
+
+ //store last error estimate:
+ tag.error(std::sqrt(std::fabs(inner_prod_rr) / norm_rhs_squared));
+
+ return result;
+ }
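
For reference, the recurrences evaluated above follow the Chronopoulos/Gear formulation cited in the docstring: with r_j the residual and p_j the search direction,

    alpha_j = <r_j, r_j> / <p_j, A p_j>
    beta_j  = (alpha_j^2 <A p_j, A p_j> - <r_j, r_j>) / <r_j, r_j>

so that all three inner products per iteration come from a single fused device reduction and only one synchronization point per iteration is needed.
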
+
+
+ /** @brief Overload for the pipelined CG implementation for the ViennaCL sparse matrix types */
+ template<typename NumericT>
+ viennacl::vector<NumericT> solve_impl(viennacl::compressed_matrix<NumericT> const & A,
+ viennacl::vector<NumericT> const & rhs,
+ cg_tag const & tag,
+ viennacl::linalg::no_precond,
+ bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+ void *monitor_data = NULL)
+ {
+ return pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+ }
+
+
+ /** @brief Overload for the pipelined CG implementation for the ViennaCL sparse matrix types */
+ template<typename NumericT>
+ viennacl::vector<NumericT> solve_impl(viennacl::coordinate_matrix<NumericT> const & A,
+ viennacl::vector<NumericT> const & rhs,
+ cg_tag const & tag,
+ viennacl::linalg::no_precond,
+ bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+ void *monitor_data = NULL)
+ {
+ return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+ }
+
+
+
+ /** @brief Overload for the pipelined CG implementation for the ViennaCL sparse matrix types */
+ template<typename NumericT>
+ viennacl::vector<NumericT> solve_impl(viennacl::ell_matrix<NumericT> const & A,
+ viennacl::vector<NumericT> const & rhs,
+ cg_tag const & tag,
+ viennacl::linalg::no_precond,
+ bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+ void *monitor_data = NULL)
+ {
+ return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+ }
+
+
+
+ /** @brief Overload for the pipelined CG implementation for the ViennaCL sparse matrix types */
+ template<typename NumericT>
+ viennacl::vector<NumericT> solve_impl(viennacl::sliced_ell_matrix<NumericT> const & A,
+ viennacl::vector<NumericT> const & rhs,
+ cg_tag const & tag,
+ viennacl::linalg::no_precond,
+ bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+ void *monitor_data = NULL)
+ {
+ return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+ }
+
+
+ /** @brief Overload for the pipelined CG implementation for the ViennaCL sparse matrix types */
+ template<typename NumericT>
+ viennacl::vector<NumericT> solve_impl(viennacl::hyb_matrix<NumericT> const & A,
+ viennacl::vector<NumericT> const & rhs,
+ cg_tag const & tag,
+ viennacl::linalg::no_precond,
+ bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+ void *monitor_data = NULL)
+ {
+ return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+ }
+
+
+ template<typename MatrixT, typename VectorT, typename PreconditionerT>
+ VectorT solve_impl(MatrixT const & matrix,
+ VectorT const & rhs,
+ cg_tag const & tag,
+ PreconditionerT const & precond,
+ bool (*monitor)(VectorT const &, typename viennacl::result_of::cpu_value_type<typename viennacl::result_of::value_type<VectorT>::type>::type, void*) = NULL,
+ void *monitor_data = NULL)
+ {
+ typedef typename viennacl::result_of::value_type<VectorT>::type NumericType;
+ typedef typename viennacl::result_of::cpu_value_type<NumericType>::type CPU_NumericType;
+
+ VectorT result = rhs;
+ viennacl::traits::clear(result);
+
+ VectorT residual = rhs;
+ VectorT tmp = rhs;
+ detail::z_handler<VectorT, PreconditionerT> zhandler(residual);
+ VectorT & z = zhandler.get();
+
+ precond.apply(z);
+ VectorT p = z;
+
+ CPU_NumericType ip_rr = viennacl::linalg::inner_prod(residual, z);
+ CPU_NumericType alpha;
+ CPU_NumericType new_ip_rr = 0;
+ CPU_NumericType beta;
+ CPU_NumericType norm_rhs_squared = ip_rr;
+ CPU_NumericType new_ipp_rr_over_norm_rhs;
+
+ if (norm_rhs_squared <= tag.abs_tolerance() * tag.abs_tolerance()) //solution is zero if RHS norm (squared) is zero
+ return result;
+
+ for (unsigned int i = 0; i < tag.max_iterations(); ++i)
+ {
+ tag.iters(i+1);
+ tmp = viennacl::linalg::prod(matrix, p);
+
+ alpha = ip_rr / viennacl::linalg::inner_prod(tmp, p);
+
+ result += alpha * p;
+ residual -= alpha * tmp;
+ z = residual;
+ precond.apply(z);
+
+ if (static_cast<VectorT*>(&residual)==static_cast<VectorT*>(&z))
+ new_ip_rr = std::pow(viennacl::linalg::norm_2(residual),2);
+ else
+ new_ip_rr = viennacl::linalg::inner_prod(residual, z);
+
+ new_ipp_rr_over_norm_rhs = new_ip_rr / norm_rhs_squared;
+ if (monitor && monitor(result, std::sqrt(std::fabs(new_ipp_rr_over_norm_rhs)), monitor_data))
+ break;
+ if (std::fabs(new_ipp_rr_over_norm_rhs) < tag.tolerance() * tag.tolerance() || std::fabs(new_ip_rr) < tag.abs_tolerance() * tag.abs_tolerance()) //squared norms involved here
+ break;
+
+ beta = new_ip_rr / ip_rr;
+ ip_rr = new_ip_rr;
+
+ p = z + beta*p;
+ }
+
+ //store last error estimate:
+ tag.error(std::sqrt(std::fabs(new_ip_rr / norm_rhs_squared)));
+
+ return result;
+ }
+
+}
+
+
+
+/** @brief Implementation of the preconditioned conjugate gradient solver, generic implementation for non-ViennaCL types.
+*
+* Following Algorithm 9.1 in "Iterative Methods for Sparse Linear Systems" by Y. Saad
+*
+* @param matrix The system matrix
+* @param rhs The load vector
+* @param tag Solver configuration tag
+* @param precond A preconditioner. Precondition operation is done via member function apply()
+* @return The result vector
+*/
+template<typename MatrixT, typename VectorT, typename PreconditionerT>
+VectorT solve(MatrixT const & matrix, VectorT const & rhs, cg_tag const & tag, PreconditionerT const & precond)
+{
+ return detail::solve_impl(matrix, rhs, tag, precond);
+}
+
+/** @brief Convenience overload for calling the CG solver using types from the C++ STL.
+ *
+ * A std::vector<std::map<T, U> > matrix is convenient for e.g. finite element assembly.
+ * It is not the fastest option for setting up a system, but often it is fast enough - particularly for just trying things out.
+ */
+template<typename IndexT, typename NumericT, typename PreconditionerT>
+std::vector<NumericT> solve(std::vector< std::map<IndexT, NumericT> > const & A, std::vector<NumericT> const & rhs, cg_tag const & tag, PreconditionerT const & precond)
+{
+ viennacl::compressed_matrix<NumericT> vcl_A;
+ viennacl::copy(A, vcl_A);
+
+ viennacl::vector<NumericT> vcl_rhs(rhs.size());
+ viennacl::copy(rhs, vcl_rhs);
+
+ viennacl::vector<NumericT> vcl_result = solve(vcl_A, vcl_rhs, tag, precond);
+
+ std::vector<NumericT> result(vcl_result.size());
+ viennacl::copy(vcl_result, result);
+ return result;
+}
+
+/** @brief Entry point for the unpreconditioned CG method.
+ *
+ * @param matrix The system matrix
+ * @param rhs Right hand side vector (load vector)
+ * @param tag A CG tag providing relative tolerances, etc.
+ */
+template<typename MatrixT, typename VectorT>
+VectorT solve(MatrixT const & matrix, VectorT const & rhs, cg_tag const & tag)
+{
+ return solve(matrix, rhs, tag, viennacl::linalg::no_precond());
+}
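
A minimal usage sketch of the unpreconditioned entry point above; the tolerance and iteration limit are illustrative:

    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/cg.hpp"

    // Unpreconditioned CG with a custom tolerance; the tag's mutable members
    // record the iteration count and the final relative error estimate.
    viennacl::vector<double> cg_example(viennacl::compressed_matrix<double> const & A,
                                        viennacl::vector<double> const & b)
    {
      viennacl::linalg::cg_tag tag(1e-10, 500);
      viennacl::vector<double> x = viennacl::linalg::solve(A, b, tag);
      // tag.iters() and tag.error() are filled in by the solver run.
      return x;
    }
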
+
+
+
+template<typename VectorT>
+class cg_solver
+{
+public:
+ typedef typename viennacl::result_of::cpu_value_type<VectorT>::type numeric_type;
+
+ cg_solver(cg_tag const & tag) : tag_(tag), monitor_callback_(NULL), user_data_(NULL) {}
+
+ template<typename MatrixT, typename PreconditionerT>
+ VectorT operator()(MatrixT const & A, VectorT const & b, PreconditionerT const & precond) const
+ {
+ if (viennacl::traits::size(init_guess_) > 0) // take initial guess into account
+ {
+ VectorT mod_rhs = viennacl::linalg::prod(A, init_guess_);
+ mod_rhs = b - mod_rhs;
+ VectorT y = detail::solve_impl(A, mod_rhs, tag_, precond, monitor_callback_, user_data_);
+ return init_guess_ + y;
+ }
+ return detail::solve_impl(A, b, tag_, precond, monitor_callback_, user_data_);
+ }
+
+
+ template<typename MatrixT>
+ VectorT operator()(MatrixT const & A, VectorT const & b) const
+ {
+ return operator()(A, b, viennacl::linalg::no_precond());
+ }
+
+ /** @brief Specifies an initial guess for the iterative solver.
+ *
+ * An iterative solver for Ax = b with initial guess x_0 is equivalent to an iterative solver for Ay = b' := b - Ax_0, where x = x_0 + y.
+ */
+ void set_initial_guess(VectorT const & x) { init_guess_ = x; }
+
+ /** @brief Sets a monitor function pointer to be called in each iteration. Set to NULL to run without monitor.
+ *
+ * The monitor function is called with the current guess for the result as first argument and the current relative residual estimate as second argument.
+ * The third argument is a pointer to user-defined data, through which additional information can be passed.
+ * This pointer needs to be set with set_monitor_data. If not set, NULL is passed.
+ * If the monitor function returns true, the solver terminates (regardless of convergence or divergence).
+ */
+ void set_monitor(bool (*monitor_fun)(VectorT const &, numeric_type, void *), void *user_data)
+ {
+ monitor_callback_ = monitor_fun;
+ user_data_ = user_data;
+ }
+
+ /** @brief Returns the solver tag containing basic configuration such as tolerances, etc. */
+ cg_tag const & tag() const { return tag_; }
+
+private:
+ cg_tag tag_;
+ VectorT init_guess_;
+ bool (*monitor_callback_)(VectorT const &, numeric_type, void *);
+ void *user_data_;
+};
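
A minimal sketch of a monitor callback matching the signature expected by set_monitor() above; the threshold passed via the user-data pointer is illustrative:

    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/cg.hpp"

    // Stops the solver once the relative residual estimate drops below a
    // caller-supplied threshold (returning true terminates the iteration).
    bool my_monitor(viennacl::vector<double> const & /*current_guess*/,
                    double relative_residual,
                    void * user_data)
    {
      double threshold = *static_cast<double *>(user_data);
      return relative_residual < threshold;
    }

    // Usage sketch: A and b are assumed to be set up elsewhere.
    viennacl::vector<double> monitored_cg(viennacl::compressed_matrix<double> const & A,
                                          viennacl::vector<double> const & b)
    {
      double threshold = 1e-6;
      viennacl::linalg::cg_solver<viennacl::vector<double> > solver((viennacl::linalg::cg_tag()));
      solver.set_monitor(my_monitor, &threshold);
      return solver(A, b);
    }
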
+
+
+}
+}
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/circulant_matrix_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/circulant_matrix_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/circulant_matrix_operations.hpp
new file mode 100644
index 0000000..5325b7b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/circulant_matrix_operations.hpp
@@ -0,0 +1,75 @@
+#ifndef VIENNACL_LINALG_CIRCULANT_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CIRCULANT_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/circulant_matrix_operations.hpp
+ @brief Implementations of operations using circulant_matrix. Experimental.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/ocl/backend.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/fft.hpp"
+//#include "viennacl/linalg/kernels/coordinate_matrix_kernels.h"
+
+namespace viennacl
+{
+namespace linalg
+{
+
+// A * x
+
+/** @brief Carries out matrix-vector multiplication with a circulant_matrix
+*
+* Implementation of the convenience expression result = prod(mat, vec);
+*
+* @param mat The matrix
+* @param vec The vector
+* @param result The result vector
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::circulant_matrix<NumericT, AlignmentV> const & mat,
+ viennacl::vector_base<NumericT> const & vec,
+ viennacl::vector_base<NumericT> & result)
+{
+ assert(mat.size1() == result.size() && bool("Dimension mismatch"));
+ assert(mat.size2() == vec.size() && bool("Dimension mismatch"));
+ //result.clear();
+
+ //std::cout << "prod(circulant_matrix" << ALIGNMENT << ", vector) called with internal_nnz=" << mat.internal_nnz() << std::endl;
+
+ viennacl::vector<NumericT> circ(mat.elements().size() * 2);
+ viennacl::linalg::real_to_complex(mat.elements(), circ, mat.elements().size());
+
+ viennacl::vector<NumericT> tmp(vec.size() * 2);
+ viennacl::vector<NumericT> tmp2(vec.size() * 2);
+
+ viennacl::linalg::real_to_complex(vec, tmp, vec.size());
+ viennacl::linalg::convolve(circ, tmp, tmp2);
+ viennacl::linalg::complex_to_real(tmp2, result, vec.size());
+
+}
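
A minimal sketch of the same real-to-complex / convolve / complex-to-real pipeline used in prod_impl above, applied to two plain vectors of equal size; the FFT backend may impose size restrictions (an assumption), and the output vector w is assumed to be sized like u:

    #include "viennacl/vector.hpp"
    #include "viennacl/fft.hpp"

    // Circular convolution of two real vectors via the complex-interleaved
    // helpers used above (sizes of u, v and w are assumed equal).
    void circular_convolution_sketch(viennacl::vector<float> const & u,
                                     viennacl::vector<float> const & v,
                                     viennacl::vector<float> & w)
    {
      viennacl::vector<float> u_c(u.size() * 2), v_c(v.size() * 2), w_c(u.size() * 2);
      viennacl::linalg::real_to_complex(u, u_c, u.size());
      viennacl::linalg::real_to_complex(v, v_c, v.size());
      viennacl::linalg::convolve(u_c, v_c, w_c);
      viennacl::linalg::complex_to_real(w_c, w, u.size());
    }
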
+
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
[42/51] [partial] mahout git commit: (nojira) add native-viennaCL
module to codebase. closes apache/mahout#241
Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/utils.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/utils.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/utils.hpp
new file mode 100644
index 0000000..67d089a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/utils.hpp
@@ -0,0 +1,105 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_TEMPLATES_REDUCTION_UTILS_HPP
+#define VIENNACL_DEVICE_SPECIFIC_TEMPLATES_REDUCTION_UTILS_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/device_specific/templates/utils.hpp
+ *
+ * A collection of utilities for the device specific execution templates.
+*/
+
+#include <vector>
+
+#include "viennacl/scheduler/forwards.h"
+
+#include "viennacl/device_specific/tree_parsing.hpp"
+#include "viennacl/device_specific/utils.hpp"
+
+
+namespace viennacl
+{
+namespace device_specific
+{
+
+inline void compute_reduction(utils::kernel_generation_stream & os, std::string acc, std::string cur, scheduler::op_element const & op)
+{
+ if (utils::elementwise_function(op))
+ os << acc << "=" << tree_parsing::evaluate(op.type) << "(" << acc << "," << cur << ");" << std::endl;
+ else
+ os << acc << "= (" << acc << ")" << tree_parsing::evaluate(op.type) << "(" << cur << ");" << std::endl;
+}
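
For illustration, assuming a plain addition (OPERATION_BINARY_ADD_TYPE) is classified by utils::elementwise_function() as an infix operator rather than a function (an assumption; the evaluate() listing further below maps it to "+"), calling the helper above with acc = "sum" and cur = "tmp" would emit the OpenCL line:

    sum= (sum)+(tmp);
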
+
+inline void compute_index_reduction(utils::kernel_generation_stream & os, std::string acc, std::string cur, std::string const & acc_value, std::string const & cur_value, scheduler::op_element const & op)
+{
+ // os << acc << " = " << cur_value << ">" << acc_value << "?" << cur << ":" << acc << ";" << std::endl;
+ os << acc << "= select(" << acc << "," << cur << "," << cur_value << ">" << acc_value << ");" << std::endl;
+ os << acc_value << "=";
+ if (op.type==scheduler::OPERATION_BINARY_ELEMENT_ARGFMAX_TYPE) os << "fmax";
+ if (op.type==scheduler::OPERATION_BINARY_ELEMENT_ARGMAX_TYPE) os << "max";
+ if (op.type==scheduler::OPERATION_BINARY_ELEMENT_ARGFMIN_TYPE) os << "fmin";
+ if (op.type==scheduler::OPERATION_BINARY_ELEMENT_ARGMIN_TYPE) os << "min";
+ os << "(" << acc_value << "," << cur_value << ");"<< std::endl;
+}
+
+inline void process_all(std::string const & type_key, std::string const & str,
+ utils::kernel_generation_stream & stream, std::vector<mapping_type> const & mappings)
+{
+ for (std::vector<mapping_type>::const_iterator mit = mappings.begin(); mit != mappings.end(); ++mit)
+ for (mapping_type::const_iterator mmit = mit->begin(); mmit != mit->end(); ++mmit)
+ if (mmit->second->type_key()==type_key)
+ stream << mmit->second->process(str) << std::endl;
+}
+
+
+inline void process_all_at(std::string const & type_key, std::string const & str,
+ utils::kernel_generation_stream & stream, std::vector<mapping_type> const & mappings,
+ vcl_size_t root_idx, leaf_t leaf)
+{
+ for (std::vector<mapping_type>::const_iterator mit = mappings.begin(); mit != mappings.end(); ++mit)
+ {
+ mapped_object * obj = at(*mit, mapping_key(root_idx, leaf)).get();
+ if (obj->type_key()==type_key)
+ stream << obj->process(str) << std::endl;
+ }
+}
+
+inline std::string neutral_element(scheduler::op_element const & op)
+{
+ switch (op.type)
+ {
+ case scheduler::OPERATION_BINARY_ADD_TYPE : return "0";
+ case scheduler::OPERATION_BINARY_MULT_TYPE : return "1";
+ case scheduler::OPERATION_BINARY_DIV_TYPE : return "1";
+ case scheduler::OPERATION_BINARY_ELEMENT_FMAX_TYPE : return "-INFINITY";
+ case scheduler::OPERATION_BINARY_ELEMENT_ARGFMAX_TYPE : return "-INFINITY";
+ case scheduler::OPERATION_BINARY_ELEMENT_MAX_TYPE : return "-INFINITY";
+ case scheduler::OPERATION_BINARY_ELEMENT_ARGMAX_TYPE : return "-INFINITY";
+ case scheduler::OPERATION_BINARY_ELEMENT_FMIN_TYPE : return "INFINITY";
+ case scheduler::OPERATION_BINARY_ELEMENT_ARGFMIN_TYPE : return "INFINITY";
+ case scheduler::OPERATION_BINARY_ELEMENT_MIN_TYPE : return "INFINITY";
+ case scheduler::OPERATION_BINARY_ELEMENT_ARGMIN_TYPE : return "INFINITY";
+
+ default: throw generator_not_supported_exception("Unsupported reduction operator : no neutral element known");
+ }
+}
+
+}
+}
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/tree_parsing.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/tree_parsing.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/tree_parsing.hpp
new file mode 100644
index 0000000..f9cc8a8
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/tree_parsing.hpp
@@ -0,0 +1,512 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_TREE_PARSING_HPP
+#define VIENNACL_DEVICE_SPECIFIC_TREE_PARSING_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/device_specific/tree_parsing.hpp
+ @brief Code for parsing the expression trees.
+*/
+
+#include <set>
+
+#include "viennacl/forwards.h"
+
+#include "viennacl/scheduler/forwards.h"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/utils.hpp"
+#include "viennacl/device_specific/mapped_objects.hpp"
+
+namespace viennacl
+{
+namespace device_specific
+{
+namespace tree_parsing
+{
+
+/** @brief base functor class for traversing a statement */
+class traversal_functor
+{
+public:
+ void call_before_expansion(scheduler::statement const &, vcl_size_t) const { }
+ void call_after_expansion(scheduler::statement const &, vcl_size_t) const { }
+};
+
+/** @brief Recursively execute a functor on a statement */
+template<class Fun>
+inline void traverse(scheduler::statement const & statement, vcl_size_t root_idx, Fun const & fun, bool inspect)
+{
+ scheduler::statement_node const & root_node = statement.array()[root_idx];
+ bool recurse = utils::node_leaf(root_node.op)?inspect:true;
+
+ fun.call_before_expansion(statement, root_idx);
+
+ //Lhs:
+ if (recurse)
+ {
+ if (root_node.lhs.type_family==scheduler::COMPOSITE_OPERATION_FAMILY)
+ traverse(statement, root_node.lhs.node_index, fun, inspect);
+ if (root_node.lhs.type_family != scheduler::INVALID_TYPE_FAMILY)
+ fun(statement, root_idx, LHS_NODE_TYPE);
+ }
+
+ //Self:
+ fun(statement, root_idx, PARENT_NODE_TYPE);
+
+ //Rhs:
+ if (recurse && root_node.rhs.type_family!=scheduler::INVALID_TYPE_FAMILY)
+ {
+ if (root_node.rhs.type_family==scheduler::COMPOSITE_OPERATION_FAMILY)
+ traverse(statement, root_node.rhs.node_index, fun, inspect);
+ if (root_node.rhs.type_family != scheduler::INVALID_TYPE_FAMILY)
+ fun(statement, root_idx, RHS_NODE_TYPE);
+ }
+
+ fun.call_after_expansion(statement, root_idx);
+}
+
+class filter : public traversal_functor
+{
+public:
+ typedef bool (*pred_t)(scheduler::statement_node const & node);
+
+ filter(pred_t pred, std::vector<vcl_size_t> & out) : pred_(pred), out_(out){ }
+
+ void operator()(scheduler::statement const & statement, vcl_size_t root_idx, leaf_t) const
+ {
+ scheduler::statement_node const * root_node = &statement.array()[root_idx];
+ if (pred_(*root_node))
+ out_.push_back(root_idx);
+ }
+private:
+ pred_t pred_;
+ std::vector<vcl_size_t> & out_;
+};
+
+class filter_elements : public traversal_functor
+{
+public:
+ filter_elements(scheduler::statement_node_subtype subtype, std::vector<scheduler::lhs_rhs_element> & out) : subtype_(subtype), out_(out) { }
+
+ void operator()(scheduler::statement const & statement, vcl_size_t root_idx, leaf_t) const
+ {
+ scheduler::statement_node const * root_node = &statement.array()[root_idx];
+ if (root_node->lhs.subtype==subtype_)
+ out_.push_back(root_node->lhs);
+ if (root_node->rhs.subtype==subtype_)
+ out_.push_back(root_node->rhs);
+ }
+private:
+ scheduler::statement_node_subtype subtype_;
+ std::vector<scheduler::lhs_rhs_element> & out_;
+};
+
+/** @brief generate a string from an operation_node_type */
+inline const char * evaluate(scheduler::operation_node_type type)
+{
+ using namespace scheduler;
+ // unary expression
+ switch (type)
+ {
+ //Function
+ case OPERATION_UNARY_ABS_TYPE : return "abs";
+ case OPERATION_UNARY_ACOS_TYPE : return "acos";
+ case OPERATION_UNARY_ASIN_TYPE : return "asin";
+ case OPERATION_UNARY_ATAN_TYPE : return "atan";
+ case OPERATION_UNARY_CEIL_TYPE : return "ceil";
+ case OPERATION_UNARY_COS_TYPE : return "cos";
+ case OPERATION_UNARY_COSH_TYPE : return "cosh";
+ case OPERATION_UNARY_EXP_TYPE : return "exp";
+ case OPERATION_UNARY_FABS_TYPE : return "fabs";
+ case OPERATION_UNARY_FLOOR_TYPE : return "floor";
+ case OPERATION_UNARY_LOG_TYPE : return "log";
+ case OPERATION_UNARY_LOG10_TYPE : return "log10";
+ case OPERATION_UNARY_SIN_TYPE : return "sin";
+ case OPERATION_UNARY_SINH_TYPE : return "sinh";
+ case OPERATION_UNARY_SQRT_TYPE : return "sqrt";
+ case OPERATION_UNARY_TAN_TYPE : return "tan";
+ case OPERATION_UNARY_TANH_TYPE : return "tanh";
+
+ case OPERATION_UNARY_CAST_CHAR_TYPE : return "(char)";
+ case OPERATION_UNARY_CAST_UCHAR_TYPE : return "(uchar)";
+ case OPERATION_UNARY_CAST_SHORT_TYPE : return "(short)";
+ case OPERATION_UNARY_CAST_USHORT_TYPE : return "(ushort)";
+ case OPERATION_UNARY_CAST_INT_TYPE : return "(int)";
+ case OPERATION_UNARY_CAST_UINT_TYPE : return "(uint)";
+ case OPERATION_UNARY_CAST_LONG_TYPE : return "(long)";
+ case OPERATION_UNARY_CAST_ULONG_TYPE : return "(ulong)";
+ case OPERATION_UNARY_CAST_HALF_TYPE : return "(half)";
+ case OPERATION_UNARY_CAST_FLOAT_TYPE : return "(float)";
+ case OPERATION_UNARY_CAST_DOUBLE_TYPE : return "(double)";
+
+ case OPERATION_BINARY_ELEMENT_ARGFMAX_TYPE : return "argfmax";
+ case OPERATION_BINARY_ELEMENT_ARGMAX_TYPE : return "argmax";
+ case OPERATION_BINARY_ELEMENT_ARGFMIN_TYPE : return "argfmin";
+ case OPERATION_BINARY_ELEMENT_ARGMIN_TYPE : return "argmin";
+ case OPERATION_BINARY_ELEMENT_POW_TYPE : return "pow";
+
+ //Arithmetic
+ case OPERATION_UNARY_MINUS_TYPE : return "-";
+ case OPERATION_BINARY_ASSIGN_TYPE : return "=";
+ case OPERATION_BINARY_INPLACE_ADD_TYPE : return "+=";
+ case OPERATION_BINARY_INPLACE_SUB_TYPE : return "-=";
+ case OPERATION_BINARY_ADD_TYPE : return "+";
+ case OPERATION_BINARY_SUB_TYPE : return "-";
+ case OPERATION_BINARY_MULT_TYPE : return "*";
+ case OPERATION_BINARY_ELEMENT_PROD_TYPE : return "*";
+ case OPERATION_BINARY_DIV_TYPE : return "/";
+ case OPERATION_BINARY_ELEMENT_DIV_TYPE : return "/";
+ case OPERATION_BINARY_ACCESS_TYPE : return "[]";
+
+ //Relational
+ case OPERATION_BINARY_ELEMENT_EQ_TYPE : return "isequal";
+ case OPERATION_BINARY_ELEMENT_NEQ_TYPE : return "isnotequal";
+ case OPERATION_BINARY_ELEMENT_GREATER_TYPE : return "isgreater";
+ case OPERATION_BINARY_ELEMENT_GEQ_TYPE : return "isgreaterequal";
+ case OPERATION_BINARY_ELEMENT_LESS_TYPE : return "isless";
+ case OPERATION_BINARY_ELEMENT_LEQ_TYPE : return "islessequal";
+
+ case OPERATION_BINARY_ELEMENT_FMAX_TYPE : return "fmax";
+ case OPERATION_BINARY_ELEMENT_FMIN_TYPE : return "fmin";
+ case OPERATION_BINARY_ELEMENT_MAX_TYPE : return "max";
+ case OPERATION_BINARY_ELEMENT_MIN_TYPE : return "min";
+ //Unary
+ case OPERATION_UNARY_TRANS_TYPE : return "trans";
+
+ //Binary
+ case OPERATION_BINARY_INNER_PROD_TYPE : return "iprod";
+ case OPERATION_BINARY_MAT_MAT_PROD_TYPE : return "mmprod";
+ case OPERATION_BINARY_MAT_VEC_PROD_TYPE : return "mvprod";
+ case OPERATION_BINARY_VECTOR_DIAG_TYPE : return "vdiag";
+ case OPERATION_BINARY_MATRIX_DIAG_TYPE : return "mdiag";
+ case OPERATION_BINARY_MATRIX_ROW_TYPE : return "row";
+ case OPERATION_BINARY_MATRIX_COLUMN_TYPE : return "col";
+
+ default : throw generator_not_supported_exception("Unsupported operator");
+ }
+}
+
+inline const char * operator_string(scheduler::operation_node_type type)
+{
+ using namespace scheduler;
+ switch (type)
+ {
+ case OPERATION_UNARY_CAST_CHAR_TYPE : return "char";
+ case OPERATION_UNARY_CAST_UCHAR_TYPE : return "uchar";
+ case OPERATION_UNARY_CAST_SHORT_TYPE : return "short";
+ case OPERATION_UNARY_CAST_USHORT_TYPE : return "ushort";
+ case OPERATION_UNARY_CAST_INT_TYPE : return "int";
+ case OPERATION_UNARY_CAST_UINT_TYPE : return "uint";
+ case OPERATION_UNARY_CAST_LONG_TYPE : return "long";
+ case OPERATION_UNARY_CAST_ULONG_TYPE : return "ulong";
+ case OPERATION_UNARY_CAST_HALF_TYPE : return "half";
+ case OPERATION_UNARY_CAST_FLOAT_TYPE : return "float";
+ case OPERATION_UNARY_CAST_DOUBLE_TYPE : return "double";
+
+ case OPERATION_UNARY_MINUS_TYPE : return "umin";
+ case OPERATION_BINARY_ASSIGN_TYPE : return "assign";
+ case OPERATION_BINARY_INPLACE_ADD_TYPE : return "ip_add";
+ case OPERATION_BINARY_INPLACE_SUB_TYPE : return "ip_sub";
+ case OPERATION_BINARY_ADD_TYPE : return "add";
+ case OPERATION_BINARY_SUB_TYPE : return "sub";
+ case OPERATION_BINARY_MULT_TYPE : return "mult";
+ case OPERATION_BINARY_ELEMENT_PROD_TYPE : return "eprod";
+ case OPERATION_BINARY_DIV_TYPE : return "div";
+ case OPERATION_BINARY_ELEMENT_DIV_TYPE : return "ediv";
+ case OPERATION_BINARY_ACCESS_TYPE : return "acc";
+ default : return evaluate(type);
+ }
+}
+
+/** @brief functor for generating the expression string from a statement */
+class evaluate_expression_traversal: public tree_parsing::traversal_functor
+{
+private:
+ std::map<std::string, std::string> const & accessors_;
+ std::string & str_;
+ mapping_type const & mapping_;
+
+public:
+ evaluate_expression_traversal(std::map<std::string, std::string> const & accessors, std::string & str, mapping_type const & mapping) : accessors_(accessors), str_(str), mapping_(mapping){ }
+
+ void call_before_expansion(scheduler::statement const & statement, vcl_size_t root_idx) const
+ {
+ scheduler::statement_node const & root_node = statement.array()[root_idx];
+ if ((root_node.op.type_family==scheduler::OPERATION_UNARY_TYPE_FAMILY || utils::elementwise_function(root_node.op))
+ && !utils::node_leaf(root_node.op))
+ str_+=tree_parsing::evaluate(root_node.op.type);
+ str_+="(";
+
+ }
+
+ void call_after_expansion(scheduler::statement const & /*statement*/, vcl_size_t /*root_idx*/) const
+ {
+ str_+=")";
+ }
+
+ void operator()(scheduler::statement const & statement, vcl_size_t root_idx, leaf_t leaf) const
+ {
+ scheduler::statement_node const & root_node = statement.array()[root_idx];
+ mapping_type::key_type key = std::make_pair(root_idx, leaf);
+ if (leaf==PARENT_NODE_TYPE)
+ {
+ if (utils::node_leaf(root_node.op))
+ str_ += at(mapping_, key)->evaluate(accessors_);
+ else if (utils::elementwise_operator(root_node.op))
+ str_ += tree_parsing::evaluate(root_node.op.type);
+ else if (root_node.op.type_family!=scheduler::OPERATION_UNARY_TYPE_FAMILY && utils::elementwise_function(root_node.op))
+ str_ += ",";
+ }
+ else
+ {
+ if (leaf==LHS_NODE_TYPE)
+ {
+ if (root_node.lhs.type_family!=scheduler::COMPOSITE_OPERATION_FAMILY)
+ str_ += at(mapping_, key)->evaluate(accessors_);
+ }
+
+ if (leaf==RHS_NODE_TYPE)
+ {
+ if (root_node.rhs.type_family!=scheduler::COMPOSITE_OPERATION_FAMILY)
+ str_ += at(mapping_, key)->evaluate(accessors_);
+ }
+ }
+ }
+};
+
+inline std::string evaluate(leaf_t leaf, std::map<std::string, std::string> const & accessors,
+ scheduler::statement const & statement, vcl_size_t root_idx, mapping_type const & mapping)
+{
+ std::string res;
+ evaluate_expression_traversal traversal_functor(accessors, res, mapping);
+ scheduler::statement_node const & root_node = statement.array()[root_idx];
+
+ if (leaf==RHS_NODE_TYPE)
+ {
+ if (root_node.rhs.type_family==scheduler::COMPOSITE_OPERATION_FAMILY)
+ tree_parsing::traverse(statement, root_node.rhs.node_index, traversal_functor, false);
+ else
+ traversal_functor(statement, root_idx, leaf);
+ }
+ else if (leaf==LHS_NODE_TYPE)
+ {
+ if (root_node.lhs.type_family==scheduler::COMPOSITE_OPERATION_FAMILY)
+ tree_parsing::traverse(statement, root_node.lhs.node_index, traversal_functor, false);
+ else
+ traversal_functor(statement, root_idx, leaf);
+ }
+ else
+ tree_parsing::traverse(statement, root_idx, traversal_functor, false);
+
+ return res;
+}
+
+inline void evaluate(utils::kernel_generation_stream & stream, leaf_t leaf, std::map<std::string, std::string> const & accessors,
+ statements_container const & statements, std::vector<mapping_type> const & mappings)
+{
+ statements_container::data_type::const_iterator sit;
+ std::vector<mapping_type>::const_iterator mit;
+
+ for (mit = mappings.begin(), sit = statements.data().begin(); sit != statements.data().end(); ++mit, ++sit)
+ stream << evaluate(leaf, accessors, *sit, sit->root(), *mit) << ";" << std::endl;
+}
+
+
+/** @brief functor for fetching or writing-back the elements in a statement */
+class process_traversal : public tree_parsing::traversal_functor
+{
+public:
+ process_traversal(std::string const & type_key, std::string const & to_process, utils::kernel_generation_stream & stream,
+ mapping_type const & mapping, std::set<std::string> & already_processed) : type_key_(type_key), to_process_(to_process), stream_(stream), mapping_(mapping), already_processed_(already_processed){ }
+
+ void operator()(scheduler::statement const & /*statement*/, vcl_size_t root_idx, leaf_t leaf) const
+ {
+ mapping_type::const_iterator it = mapping_.find(std::make_pair(root_idx, leaf));
+ if (it!=mapping_.end())
+ {
+ mapped_object * obj = it->second.get();
+ if (obj->type_key()==type_key_)
+ {
+ if (already_processed_.insert(obj->process("#name")).second)
+ stream_ << obj->process(to_process_) << std::endl;
+ }
+ }
+ }
+
+private:
+ std::string const & type_key_;
+ std::string const & to_process_;
+ utils::kernel_generation_stream & stream_;
+ mapping_type const & mapping_;
+ std::set<std::string> & already_processed_;
+};
+
+inline void process(utils::kernel_generation_stream & stream, leaf_t leaf, std::string const & type_key, std::string const & to_process,
+ scheduler::statement const & statement, vcl_size_t root_idx, mapping_type const & mapping, std::set<std::string> & already_processed)
+{
+ process_traversal traversal_functor(type_key, to_process, stream, mapping, already_processed);
+ scheduler::statement_node const & root_node = statement.array()[root_idx];
+
+ if (leaf==RHS_NODE_TYPE)
+ {
+ if (root_node.rhs.type_family==scheduler::COMPOSITE_OPERATION_FAMILY)
+ tree_parsing::traverse(statement, root_node.rhs.node_index, traversal_functor, true);
+ else
+ traversal_functor(statement, root_idx, leaf);
+ }
+ else if (leaf==LHS_NODE_TYPE)
+ {
+ if (root_node.lhs.type_family==scheduler::COMPOSITE_OPERATION_FAMILY)
+ tree_parsing::traverse(statement, root_node.lhs.node_index, traversal_functor, true);
+ else
+ traversal_functor(statement, root_idx, leaf);
+ }
+ else
+ {
+ tree_parsing::traverse(statement, root_idx, traversal_functor, true);
+ }
+}
+
+inline void process(utils::kernel_generation_stream & stream, leaf_t leaf, std::string const & type_key, std::string const & to_process,
+ statements_container const & statements, std::vector<mapping_type> const & mappings)
+{
+ statements_container::data_type::const_iterator sit;
+ std::vector<mapping_type>::const_iterator mit;
+ std::set<std::string> already_processed;
+
+ for (mit = mappings.begin(), sit = statements.data().begin(); sit != statements.data().end(); ++mit, ++sit)
+ process(stream, leaf, type_key, to_process, *sit, sit->root(), *mit, already_processed);
+}
+
+
+class statement_representation_functor : public traversal_functor{
+private:
+ static void append_id(char * & ptr, unsigned int val)
+ {
+ if (val==0)
+ *ptr++='0';
+ else
+ while (val>0)
+ {
+ *ptr++= (char)('0' + (val % 10));
+ val /= 10;
+ }
+ }
+
+public:
+ typedef void result_type;
+
+ statement_representation_functor(symbolic_binder & binder, char *& ptr) : binder_(binder), ptr_(ptr){ }
+
+ template<class NumericT>
+ inline result_type operator()(NumericT const & /*scal*/) const
+ {
+ *ptr_++='h'; //host
+ *ptr_++='s'; //scalar
+ *ptr_++=utils::first_letter_of_type<NumericT>::value();
+ }
+
+ /** @brief Scalar mapping */
+ template<class NumericT>
+ inline result_type operator()(scalar<NumericT> const & scal) const
+ {
+ *ptr_++='s'; //scalar
+ *ptr_++=utils::first_letter_of_type<NumericT>::value();
+ append_id(ptr_, binder_.get(&traits::handle(scal)));
+ }
+
+ /** @brief Vector mapping */
+ template<class NumericT>
+ inline result_type operator()(vector_base<NumericT> const & vec) const
+ {
+ *ptr_++='v'; //vector
+ *ptr_++=utils::first_letter_of_type<NumericT>::value();
+ append_id(ptr_, binder_.get(&traits::handle(vec)));
+ }
+
+ /** @brief Implicit vector mapping */
+ template<class NumericT>
+ inline result_type operator()(implicit_vector_base<NumericT> const & /*vec*/) const
+ {
+ *ptr_++='i'; //implicit
+ *ptr_++='v'; //vector
+ *ptr_++=utils::first_letter_of_type<NumericT>::value();
+ }
+
+ /** @brief Matrix mapping */
+ template<class NumericT>
+ inline result_type operator()(matrix_base<NumericT> const & mat) const
+ {
+ *ptr_++='m'; //Matrix
+ *ptr_++=mat.row_major()?'r':'c';
+ *ptr_++=utils::first_letter_of_type<NumericT>::value();
+ append_id(ptr_, binder_.get(&traits::handle(mat)));
+ }
+
+ /** @brief Implicit matrix mapping */
+ template<class NumericT>
+ inline result_type operator()(implicit_matrix_base<NumericT> const & /*mat*/) const
+ {
+ *ptr_++='i'; //implicit
+ *ptr_++='m'; //matrix
+ *ptr_++=utils::first_letter_of_type<NumericT>::value();
+ }
+
+ static inline void append(char*& p, const char * str)
+ {
+ vcl_size_t n = std::strlen(str);
+ std::memcpy(p, str, n);
+ p+=n;
+ }
+
+ inline void operator()(scheduler::statement const & statement, vcl_size_t root_idx, leaf_t leaf_t) const
+ {
+ scheduler::statement_node const & root_node = statement.array()[root_idx];
+ if (leaf_t==LHS_NODE_TYPE && root_node.lhs.type_family != scheduler::COMPOSITE_OPERATION_FAMILY)
+ utils::call_on_element(root_node.lhs, *this);
+ else if (root_node.op.type_family==scheduler::OPERATION_BINARY_TYPE_FAMILY && leaf_t==RHS_NODE_TYPE && root_node.rhs.type_family != scheduler::COMPOSITE_OPERATION_FAMILY)
+ utils::call_on_element(root_node.rhs, *this);
+ else if (leaf_t==PARENT_NODE_TYPE)
+ append_id(ptr_,root_node.op.type);
+ }
+
+private:
+ symbolic_binder & binder_;
+ char *& ptr_;
+};
+
+inline std::string statements_representation(statements_container const & statements, binding_policy_t binding_policy)
+{
+ std::vector<char> program_name_vector(256);
+ char* program_name = &(program_name_vector[0]);
+ if (statements.order()==statements_container::INDEPENDENT)
+ *program_name++='i';
+ else
+ *program_name++='s';
+ tools::shared_ptr<symbolic_binder> binder = make_binder(binding_policy);
+ for (statements_container::data_type::const_iterator it = statements.data().begin(); it != statements.data().end(); ++it)
+ tree_parsing::traverse(*it, it->root(), tree_parsing::statement_representation_functor(*binder, program_name),true);
+ *program_name='\0';
+ return std::string(&(program_name_vector[0]));
+}
+
+}
+}
+}
+#endif
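
Usage note on the file above: tree_parsing::traverse() walks a scheduler::statement expression tree and invokes the supplied functor for the LHS, PARENT and RHS positions of every node, while the filter functor collects the indices of nodes matching a predicate. The following is a minimal illustrative sketch, not part of this commit; it assumes an OpenCL-enabled ViennaCL build and that the generic scheduler::statement constructor accepts a scalar assignment of an inner product, as the generator uses internally:

  #include <iostream>
  #include <vector>

  #include "viennacl/scalar.hpp"
  #include "viennacl/vector.hpp"
  #include "viennacl/linalg/inner_prod.hpp"
  #include "viennacl/scheduler/forwards.h"
  #include "viennacl/device_specific/tree_parsing.hpp"
  #include "viennacl/device_specific/utils.hpp"

  int main()
  {
    viennacl::vector<float> x(16), y(16);
    viennacl::scalar<float> s = 0.0f;

    // s = inner_prod(x, y), encoded as an expression tree.
    viennacl::scheduler::statement stmt(s, viennacl::op_assign(),
                                        viennacl::linalg::inner_prod(x, y));

    // Collect the indices of all reduction nodes (inner products, norms, ...).
    std::vector<vcl_size_t> reduction_nodes;
    viennacl::device_specific::tree_parsing::traverse(
        stmt, stmt.root(),
        viennacl::device_specific::tree_parsing::filter(
            &viennacl::device_specific::utils::is_reduction, reduction_nodes),
        false); // false: do not descend into reduction leaves

    std::cout << "reduction nodes found: " << reduction_nodes.size() << std::endl;
    return 0;
  }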
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/utils.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/utils.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/utils.hpp
new file mode 100644
index 0000000..1f1fc60
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/utils.hpp
@@ -0,0 +1,568 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_UTILS_HPP
+#define VIENNACL_DEVICE_SPECIFIC_UTILS_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/device_specific/utils.hpp
+ @brief Internal utils
+*/
+
+#include <sstream>
+
+#include "viennacl/detail/matrix_def.hpp"
+#include "viennacl/detail/vector_def.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/ocl/forwards.h"
+
+#include "viennacl/scheduler/forwards.h"
+
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/row_major.hpp"
+
+#include "viennacl/tools/tools.hpp"
+
+namespace viennacl
+{
+namespace device_specific
+{
+namespace utils
+{
+
+//CUDA Conversion
+inline std::string opencl_source_to_cuda_source(std::string const & opencl_src)
+{
+ std::string res = opencl_src;
+
+ viennacl::tools::find_and_replace(res,"__attribute__","//__attribute__");
+
+ //Pointer
+ viennacl::tools::find_and_replace(res, "__global float*", "float*");
+ viennacl::tools::find_and_replace(res, "__local float*", "float*");
+
+ viennacl::tools::find_and_replace(res, "__global double*", "double*");
+ viennacl::tools::find_and_replace(res, "__local double*", "double*");
+
+ //Qualifiers
+ viennacl::tools::find_and_replace(res,"__global","__device__");
+ viennacl::tools::find_and_replace(res,"__kernel","__global__");
+ viennacl::tools::find_and_replace(res,"__constant","__constant__");
+ viennacl::tools::find_and_replace(res,"__local","__shared__");
+
+ //Indexing
+ viennacl::tools::find_and_replace(res,"get_num_groups(0)","gridDim.x");
+ viennacl::tools::find_and_replace(res,"get_num_groups(1)","gridDim.y");
+
+ viennacl::tools::find_and_replace(res,"get_local_size(0)","blockDim.x");
+ viennacl::tools::find_and_replace(res,"get_local_size(1)","blockDim.y");
+
+ viennacl::tools::find_and_replace(res,"get_group_id(0)","blockIdx.x");
+ viennacl::tools::find_and_replace(res,"get_group_id(1)","blockIdx.y");
+
+ viennacl::tools::find_and_replace(res,"get_local_id(0)","threadIdx.x");
+ viennacl::tools::find_and_replace(res,"get_local_id(1)","threadIdx.y");
+
+ viennacl::tools::find_and_replace(res,"get_global_id(0)","(blockIdx.x*blockDim.x + threadIdx.x)");
+ viennacl::tools::find_and_replace(res,"get_global_id(1)","(blockIdx.y*blockDim.y + threadIdx.y)");
+
+ //Synchronization
+ viennacl::tools::find_and_replace(res,"barrier(CLK_LOCAL_MEM_FENCE)","__syncthreads()");
+ viennacl::tools::find_and_replace(res,"barrier(CLK_GLOBAL_MEM_FENCE)","__syncthreads()");
+
+
+ return res;
+}
+
+static std::string numeric_type_to_string(scheduler::statement_node_numeric_type const & type){
+ switch (type)
+ {
+ //case scheduler::CHAR_TYPE: return "char";
+ //case scheduler::UCHAR_TYPE: return "unsigned char";
+ //case scheduler::SHORT_TYPE: return "short";
+ //case scheduler::USHORT_TYPE: return "unsigned short";
+ case scheduler::INT_TYPE: return "int";
+ case scheduler::UINT_TYPE: return "unsigned int";
+ case scheduler::LONG_TYPE: return "long";
+ case scheduler::ULONG_TYPE: return "unsigned long";
+ case scheduler::FLOAT_TYPE : return "float";
+ case scheduler::DOUBLE_TYPE : return "double";
+ default : throw generator_not_supported_exception("Unsupported Scalartype");
+ }
+}
+
+
+template<class Fun>
+static typename Fun::result_type call_on_host_scalar(scheduler::lhs_rhs_element element, Fun const & fun){
+ assert(element.type_family == scheduler::SCALAR_TYPE_FAMILY && bool("Must be called on a host scalar"));
+ switch (element.numeric_type)
+ {
+ //case scheduler::CHAR_TYPE: return fun(element.host_char);
+ //case scheduler::UCHAR_TYPE: return fun(element.host_uchar);
+ //case scheduler::SHORT_TYPE: return fun(element.host_short);
+ //case scheduler::USHORT_TYPE: return fun(element.host_ushort);
+ case scheduler::INT_TYPE: return fun(element.host_int);
+ case scheduler::UINT_TYPE: return fun(element.host_uint);
+ case scheduler::LONG_TYPE: return fun(element.host_long);
+ case scheduler::ULONG_TYPE: return fun(element.host_ulong);
+ case scheduler::FLOAT_TYPE : return fun(element.host_float);
+ case scheduler::DOUBLE_TYPE : return fun(element.host_double);
+ default : throw generator_not_supported_exception("Unsupported Scalartype");
+ }
+}
+
+template<class Fun>
+static typename Fun::result_type call_on_scalar(scheduler::lhs_rhs_element element, Fun const & fun){
+ assert(element.type_family == scheduler::SCALAR_TYPE_FAMILY && bool("Must be called on a scalar"));
+ switch (element.numeric_type)
+ {
+ //case scheduler::CHAR_TYPE: return fun(*element.scalar_char);
+ //case scheduler::UCHAR_TYPE: return fun(*element.scalar_uchar);
+ //case scheduler::SHORT_TYPE: return fun(*element.scalar_short);
+ //case scheduler::USHORT_TYPE: return fun(*element.scalar_ushort);
+ case scheduler::INT_TYPE: return fun(*element.scalar_int);
+ case scheduler::UINT_TYPE: return fun(*element.scalar_uint);
+ case scheduler::LONG_TYPE: return fun(*element.scalar_long);
+ case scheduler::ULONG_TYPE: return fun(*element.scalar_ulong);
+ case scheduler::FLOAT_TYPE : return fun(*element.scalar_float);
+ case scheduler::DOUBLE_TYPE : return fun(*element.scalar_double);
+ default : throw generator_not_supported_exception("Unsupported Scalartype");
+ }
+}
+
+template<class Fun>
+static typename Fun::result_type call_on_vector(scheduler::lhs_rhs_element element, Fun const & fun){
+ assert(element.type_family == scheduler::VECTOR_TYPE_FAMILY && bool("Must be called on a vector"));
+ switch (element.numeric_type)
+ {
+ //case scheduler::CHAR_TYPE: return fun(*element.vector_char);
+ //case scheduler::UCHAR_TYPE: return fun(*element.vector_uchar);
+ //case scheduler::SHORT_TYPE: return fun(*element.vector_short);
+ //case scheduler::USHORT_TYPE: return fun(*element.vector_ushort);
+ case scheduler::INT_TYPE: return fun(*element.vector_int);
+ case scheduler::UINT_TYPE: return fun(*element.vector_uint);
+ case scheduler::LONG_TYPE: return fun(*element.vector_long);
+ case scheduler::ULONG_TYPE: return fun(*element.vector_ulong);
+ case scheduler::FLOAT_TYPE : return fun(*element.vector_float);
+ case scheduler::DOUBLE_TYPE : return fun(*element.vector_double);
+ default : throw generator_not_supported_exception("Unsupported Scalartype");
+ }
+}
+
+template<class Fun>
+static typename Fun::result_type call_on_implicit_vector(scheduler::lhs_rhs_element element, Fun const & fun){
+  assert(element.type_family == scheduler::VECTOR_TYPE_FAMILY && bool("Must be called on an implicit_vector"));
+  assert(element.subtype == scheduler::IMPLICIT_VECTOR_TYPE && bool("Must be called on an implicit_vector"));
+ switch (element.numeric_type)
+ {
+ //case scheduler::CHAR_TYPE: return fun(*element.implicit_vector_char);
+ //case scheduler::UCHAR_TYPE: return fun(*element.implicit_vector_uchar);
+ //case scheduler::SHORT_TYPE: return fun(*element.implicit_vector_short);
+ //case scheduler::USHORT_TYPE: return fun(*element.implicit_vector_ushort);
+ case scheduler::INT_TYPE: return fun(*element.implicit_vector_int);
+ case scheduler::UINT_TYPE: return fun(*element.implicit_vector_uint);
+ case scheduler::LONG_TYPE: return fun(*element.implicit_vector_long);
+ case scheduler::ULONG_TYPE: return fun(*element.implicit_vector_ulong);
+ case scheduler::FLOAT_TYPE : return fun(*element.implicit_vector_float);
+ case scheduler::DOUBLE_TYPE : return fun(*element.implicit_vector_double);
+ default : throw generator_not_supported_exception("Unsupported Scalartype");
+ }
+}
+
+template<class Fun>
+static typename Fun::result_type call_on_matrix(scheduler::lhs_rhs_element element, Fun const & fun){
+ assert(element.type_family == scheduler::MATRIX_TYPE_FAMILY && bool("Must be called on a matrix"));
+ switch (element.numeric_type)
+ {
+ //case scheduler::CHAR_TYPE: return fun(*element.matrix_char);
+ //case scheduler::UCHAR_TYPE: return fun(*element.matrix_uchar);
+ //case scheduler::SHORT_TYPE: return fun(*element.matrix_short);
+ //case scheduler::USHORT_TYPE: return fun(*element.matrix_ushort);
+ case scheduler::INT_TYPE: return fun(*element.matrix_int);
+ case scheduler::UINT_TYPE: return fun(*element.matrix_uint);
+ case scheduler::LONG_TYPE: return fun(*element.matrix_long);
+ case scheduler::ULONG_TYPE: return fun(*element.matrix_ulong);
+ case scheduler::FLOAT_TYPE : return fun(*element.matrix_float);
+ case scheduler::DOUBLE_TYPE : return fun(*element.matrix_double);
+ default : throw generator_not_supported_exception("Unsupported Scalartype");
+ }
+}
+
+
+template<class Fun>
+static typename Fun::result_type call_on_implicit_matrix(scheduler::lhs_rhs_element element, Fun const & fun){
+  assert(element.subtype == scheduler::IMPLICIT_MATRIX_TYPE && bool("Must be called on an implicit matrix"));
+ switch (element.numeric_type)
+ {
+ //case scheduler::CHAR_TYPE: return fun(*element.implicit_matrix_char);
+ //case scheduler::UCHAR_TYPE: return fun(*element.implicit_matrix_uchar);
+ //case scheduler::SHORT_TYPE: return fun(*element.implicit_matrix_short);
+ //case scheduler::USHORT_TYPE: return fun(*element.implicit_matrix_ushort);
+ case scheduler::INT_TYPE: return fun(*element.implicit_matrix_int);
+ case scheduler::UINT_TYPE: return fun(*element.implicit_matrix_uint);
+ case scheduler::LONG_TYPE: return fun(*element.implicit_matrix_long);
+ case scheduler::ULONG_TYPE: return fun(*element.implicit_matrix_ulong);
+ case scheduler::FLOAT_TYPE : return fun(*element.implicit_matrix_float);
+ case scheduler::DOUBLE_TYPE : return fun(*element.implicit_matrix_double);
+ default : throw generator_not_supported_exception("Unsupported Scalartype");
+ }
+}
+
+template<class Fun>
+static typename Fun::result_type call_on_element(scheduler::lhs_rhs_element const & element, Fun const & fun){
+ switch (element.type_family)
+ {
+ case scheduler::SCALAR_TYPE_FAMILY:
+ if (element.subtype == scheduler::HOST_SCALAR_TYPE)
+ return call_on_host_scalar(element, fun);
+ else
+ return call_on_scalar(element, fun);
+ case scheduler::VECTOR_TYPE_FAMILY :
+ if (element.subtype == scheduler::IMPLICIT_VECTOR_TYPE)
+ return call_on_implicit_vector(element, fun);
+ else
+ return call_on_vector(element, fun);
+ case scheduler::MATRIX_TYPE_FAMILY:
+ if (element.subtype == scheduler::IMPLICIT_MATRIX_TYPE)
+ return call_on_implicit_matrix(element, fun);
+ else
+ return call_on_matrix(element,fun);
+ default:
+ throw generator_not_supported_exception("Unsupported datastructure type : Not among {Scalar, Vector, Matrix}");
+ }
+}
+
+struct scalartype_size_fun
+{
+ typedef vcl_size_t result_type;
+ result_type operator()(float const &) const { return sizeof(float); }
+ result_type operator()(double const &) const { return sizeof(double); }
+ template<class T> result_type operator()(T const &) const { return sizeof(typename viennacl::result_of::cpu_value_type<T>::type); }
+};
+
+struct internal_size_fun
+{
+ typedef vcl_size_t result_type;
+ template<class T> result_type operator()(T const &t) const { return viennacl::traits::internal_size(t); }
+};
+
+struct size_fun
+{
+ typedef vcl_size_t result_type;
+ template<class T> result_type operator()(T const &t) const { return viennacl::traits::size(t); }
+};
+
+struct stride_fun
+{
+ typedef vcl_size_t result_type;
+ template<class T> result_type operator()(T const &t) const { return viennacl::traits::stride(t); }
+};
+
+struct start1_fun
+{
+ typedef vcl_size_t result_type;
+ template<class T> result_type operator()(T const &t) const { return viennacl::traits::start1(t); }
+};
+
+struct start2_fun
+{
+ typedef vcl_size_t result_type;
+ template<class T> result_type operator()(T const &t) const { return viennacl::traits::start2(t); }
+};
+
+struct leading_stride
+{
+ typedef vcl_size_t result_type;
+ template<class T> result_type operator()(T const &t) const { return viennacl::traits::row_major(t)?viennacl::traits::stride2(t):viennacl::traits::stride1(t); }
+};
+
+struct leading_start
+{
+ typedef vcl_size_t result_type;
+ template<class T> result_type operator()(T const &t) const { return viennacl::traits::row_major(t)?viennacl::traits::start2(t):viennacl::traits::start1(t); }
+};
+
+struct stride1_fun
+{
+ typedef vcl_size_t result_type;
+ template<class T> result_type operator()(T const &t) const { return viennacl::traits::stride1(t); }
+};
+
+struct stride2_fun
+{
+ typedef vcl_size_t result_type;
+ template<class T> result_type operator()(T const &t) const { return viennacl::traits::stride2(t); }
+};
+
+struct handle_fun
+{
+ typedef cl_mem result_type;
+ template<class T>
+ result_type operator()(T const &t) const { return viennacl::traits::opencl_handle(t); }
+};
+
+struct internal_size1_fun
+{
+ typedef vcl_size_t result_type;
+ template<class T>
+ result_type operator()(T const &t) const { return viennacl::traits::internal_size1(t); }
+};
+
+struct row_major_fun
+{
+ typedef bool result_type;
+ template<class T>
+ result_type operator()(T const &t) const { return viennacl::traits::row_major(t); }
+};
+
+struct internal_size2_fun
+{
+ typedef vcl_size_t result_type;
+ template<class T>
+ result_type operator()(T const &t) const { return viennacl::traits::internal_size2(t); }
+};
+
+struct size1_fun
+{
+ typedef vcl_size_t result_type;
+ template<class T>
+ result_type operator()(T const &t) const { return viennacl::traits::size1(t); }
+};
+
+struct size2_fun
+{
+ typedef vcl_size_t result_type;
+ template<class T>
+ result_type operator()(T const &t) const { return viennacl::traits::size2(t); }
+};
+
+template<class T, class U>
+struct is_same_type { enum { value = 0 }; };
+
+template<class T>
+struct is_same_type<T,T> { enum { value = 1 }; };
+
+inline bool is_reduction(scheduler::statement_node const & node)
+{
+ return node.op.type_family==scheduler::OPERATION_VECTOR_REDUCTION_TYPE_FAMILY
+ || node.op.type_family==scheduler::OPERATION_COLUMNS_REDUCTION_TYPE_FAMILY
+ || node.op.type_family==scheduler::OPERATION_ROWS_REDUCTION_TYPE_FAMILY
+ || node.op.type==scheduler::OPERATION_BINARY_INNER_PROD_TYPE
+ || node.op.type==scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE;
+}
+
+inline bool is_index_reduction(scheduler::op_element const & op)
+{
+ return op.type==scheduler::OPERATION_BINARY_ELEMENT_ARGFMAX_TYPE
+ || op.type==scheduler::OPERATION_BINARY_ELEMENT_ARGMAX_TYPE
+ || op.type==scheduler::OPERATION_BINARY_ELEMENT_ARGFMIN_TYPE
+ || op.type==scheduler::OPERATION_BINARY_ELEMENT_ARGMIN_TYPE;
+}
+template<class T>
+struct type_to_string;
+template<> struct type_to_string<unsigned char> { static const char * value() { return "uchar"; } };
+template<> struct type_to_string<char> { static const char * value() { return "char"; } };
+template<> struct type_to_string<unsigned short> { static const char * value() { return "ushort"; } };
+template<> struct type_to_string<short> { static const char * value() { return "short"; } };
+template<> struct type_to_string<unsigned int> { static const char * value() { return "uint"; } };
+template<> struct type_to_string<int> { static const char * value() { return "int"; } };
+template<> struct type_to_string<unsigned long> { static const char * value() { return "ulong"; } };
+template<> struct type_to_string<long> { static const char * value() { return "long"; } };
+template<> struct type_to_string<float> { static const char * value() { return "float"; } };
+template<> struct type_to_string<double> { static const char * value() { return "double"; } };
+
+
+template<class T>
+struct first_letter_of_type;
+template<> struct first_letter_of_type<char> { static char value() { return 'c'; } };
+template<> struct first_letter_of_type<unsigned char> { static char value() { return 'd'; } };
+template<> struct first_letter_of_type<short> { static char value() { return 's'; } };
+template<> struct first_letter_of_type<unsigned short> { static char value() { return 't'; } };
+template<> struct first_letter_of_type<int> { static char value() { return 'i'; } };
+template<> struct first_letter_of_type<unsigned int> { static char value() { return 'j'; } };
+template<> struct first_letter_of_type<long> { static char value() { return 'l'; } };
+template<> struct first_letter_of_type<unsigned long> { static char value() { return 'm'; } };
+template<> struct first_letter_of_type<float> { static char value() { return 'f'; } };
+template<> struct first_letter_of_type<double> { static char value() { return 'd'; } };
+
+class kernel_generation_stream : public std::ostream
+{
+ class kgenstream : public std::stringbuf
+ {
+ public:
+ kgenstream(std::ostringstream& osstream,unsigned int const & tab_count) : oss_(osstream), tab_count_(tab_count){ }
+ int sync() {
+ for (unsigned int i=0; i<tab_count_;++i)
+ oss_ << " ";
+ oss_ << str();
+ str("");
+ return !oss_;
+ }
+#if defined(_MSC_VER)
+ ~kgenstream() throw() { pubsync(); }
+#else
+ ~kgenstream() { pubsync(); }
+#endif
+ private:
+ std::ostream& oss_;
+ unsigned int const & tab_count_;
+ };
+
+public:
+ kernel_generation_stream() : std::ostream(new kgenstream(oss,tab_count_)), tab_count_(0){ }
+#if defined(_MSC_VER)
+ ~kernel_generation_stream() throw() { delete rdbuf(); }
+#else
+ ~kernel_generation_stream(){ delete rdbuf(); }
+#endif
+
+ std::string str(){ return oss.str(); }
+ void inc_tab(){ ++tab_count_; }
+ void dec_tab(){ --tab_count_; }
+private:
+ unsigned int tab_count_;
+ std::ostringstream oss;
+};
+
+inline bool node_leaf(scheduler::op_element const & op)
+{
+ using namespace scheduler;
+ return op.type==OPERATION_UNARY_NORM_1_TYPE
+ || op.type==OPERATION_UNARY_NORM_2_TYPE
+ || op.type==OPERATION_UNARY_NORM_INF_TYPE
+ || op.type==OPERATION_UNARY_TRANS_TYPE
+ || op.type==OPERATION_BINARY_MAT_VEC_PROD_TYPE
+ || op.type==OPERATION_BINARY_MAT_MAT_PROD_TYPE
+ || op.type==OPERATION_BINARY_INNER_PROD_TYPE
+ || op.type==OPERATION_BINARY_MATRIX_DIAG_TYPE
+ || op.type==OPERATION_BINARY_VECTOR_DIAG_TYPE
+ || op.type==OPERATION_BINARY_MATRIX_ROW_TYPE
+ || op.type==OPERATION_BINARY_MATRIX_COLUMN_TYPE
+ || op.type_family==OPERATION_VECTOR_REDUCTION_TYPE_FAMILY
+ || op.type_family==OPERATION_ROWS_REDUCTION_TYPE_FAMILY
+ || op.type_family==OPERATION_COLUMNS_REDUCTION_TYPE_FAMILY;
+}
+
+inline bool elementwise_operator(scheduler::op_element const & op)
+{
+ using namespace scheduler;
+ return op.type== OPERATION_BINARY_ASSIGN_TYPE
+ || op.type== OPERATION_BINARY_INPLACE_ADD_TYPE
+ || op.type== OPERATION_BINARY_INPLACE_SUB_TYPE
+ || op.type== OPERATION_BINARY_ADD_TYPE
+ || op.type== OPERATION_BINARY_SUB_TYPE
+ || op.type== OPERATION_BINARY_ELEMENT_PROD_TYPE
+ || op.type== OPERATION_BINARY_ELEMENT_DIV_TYPE
+ || op.type== OPERATION_BINARY_MULT_TYPE
+ || op.type== OPERATION_BINARY_DIV_TYPE;
+}
+
+inline bool elementwise_function(scheduler::op_element const & op)
+{
+ using namespace scheduler;
+ return
+
+ op.type == OPERATION_UNARY_CAST_CHAR_TYPE
+ || op.type == OPERATION_UNARY_CAST_UCHAR_TYPE
+ || op.type == OPERATION_UNARY_CAST_SHORT_TYPE
+ || op.type == OPERATION_UNARY_CAST_USHORT_TYPE
+ || op.type == OPERATION_UNARY_CAST_INT_TYPE
+ || op.type == OPERATION_UNARY_CAST_UINT_TYPE
+ || op.type == OPERATION_UNARY_CAST_LONG_TYPE
+ || op.type == OPERATION_UNARY_CAST_ULONG_TYPE
+ || op.type == OPERATION_UNARY_CAST_HALF_TYPE
+ || op.type == OPERATION_UNARY_CAST_FLOAT_TYPE
+ || op.type == OPERATION_UNARY_CAST_DOUBLE_TYPE
+
+ || op.type== OPERATION_UNARY_ABS_TYPE
+ || op.type== OPERATION_UNARY_ACOS_TYPE
+ || op.type== OPERATION_UNARY_ASIN_TYPE
+ || op.type== OPERATION_UNARY_ATAN_TYPE
+ || op.type== OPERATION_UNARY_CEIL_TYPE
+ || op.type== OPERATION_UNARY_COS_TYPE
+ || op.type== OPERATION_UNARY_COSH_TYPE
+ || op.type== OPERATION_UNARY_EXP_TYPE
+ || op.type== OPERATION_UNARY_FABS_TYPE
+ || op.type== OPERATION_UNARY_FLOOR_TYPE
+ || op.type== OPERATION_UNARY_LOG_TYPE
+ || op.type== OPERATION_UNARY_LOG10_TYPE
+ || op.type== OPERATION_UNARY_SIN_TYPE
+ || op.type== OPERATION_UNARY_SINH_TYPE
+ || op.type== OPERATION_UNARY_SQRT_TYPE
+ || op.type== OPERATION_UNARY_TAN_TYPE
+ || op.type== OPERATION_UNARY_TANH_TYPE
+
+ || op.type== OPERATION_BINARY_ELEMENT_POW_TYPE
+ || op.type== OPERATION_BINARY_ELEMENT_EQ_TYPE
+ || op.type== OPERATION_BINARY_ELEMENT_NEQ_TYPE
+ || op.type== OPERATION_BINARY_ELEMENT_GREATER_TYPE
+ || op.type== OPERATION_BINARY_ELEMENT_LESS_TYPE
+ || op.type== OPERATION_BINARY_ELEMENT_GEQ_TYPE
+ || op.type== OPERATION_BINARY_ELEMENT_LEQ_TYPE
+ || op.type== OPERATION_BINARY_ELEMENT_FMAX_TYPE
+ || op.type== OPERATION_BINARY_ELEMENT_FMIN_TYPE
+ || op.type== OPERATION_BINARY_ELEMENT_MAX_TYPE
+ || op.type== OPERATION_BINARY_ELEMENT_MIN_TYPE;
+
+}
+
+inline scheduler::lhs_rhs_element & lhs_rhs_element(scheduler::statement const & st, vcl_size_t idx, leaf_t leaf)
+{
+ using namespace tree_parsing;
+ assert(leaf==LHS_NODE_TYPE || leaf==RHS_NODE_TYPE);
+ if (leaf==LHS_NODE_TYPE)
+ return const_cast<scheduler::lhs_rhs_element &>(st.array()[idx].lhs);
+ return const_cast<scheduler::lhs_rhs_element &>(st.array()[idx].rhs);
+}
+
+inline unsigned int size_of(scheduler::statement_node_numeric_type type)
+{
+ using namespace scheduler;
+ switch (type)
+ {
+ case UCHAR_TYPE:
+ case CHAR_TYPE: return 1;
+
+ case USHORT_TYPE:
+ case SHORT_TYPE:
+ case HALF_TYPE: return 2;
+
+ case UINT_TYPE:
+ case INT_TYPE:
+ case FLOAT_TYPE: return 4;
+
+ case ULONG_TYPE:
+ case LONG_TYPE:
+ case DOUBLE_TYPE: return 8;
+
+ default: throw generator_not_supported_exception("Unsupported scalartype");
+ }
+}
+
+inline std::string append_width(std::string const & str, unsigned int width)
+{
+ if (width==1)
+ return str;
+ return str + tools::to_string(width);
+}
+
+}
+}
+}
+#endif
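
Most of utils.hpp above is metaprogramming glue (the call_on_* dispatchers, the trait functors, kernel_generation_stream), but opencl_source_to_cuda_source() is directly usable on its own: it rewrites OpenCL address-space qualifiers, work-item built-ins and barriers into their CUDA counterparts by plain string substitution. A rough sketch of its effect, not part of this commit and assuming an OpenCL-enabled build so that the header compiles:

  #include <iostream>
  #include <string>

  #include "viennacl/device_specific/utils.hpp"

  int main()
  {
    std::string cl_src =
        "__kernel void scale(__global float* x)\n"
        "{\n"
        "  x[get_global_id(0)] *= 2.0f;\n"
        "  barrier(CLK_GLOBAL_MEM_FENCE);\n"
        "}\n";

    // Textual rewrite: __kernel -> __global__, __global float* -> float*,
    // get_global_id(0) -> (blockIdx.x*blockDim.x + threadIdx.x),
    // barrier(CLK_GLOBAL_MEM_FENCE) -> __syncthreads().
    std::cout << viennacl::device_specific::utils::opencl_source_to_cuda_source(cl_src);
    return 0;
  }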
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/ell_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/ell_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/ell_matrix.hpp
new file mode 100644
index 0000000..3c3a428
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/ell_matrix.hpp
@@ -0,0 +1,362 @@
+#ifndef VIENNACL_ELL_MATRIX_HPP_
+#define VIENNACL_ELL_MATRIX_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/ell_matrix.hpp
+ @brief Implementation of the ell_matrix class
+
+ Contributed by Volodymyr Kysenko.
+*/
+
+
+#include "viennacl/forwards.h"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/tools/tools.hpp"
+
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+
+namespace viennacl
+{
+/** @brief Sparse matrix class using the ELLPACK format for storing the nonzeros.
+ *
+ * This format works best for matrices where the number of nonzeros per row is mostly the same.
+ * Finite element and finite difference methods on nicely shaped domains often result in such a nonzero pattern.
+ * For a matrix
+ *
+ * (1 2 0 0 0)
+ * (2 3 4 0 0)
+ * (0 5 6 0 7)
+ * (0 0 8 9 0)
+ *
+ * the entries are laid out in three chunks (one chunk per nonzero slot, each holding one entry per row) as
+ * (1 2 5 8; 2 3 6 9; 0 4 7 0)
+ * Note that this is a 'transposed' representation in order to maximize coalesced memory access.
+ */
+template<typename NumericT, unsigned int AlignmentV /* see forwards.h for default argument */>
+class ell_matrix
+{
+public:
+ typedef viennacl::backend::mem_handle handle_type;
+ typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<NumericT>::ResultType> value_type;
+ typedef vcl_size_t size_type;
+
+ ell_matrix() : rows_(0), cols_(0), maxnnz_(0) {}
+
+ ell_matrix(viennacl::context ctx) : rows_(0), cols_(0), maxnnz_(0)
+ {
+ coords_.switch_active_handle_id(ctx.memory_type());
+ elements_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+ if (ctx.memory_type() == OPENCL_MEMORY)
+ {
+ coords_.opencl_handle().context(ctx.opencl_context());
+ elements_.opencl_handle().context(ctx.opencl_context());
+ }
+#endif
+ }
+
+ /** @brief Resets all entries in the matrix back to zero without changing the matrix size. Resets the sparsity pattern. */
+ void clear()
+ {
+ maxnnz_ = 0;
+
+ viennacl::backend::typesafe_host_array<unsigned int> host_coords_buffer(coords_, internal_size1());
+ std::vector<NumericT> host_elements(internal_size1());
+
+ viennacl::backend::memory_create(coords_, host_coords_buffer.element_size() * internal_size1(), viennacl::traits::context(coords_), host_coords_buffer.get());
+ viennacl::backend::memory_create(elements_, sizeof(NumericT) * internal_size1(), viennacl::traits::context(elements_), &(host_elements[0]));
+ }
+
+ vcl_size_t internal_size1() const { return viennacl::tools::align_to_multiple<vcl_size_t>(rows_, AlignmentV); }
+ vcl_size_t internal_size2() const { return viennacl::tools::align_to_multiple<vcl_size_t>(cols_, AlignmentV); }
+
+ vcl_size_t size1() const { return rows_; }
+ vcl_size_t size2() const { return cols_; }
+
+ vcl_size_t internal_maxnnz() const {return viennacl::tools::align_to_multiple<vcl_size_t>(maxnnz_, AlignmentV); }
+ vcl_size_t maxnnz() const { return maxnnz_; }
+
+ vcl_size_t nnz() const { return rows_ * maxnnz_; }
+ vcl_size_t internal_nnz() const { return internal_size1() * internal_maxnnz(); }
+
+ handle_type & handle() { return elements_; }
+ const handle_type & handle() const { return elements_; }
+
+ handle_type & handle2() { return coords_; }
+ const handle_type & handle2() const { return coords_; }
+
+#if defined(_MSC_VER) && _MSC_VER < 1500 //Visual Studio 2005 needs special treatment
+ template<typename CPUMatrixT>
+ friend void copy(const CPUMatrixT & cpu_matrix, ell_matrix & gpu_matrix );
+#else
+ template<typename CPUMatrixT, typename T, unsigned int ALIGN>
+ friend void copy(const CPUMatrixT & cpu_matrix, ell_matrix<T, ALIGN> & gpu_matrix );
+#endif
+
+private:
+ vcl_size_t rows_;
+ vcl_size_t cols_;
+ vcl_size_t maxnnz_;
+
+ handle_type coords_;
+ handle_type elements_;
+};
+
+template<typename CPUMatrixT, typename NumericT, unsigned int AlignmentV>
+void copy(const CPUMatrixT& cpu_matrix, ell_matrix<NumericT, AlignmentV>& gpu_matrix )
+{
+ assert( (gpu_matrix.size1() == 0 || viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+ assert( (gpu_matrix.size2() == 0 || viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+ if (cpu_matrix.size1() > 0 && cpu_matrix.size2() > 0)
+ {
+ //determine max capacity for row
+ vcl_size_t max_entries_per_row = 0;
+ for (typename CPUMatrixT::const_iterator1 row_it = cpu_matrix.begin1(); row_it != cpu_matrix.end1(); ++row_it)
+ {
+ vcl_size_t num_entries = 0;
+ for (typename CPUMatrixT::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
+ ++num_entries;
+
+ max_entries_per_row = std::max(max_entries_per_row, num_entries);
+ }
+
+ //setup GPU matrix
+ gpu_matrix.maxnnz_ = max_entries_per_row;
+ gpu_matrix.rows_ = cpu_matrix.size1();
+ gpu_matrix.cols_ = cpu_matrix.size2();
+
+ vcl_size_t nnz = gpu_matrix.internal_nnz();
+
+ viennacl::backend::typesafe_host_array<unsigned int> coords(gpu_matrix.handle2(), nnz);
+ std::vector<NumericT> elements(nnz, 0);
+
+ // std::cout << "ELL_MATRIX copy " << gpu_matrix.maxnnz_ << " " << gpu_matrix.rows_ << " " << gpu_matrix.cols_ << " "
+ // << gpu_matrix.internal_maxnnz() << "\n";
+
+ for (typename CPUMatrixT::const_iterator1 row_it = cpu_matrix.begin1(); row_it != cpu_matrix.end1(); ++row_it)
+ {
+ vcl_size_t data_index = 0;
+
+ for (typename CPUMatrixT::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
+ {
+ coords.set(gpu_matrix.internal_size1() * data_index + col_it.index1(), col_it.index2());
+ elements[gpu_matrix.internal_size1() * data_index + col_it.index1()] = *col_it;
+ //std::cout << *col_it << "\n";
+ data_index++;
+ }
+ }
+
+ viennacl::backend::memory_create(gpu_matrix.handle2(), coords.raw_size(), traits::context(gpu_matrix.handle2()), coords.get());
+ viennacl::backend::memory_create(gpu_matrix.handle(), sizeof(NumericT) * elements.size(), traits::context(gpu_matrix.handle()), &(elements[0]));
+ }
+}
+
+
+
+/** @brief Copies a sparse matrix from the host to the compute device. The host type is the std::vector< std::map<> > format.
+ *
+ * @param cpu_matrix A sparse matrix on the host composed of an STL vector and an STL map.
+ * @param gpu_matrix The sparse ell_matrix from ViennaCL
+ */
+template<typename IndexT, typename NumericT, unsigned int AlignmentV>
+void copy(std::vector< std::map<IndexT, NumericT> > const & cpu_matrix,
+ ell_matrix<NumericT, AlignmentV> & gpu_matrix)
+{
+ vcl_size_t max_col = 0;
+ for (vcl_size_t i=0; i<cpu_matrix.size(); ++i)
+ {
+ if (cpu_matrix[i].size() > 0)
+ max_col = std::max<vcl_size_t>(max_col, (cpu_matrix[i].rbegin())->first);
+ }
+
+ viennacl::copy(tools::const_sparse_matrix_adapter<NumericT, IndexT>(cpu_matrix, cpu_matrix.size(), max_col + 1), gpu_matrix);
+}
+
+
+
+
+
+
+template<typename CPUMatrixT, typename NumericT, unsigned int AlignmentV>
+void copy(const ell_matrix<NumericT, AlignmentV>& gpu_matrix, CPUMatrixT& cpu_matrix)
+{
+ assert( (viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+ assert( (viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+ if (gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0)
+ {
+ std::vector<NumericT> elements(gpu_matrix.internal_nnz());
+ viennacl::backend::typesafe_host_array<unsigned int> coords(gpu_matrix.handle2(), gpu_matrix.internal_nnz());
+
+ viennacl::backend::memory_read(gpu_matrix.handle(), 0, sizeof(NumericT) * elements.size(), &(elements[0]));
+ viennacl::backend::memory_read(gpu_matrix.handle2(), 0, coords.raw_size(), coords.get());
+
+ for (vcl_size_t row = 0; row < gpu_matrix.size1(); row++)
+ {
+ for (vcl_size_t ind = 0; ind < gpu_matrix.internal_maxnnz(); ind++)
+ {
+ vcl_size_t offset = gpu_matrix.internal_size1() * ind + row;
+
+ NumericT val = elements[offset];
+ if (val <= 0 && val >= 0) // val == 0 without compiler warnings
+ continue;
+
+ if (coords[offset] >= gpu_matrix.size2())
+ {
+ std::cerr << "ViennaCL encountered invalid data " << offset << " " << ind << " " << row << " " << coords[offset] << " " << gpu_matrix.size2() << std::endl;
+ return;
+ }
+
+ cpu_matrix(row, coords[offset]) = val;
+ }
+ }
+ }
+}
+
+
+/** @brief Copies a sparse matrix from the compute device to the host. The host type is the std::vector< std::map<> > format.
+ *
+ * @param gpu_matrix The sparse ell_matrix from ViennaCL
+ * @param cpu_matrix A sparse matrix on the host composed of an STL vector and an STL map.
+ */
+template<typename NumericT, unsigned int AlignmentV, typename IndexT>
+void copy(const ell_matrix<NumericT, AlignmentV> & gpu_matrix,
+ std::vector< std::map<IndexT, NumericT> > & cpu_matrix)
+{
+ if (cpu_matrix.size() == 0)
+ cpu_matrix.resize(gpu_matrix.size1());
+
+ assert(cpu_matrix.size() == gpu_matrix.size1() && bool("Matrix dimension mismatch!"));
+
+ tools::sparse_matrix_adapter<NumericT, IndexT> temp(cpu_matrix, gpu_matrix.size1(), gpu_matrix.size2());
+ viennacl::copy(gpu_matrix, temp);
+}
+
+//
+// Specify available operations:
+//
+
+/** \cond */
+
+namespace linalg
+{
+namespace detail
+{
+ // x = A * y
+ template<typename T, unsigned int A>
+ struct op_executor<vector_base<T>, op_assign, vector_expression<const ell_matrix<T, A>, const vector_base<T>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const ell_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+ {
+ // check for the special case x = A * x
+ if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+ {
+ viennacl::vector<T> temp(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+ lhs = temp;
+ }
+ else
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), lhs, T(0));
+ }
+ };
+
+ template<typename T, unsigned int A>
+ struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const ell_matrix<T, A>, const vector_base<T>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const ell_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+ {
+ // check for the special case x += A * x
+ if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+ {
+ viennacl::vector<T> temp(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+ lhs += temp;
+ }
+ else
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), lhs, T(1));
+ }
+ };
+
+ template<typename T, unsigned int A>
+ struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const ell_matrix<T, A>, const vector_base<T>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const ell_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+ {
+ // check for the special case x -= A * x
+ if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+ {
+ viennacl::vector<T> temp(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+ lhs -= temp;
+ }
+ else
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(-1), lhs, T(1));
+ }
+ };
+
+
+ // x = A * vec_op
+ template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+ struct op_executor<vector_base<T>, op_assign, vector_expression<const ell_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const ell_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+ {
+ viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+ viennacl::linalg::prod_impl(rhs.lhs(), temp, lhs);
+ }
+ };
+
+ // x = A * vec_op
+ template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+ struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const ell_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const ell_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+ {
+ viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+ viennacl::vector<T> temp_result(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+ lhs += temp_result;
+ }
+ };
+
+ // x = A * vec_op
+ template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+ struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const ell_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const ell_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+ {
+ viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+ viennacl::vector<T> temp_result(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+ lhs -= temp_result;
+ }
+ };
+
+} // namespace detail
+} // namespace linalg
+
+/** \endcond */
+}
+
+#endif
+
+
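
To illustrate the ell_matrix interface added above: the sketch below (illustrative only, not part of the commit) builds the 4x5 example matrix from the class documentation in std::vector< std::map<> > form, copies it into an ell_matrix (maxnnz becomes 3), and multiplies it by a vector of ones. It assumes an OpenCL-enabled ViennaCL build with a default context:

  #include <map>
  #include <vector>

  #include "viennacl/ell_matrix.hpp"
  #include "viennacl/vector.hpp"
  #include "viennacl/linalg/prod.hpp"

  int main()
  {
    // Host matrix (4 rows, 5 columns), same pattern as in the class documentation.
    std::vector< std::map<unsigned int, float> > host_A(4);
    host_A[0][0] = 1; host_A[0][1] = 2;
    host_A[1][0] = 2; host_A[1][1] = 3; host_A[1][2] = 4;
    host_A[2][1] = 5; host_A[2][2] = 6; host_A[2][4] = 7;
    host_A[3][2] = 8; host_A[3][3] = 9;

    viennacl::ell_matrix<float> A;
    viennacl::copy(host_A, A);              // host -> device, ELLPACK layout

    std::vector<float> host_x(5, 1.0f);
    viennacl::vector<float> x(5);
    viennacl::copy(host_x, x);

    viennacl::vector<float> y(4);
    y = viennacl::linalg::prod(A, x);       // sparse matrix-vector product

    std::vector<float> host_y(4);
    viennacl::copy(y, host_y);              // expected: (3, 9, 18, 17)
    return 0;
  }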
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/fft.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/fft.hpp b/native-viennaCL/src/main/cpp/viennacl/fft.hpp
new file mode 100644
index 0000000..bacd911
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/fft.hpp
@@ -0,0 +1,282 @@
+#ifndef VIENNACL_FFT_HPP
+#define VIENNACL_FFT_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+
+/** @file viennacl/fft.hpp
+ @brief All routines related to the Fast Fourier Transform. Experimental.
+ */
+
+#include <viennacl/vector.hpp>
+#include <viennacl/matrix.hpp>
+
+#include "viennacl/linalg/fft_operations.hpp"
+#include "viennacl/traits/handle.hpp"
+
+#include <cmath>
+
+#include <stdexcept>
+/// @cond
+namespace viennacl
+{
+namespace detail
+{
+namespace fft
+{
+ inline bool is_radix2(vcl_size_t data_size)
+ {
+ return !((data_size > 2) && (data_size & (data_size - 1)));
+ }
+} //namespace fft
+} //namespace detail
+
+/**
+ * @brief Generic inplace version of 1-D Fourier transformation.
+ *
+ * @param input Input vector, result will be stored here.
+ * @param batch_num Number of items in batch
+ * @param sign Sign of exponent, default is -1.0
+ */
+template<class NumericT, unsigned int AlignmentV>
+void inplace_fft(viennacl::vector<NumericT, AlignmentV>& input, vcl_size_t batch_num = 1,
+ NumericT sign = -1.0)
+{
+ vcl_size_t size = (input.size() >> 1) / batch_num;
+
+ if (!viennacl::detail::fft::is_radix2(size))
+ {
+ viennacl::vector<NumericT, AlignmentV> output(input.size());
+ viennacl::linalg::direct(input, output, size, size, batch_num, sign);
+ viennacl::copy(output, input);
+ }
+ else
+ viennacl::linalg::radix2(input, size, size, batch_num, sign);
+}
+
+/**
+ * @brief Generic version of 1-D Fourier transformation.
+ *
+ * @param input Input vector.
+ * @param output Output vector.
+ * @param batch_num Number of items in batch.
+ * @param sign Sign of exponent, default is -1.0
+ */
+template<class NumericT, unsigned int AlignmentV>
+void fft(viennacl::vector<NumericT, AlignmentV>& input,
+ viennacl::vector<NumericT, AlignmentV>& output, vcl_size_t batch_num = 1, NumericT sign = -1.0)
+{
+ vcl_size_t size = (input.size() >> 1) / batch_num;
+ if (viennacl::detail::fft::is_radix2(size))
+ {
+ viennacl::copy(input, output);
+ viennacl::linalg::radix2(output, size, size, batch_num, sign);
+ }
+ else
+ viennacl::linalg::direct(input, output, size, size, batch_num, sign);
+}
+
+/**
+ * @brief Generic inplace version of 2-D Fourier transformation.
+ *
+ * @param input Input matrix, result will be stored here.
+ * @param sign Sign of exponent, default is -1.0
+ */
+template<class NumericT, unsigned int AlignmentV>
+void inplace_fft(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& input,
+ NumericT sign = -1.0)
+{
+ vcl_size_t rows_num = input.size1();
+ vcl_size_t cols_num = input.size2() >> 1;
+
+ vcl_size_t cols_int = input.internal_size2() >> 1;
+
+ // batch with rows
+ if (viennacl::detail::fft::is_radix2(cols_num))
+ viennacl::linalg::radix2(input, cols_num, cols_int, rows_num, sign,
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR);
+ else
+ {
+ viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> output(input.size1(),
+ input.size2());
+
+ viennacl::linalg::direct(input, output, cols_num, cols_int, rows_num, sign,
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR);
+
+ input = output;
+ }
+
+ // batch with cols
+ if (viennacl::detail::fft::is_radix2(rows_num))
+ viennacl::linalg::radix2(input, rows_num, cols_int, cols_num, sign,
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::COL_MAJOR);
+ else
+ {
+ viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> output(input.size1(),
+ input.size2());
+
+ viennacl::linalg::direct(input, output, rows_num, cols_int, cols_num, sign,
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::COL_MAJOR);
+
+ input = output;
+ }
+
+}
+
+/**
+ * @brief Generic version of 2-D Fourier transformation.
+ *
+ * @param input Input matrix.
+ * @param output Output matrix.
+ * @param sign Sign of exponent, default is -1.0
+ */
+template<class NumericT, unsigned int AlignmentV>
+void fft(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& input, //TODO
+ viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& output, NumericT sign = -1.0)
+{
+
+ vcl_size_t rows_num = input.size1();
+ vcl_size_t cols_num = input.size2() >> 1;
+ vcl_size_t cols_int = input.internal_size2() >> 1;
+
+ // batch with rows
+ if (viennacl::detail::fft::is_radix2(cols_num))
+ {
+ output = input;
+ viennacl::linalg::radix2(output, cols_num, cols_int, rows_num, sign,
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR);
+ }
+ else
+ viennacl::linalg::direct(input, output, cols_num, cols_int, rows_num, sign,
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR);
+
+ // batch with cols
+ if (viennacl::detail::fft::is_radix2(rows_num))
+ {
+ //std::cout<<"output"<<output<<std::endl;
+
+ viennacl::linalg::radix2(output, rows_num, cols_int, cols_num, sign,
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::COL_MAJOR);
+ }
+ else
+ {
+ viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> tmp(output.size1(),
+ output.size2());
+ tmp = output;
+ //std::cout<<"tmp"<<tmp<<std::endl;
+ viennacl::linalg::direct(tmp, output, rows_num, cols_int, cols_num, sign,
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::COL_MAJOR);
+ }
+}
+
+/**
+ * @brief Generic inplace version of inverse 1-D Fourier transformation.
+ *
+ * Shorthand for inplace_fft() with sign = 1.0, followed by normalization.
+ *
+ * @param input Input vector, result will be stored here.
+ * @param batch_num Number of items in batch.
+ */
+template<class NumericT, unsigned int AlignmentV>
+void inplace_ifft(viennacl::vector<NumericT, AlignmentV>& input, vcl_size_t batch_num = 1)
+{
+ viennacl::inplace_fft(input, batch_num, NumericT(1.0));
+ viennacl::linalg::normalize(input);
+}
+
+/**
+ * @brief Generic version of inverse 1-D Fourier transformation.
+ *
+ * Shorthand for fft() with sign = 1.0, followed by normalization of the output.
+ *
+ * @param input Input vector.
+ * @param output Output vector.
+ * @param batch_num Number of items in batch.
+ */
+template<class NumericT, unsigned int AlignmentV>
+void ifft(viennacl::vector<NumericT, AlignmentV>& input,
+ viennacl::vector<NumericT, AlignmentV>& output, vcl_size_t batch_num = 1)
+{
+ viennacl::fft(input, output, batch_num, NumericT(1.0));
+ viennacl::linalg::normalize(output);
+}
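+
+// Usage sketch (illustrative only): out-of-place forward and inverse transform of an
+// interleaved complex vector.
+//
+//   viennacl::vector<float> signal(2 * 32), spectrum(2 * 32), restored(2 * 32);
+//   /* ... fill 'signal' ... */
+//   viennacl::fft(signal, spectrum);      // forward transform
+//   viennacl::ifft(spectrum, restored);   // inverse transform incl. normalization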
+
+namespace linalg
+{
+ /**
+ * @brief 1-D convolution of two vectors.
+ *
+ * This function does not modify the input vectors.
+ *
+ * @param input1 Input vector #1.
+ * @param input2 Input vector #2.
+ * @param output Output vector.
+ */
+ template<class NumericT, unsigned int AlignmentV>
+ void convolve(viennacl::vector<NumericT, AlignmentV>& input1,
+ viennacl::vector<NumericT, AlignmentV>& input2,
+ viennacl::vector<NumericT, AlignmentV>& output)
+ {
+ assert(input1.size() == input2.size());
+ assert(input1.size() == output.size());
+ // temporary buffers for the transformed inputs and their product
+ viennacl::vector<NumericT, AlignmentV> tmp1(input1.size());
+ viennacl::vector<NumericT, AlignmentV> tmp2(input2.size());
+ viennacl::vector<NumericT, AlignmentV> tmp3(output.size());
+
+ // FFT of both input vectors
+ viennacl::fft(input1, tmp1);
+ viennacl::fft(input2, tmp2);
+
+ // pointwise complex multiplication in the frequency domain
+ viennacl::linalg::multiply_complex(tmp1, tmp2, tmp3);
+ // inverse FFT of the product
+ viennacl::ifft(tmp3, output);
+ }
+
+ /**
+ * @brief 1-D convolution of two vectors.
+ *
+ * This function may overwrite the input vectors in order to avoid additional memory allocations.
+ *
+ * @param input1 Input vector #1.
+ * @param input2 Input vector #2.
+ * @param output Output vector.
+ */
+ template<class NumericT, unsigned int AlignmentV>
+ void convolve_i(viennacl::vector<NumericT, AlignmentV>& input1,
+ viennacl::vector<NumericT, AlignmentV>& input2,
+ viennacl::vector<NumericT, AlignmentV>& output)
+ {
+ assert(input1.size() == input2.size());
+ assert(input1.size() == output.size());
+
+ viennacl::inplace_fft(input1);
+ viennacl::inplace_fft(input2);
+
+ viennacl::linalg::multiply_complex(input1, input2, output);
+
+ viennacl::inplace_ifft(output);
+ }
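+
+ // Usage sketch (illustrative only): circular convolution of two interleaved complex
+ // vectors of equal length.
+ //
+ //   viennacl::vector<float> a(2 * 64), b(2 * 64), c(2 * 64);
+ //   /* ... fill a and b ... */
+ //   viennacl::linalg::convolve(a, b, c);     // inputs left untouched
+ //   viennacl::linalg::convolve_i(a, b, c);   // may overwrite a and b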
+} //namespace linalg
+} //namespace viennacl
+
+/// @endcond
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/coordinate_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/coordinate_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/coordinate_matrix.hpp
new file mode 100644
index 0000000..2f67a5b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/coordinate_matrix.hpp
@@ -0,0 +1,405 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_COORDINATE_MATRIX_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_COORDINATE_MATRIX_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+#include "viennacl/linalg/opencl/common.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/coordinate_matrix.hpp
+ * @brief OpenCL kernel file for coordinate_matrix operations */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+//////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
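+// The generated "vec_mul" kernel computes result = alpha * A * x + beta * result for a
+// coordinate (COO) matrix: each work-group walks the nonzeros in the index range
+// [group_boundaries[gid], group_boundaries[gid+1]), multiplies them with the matching
+// entries of x, and accumulates per-row partial sums via a segmented parallel reduction
+// in local memory, carrying partial sums of rows that straddle a chunk boundary into the
+// next iteration.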
+template<typename StringT>
+void generate_coordinate_matrix_vec_mul(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void vec_mul( \n");
+ source.append(" __global const uint2 * coords, \n");//(row_index, column_index)
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" __global const uint * group_boundaries, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * x, \n");
+ source.append(" uint4 layout_x, \n");
+ source.append(" "); source.append(numeric_string); source.append(" alpha, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
+ source.append(" uint4 layout_result, \n");
+ source.append(" "); source.append(numeric_string); source.append(" beta, \n");
+ source.append(" __local unsigned int * shared_rows, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * inter_results) \n");
+ source.append("{ \n");
+ source.append(" uint2 tmp; \n");
+ source.append(" "); source.append(numeric_string); source.append(" val; \n");
+ source.append(" uint group_start = group_boundaries[get_group_id(0)]; \n");
+ source.append(" uint group_end = group_boundaries[get_group_id(0) + 1]; \n");
+ source.append(" uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n"); // -1 in order to have correct behavior if group_end - group_start == j * get_local_size(0)
+
+ source.append(" uint local_index = 0; \n");
+
+ source.append(" for (uint k = 0; k < k_end; ++k) { \n");
+ source.append(" local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");
+
+ source.append(" tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
+ source.append(" val = (local_index < group_end) ? elements[local_index] * x[tmp.y * layout_x.y + layout_x.x] : 0; \n");
+
+ //check for carry from previous loop run:
+ source.append(" if (get_local_id(0) == 0 && k > 0) { \n");
+ source.append(" if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
+ source.append(" val += inter_results[get_local_size(0)-1]; \n");
+ source.append(" else if (beta != 0) \n");
+ source.append(" result[shared_rows[get_local_size(0)-1] * layout_result.y + layout_result.x] += alpha * inter_results[get_local_size(0)-1]; \n");
+ source.append(" else \n");
+ source.append(" result[shared_rows[get_local_size(0)-1] * layout_result.y + layout_result.x] = alpha * inter_results[get_local_size(0)-1]; \n");
+ source.append(" } \n");
+
+ //segmented parallel reduction begin
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" shared_rows[get_local_id(0)] = tmp.x; \n");
+ source.append(" inter_results[get_local_id(0)] = val; \n");
+ source.append(" "); source.append(numeric_string); source.append(" left = 0; \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ source.append(" for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
+ source.append(" left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" inter_results[get_local_id(0)] += left; \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" } \n");
+ //segmented parallel reduction end
+
+ source.append(" if (local_index < group_end - 1 && get_local_id(0) < get_local_size(0) - 1 && \n");
+ source.append(" shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
+ source.append(" if (beta != 0) result[tmp.x * layout_result.y + layout_result.x] += alpha * inter_results[get_local_id(0)]; \n");
+ source.append(" else result[tmp.x * layout_result.y + layout_result.x] = alpha * inter_results[get_local_id(0)]; \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" } \n"); //for k
+
+ source.append(" if (local_index + 1 == group_end) {\n"); //write results of last active entry (this may not necessarily be the case already)
+ source.append(" if (beta != 0) result[tmp.x * layout_result.y + layout_result.x] += alpha * inter_results[get_local_id(0)]; \n");
+ source.append(" else result[tmp.x * layout_result.y + layout_result.x] = alpha * inter_results[get_local_id(0)]; \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+}
+
+namespace detail
+{
+ /** @brief Generate kernel for C = A * B with A being a coordinate_matrix, B and C dense */
+ template<typename StringT>
+ void generate_coordinate_matrix_dense_matrix_mul(StringT & source, std::string const & numeric_string,
+ bool B_transposed, bool B_row_major, bool C_row_major)
+ {
+ source.append("__kernel void ");
+ source.append(viennacl::linalg::opencl::detail::sparse_dense_matmult_kernel_name(B_transposed, B_row_major, C_row_major));
+ source.append("( \n");
+ source.append(" __global const uint2 * coords, \n");//(row_index, column_index)
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" __global const uint * group_boundaries, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * d_mat, \n");
+ source.append(" unsigned int d_mat_row_start, \n");
+ source.append(" unsigned int d_mat_col_start, \n");
+ source.append(" unsigned int d_mat_row_inc, \n");
+ source.append(" unsigned int d_mat_col_inc, \n");
+ source.append(" unsigned int d_mat_row_size, \n");
+ source.append(" unsigned int d_mat_col_size, \n");
+ source.append(" unsigned int d_mat_internal_rows, \n");
+ source.append(" unsigned int d_mat_internal_cols, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
+ source.append(" unsigned int result_row_start, \n");
+ source.append(" unsigned int result_col_start, \n");
+ source.append(" unsigned int result_row_inc, \n");
+ source.append(" unsigned int result_col_inc, \n");
+ source.append(" unsigned int result_row_size, \n");
+ source.append(" unsigned int result_col_size, \n");
+ source.append(" unsigned int result_internal_rows, \n");
+ source.append(" unsigned int result_internal_cols, \n");
+ source.append(" __local unsigned int * shared_rows, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * inter_results) \n");
+ source.append("{ \n");
+ source.append(" uint2 tmp; \n");
+ source.append(" "); source.append(numeric_string); source.append(" val; \n");
+ source.append(" uint group_start = group_boundaries[get_group_id(0)]; \n");
+ source.append(" uint group_end = group_boundaries[get_group_id(0) + 1]; \n");
+ source.append(" uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n"); // -1 in order to have correct behavior if group_end - group_start == j * get_local_size(0)
+
+ source.append(" uint local_index = 0; \n");
+
+ source.append(" for (uint result_col = 0; result_col < result_col_size; ++result_col) { \n");
+ source.append(" for (uint k = 0; k < k_end; ++k) { \n");
+ source.append(" local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");
+
+ source.append(" tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
+ if (B_transposed && B_row_major)
+ source.append(" val = (local_index < group_end) ? elements[local_index] * d_mat[ (d_mat_row_start + result_col * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + tmp.y * d_mat_col_inc ] : 0; \n");
+ else if (B_transposed && !B_row_major)
+ source.append(" val = (local_index < group_end) ? elements[local_index] * d_mat[ (d_mat_row_start + result_col * d_mat_row_inc) + (d_mat_col_start + tmp.y * d_mat_col_inc) * d_mat_internal_rows ] : 0; \n");
+ else if (!B_transposed && B_row_major)
+ source.append(" val = (local_index < group_end) ? elements[local_index] * d_mat[ (d_mat_row_start + tmp.y * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + result_col * d_mat_col_inc ] : 0; \n");
+ else
+ source.append(" val = (local_index < group_end) ? elements[local_index] * d_mat[ (d_mat_row_start + tmp.y * d_mat_row_inc) + (d_mat_col_start + result_col * d_mat_col_inc) * d_mat_internal_rows ] : 0; \n");
+
+ //check for carry from previous loop run:
+ source.append(" if (get_local_id(0) == 0 && k > 0) { \n");
+ source.append(" if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
+ source.append(" val += inter_results[get_local_size(0)-1]; \n");
+ source.append(" else \n");
+ if (C_row_major)
+ source.append(" result[(shared_rows[get_local_size(0)-1] * result_row_inc + result_row_start) * result_internal_cols + result_col_start + result_col * result_col_inc ] = inter_results[get_local_size(0)-1]; \n");
+ else
+ source.append(" result[(shared_rows[get_local_size(0)-1] * result_row_inc + result_row_start) + (result_col_start + result_col * result_col_inc) * result_internal_rows ] = inter_results[get_local_size(0)-1]; \n");
+ source.append(" } \n");
+
+ //segmented parallel reduction begin
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" shared_rows[get_local_id(0)] = tmp.x; \n");
+ source.append(" inter_results[get_local_id(0)] = val; \n");
+ source.append(" "); source.append(numeric_string); source.append(" left = 0; \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ source.append(" for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
+ source.append(" left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" inter_results[get_local_id(0)] += left; \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" } \n");
+ //segmented parallel reduction end
+
+ source.append(" if (local_index < group_end && get_local_id(0) < get_local_size(0) - 1 && \n");
+ source.append(" shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
+ if (C_row_major)
+ source.append(" result[(tmp.x * result_row_inc + result_row_start) * result_internal_cols + result_col_start + result_col * result_col_inc ] = inter_results[get_local_id(0)]; \n");
+ else
+ source.append(" result[(tmp.x * result_row_inc + result_row_start) + (result_col_start + result_col * result_col_inc) * result_internal_rows ] = inter_results[get_local_id(0)]; \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" } \n"); //for k
+
+ source.append(" if (local_index + 1 == group_end) \n"); //write results of last active entry (this may not necessarily be the case already)
+ if (C_row_major)
+ source.append(" result[(tmp.x * result_row_inc + result_row_start) * result_internal_cols + result_col_start + result_col * result_col_inc ] = inter_results[get_local_id(0)]; \n");
+ else
+ source.append(" result[(tmp.x * result_row_inc + result_row_start) + (result_col_start + result_col * result_col_inc) * result_internal_rows ] = inter_results[get_local_id(0)]; \n");
+ source.append(" } \n"); //for result_col
+ source.append("} \n");
+
+ }
+}
+
+template<typename StringT>
+void generate_coordinate_matrix_dense_matrix_multiplication(StringT & source, std::string const & numeric_string)
+{
+ detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, false, false, false);
+ detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, false, false, true);
+ detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, false, true, false);
+ detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, false, true, true);
+
+ detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, true, false, false);
+ detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, true, false, true);
+ detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, true, true, false);
+ detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, true, true, true);
+}
+
+template<typename StringT>
+void generate_coordinate_matrix_row_info_extractor(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void row_info_extractor( \n");
+ source.append(" __global const uint2 * coords, \n");//(row_index, column_index)
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" __global const uint * group_boundaries, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
+ source.append(" unsigned int option, \n");
+ source.append(" __local unsigned int * shared_rows, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * inter_results) \n");
+ source.append("{ \n");
+ source.append(" uint2 tmp; \n");
+ source.append(" "); source.append(numeric_string); source.append(" val; \n");
+ source.append(" uint last_index = get_local_size(0) - 1; \n");
+ source.append(" uint group_start = group_boundaries[get_group_id(0)]; \n");
+ source.append(" uint group_end = group_boundaries[get_group_id(0) + 1]; \n");
+ source.append(" uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : ("); source.append(numeric_string); source.append(")0; \n"); // -1 in order to have correct behavior if group_end - group_start == j * get_local_size(0)
+
+ source.append(" uint local_index = 0; \n");
+
+ source.append(" for (uint k = 0; k < k_end; ++k) \n");
+ source.append(" { \n");
+ source.append(" local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");
+
+ source.append(" tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
+ source.append(" val = (local_index < group_end && (option != 3 || tmp.x == tmp.y) ) ? elements[local_index] : 0; \n");
+
+ //check for carry from previous loop run:
+ source.append(" if (get_local_id(0) == 0 && k > 0) \n");
+ source.append(" { \n");
+ source.append(" if (tmp.x == shared_rows[last_index]) \n");
+ source.append(" { \n");
+ source.append(" switch (option) \n");
+ source.append(" { \n");
+ source.append(" case 0: \n"); //inf-norm
+ source.append(" case 3: \n"); //diagonal entry
+ source.append(" val = max(val, fabs(inter_results[last_index])); \n");
+ source.append(" break; \n");
+
+ source.append(" case 1: \n"); //1-norm
+ source.append(" val = fabs(val) + inter_results[last_index]; \n");
+ source.append(" break; \n");
+
+ source.append(" case 2: \n"); //2-norm
+ source.append(" val = sqrt(val * val + inter_results[last_index]); \n");
+ source.append(" break; \n");
+
+ source.append(" default: \n");
+ source.append(" break; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append(" else \n");
+ source.append(" { \n");
+ source.append(" switch (option) \n");
+ source.append(" { \n");
+ source.append(" case 0: \n"); //inf-norm
+ source.append(" case 1: \n"); //1-norm
+ source.append(" case 3: \n"); //diagonal entry
+ source.append(" result[shared_rows[last_index]] = inter_results[last_index]; \n");
+ source.append(" break; \n");
+
+ source.append(" case 2: \n"); //2-norm
+ source.append(" result[shared_rows[last_index]] = sqrt(inter_results[last_index]); \n");
+ source.append(" default: \n");
+ source.append(" break; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ //segmented parallel reduction begin
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" shared_rows[get_local_id(0)] = tmp.x; \n");
+ source.append(" switch (option) \n");
+ source.append(" { \n");
+ source.append(" case 0: \n");
+ source.append(" case 3: \n");
+ source.append(" inter_results[get_local_id(0)] = val; \n");
+ source.append(" break; \n");
+ source.append(" case 1: \n");
+ source.append(" inter_results[get_local_id(0)] = fabs(val); \n");
+ source.append(" break; \n");
+ source.append(" case 2: \n");
+ source.append(" inter_results[get_local_id(0)] = val * val; \n");
+ source.append(" default: \n");
+ source.append(" break; \n");
+ source.append(" } \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ source.append(" for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) \n");
+ source.append(" { \n");
+ source.append(" "); source.append(numeric_string); source.append(" left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : ("); source.append(numeric_string); source.append(")0; \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" switch (option) \n");
+ source.append(" { \n");
+ source.append(" case 0: \n"); //inf-norm
+ source.append(" case 3: \n"); //diagonal entry
+ source.append(" inter_results[get_local_id(0)] = max(inter_results[get_local_id(0)], left); \n");
+ source.append(" break; \n");
+
+ source.append(" case 1: \n"); //1-norm
+ source.append(" inter_results[get_local_id(0)] += left; \n");
+ source.append(" break; \n");
+
+ source.append(" case 2: \n"); //2-norm
+ source.append(" inter_results[get_local_id(0)] += left; \n");
+ source.append(" break; \n");
+
+ source.append(" default: \n");
+ source.append(" break; \n");
+ source.append(" } \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" } \n");
+ //segmented parallel reduction end
+
+ source.append(" if (get_local_id(0) != last_index && \n");
+ source.append(" shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1] && \n");
+ source.append(" inter_results[get_local_id(0)] != 0) \n");
+ source.append(" { \n");
+ source.append(" result[tmp.x] = (option == 2) ? sqrt(inter_results[get_local_id(0)]) : inter_results[get_local_id(0)]; \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" } \n"); //for k
+
+ source.append(" if (local_index + 1 == group_end && inter_results[get_local_id(0)] != 0) \n");
+ source.append(" result[tmp.x] = (option == 2) ? sqrt(inter_results[get_local_id(0)]) : inter_results[get_local_id(0)]; \n");
+ source.append("} \n");
+}
+
+//////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for coordinate_matrix. */
+template<typename NumericT>
+struct coordinate_matrix
+{
+ static std::string program_name()
+ {
+ return viennacl::ocl::type_to_string<NumericT>::apply() + "_coordinate_matrix";
+ }
+
+ static void init(viennacl::ocl::context & ctx)
+ {
+ static std::map<cl_context, bool> init_done;
+ if (!init_done[ctx.handle().get()])
+ {
+ viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+ std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+ std::string source;
+ source.reserve(1024);
+
+ viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+ generate_coordinate_matrix_vec_mul(source, numeric_string);
+ generate_coordinate_matrix_dense_matrix_multiplication(source, numeric_string);
+ generate_coordinate_matrix_row_info_extractor(source, numeric_string);
+
+ std::string prog_name = program_name();
+ #ifdef VIENNACL_BUILD_INFO
+ std::cout << "Creating program " << prog_name << std::endl;
+ #endif
+ ctx.add_program(source, prog_name);
+ init_done[ctx.handle().get()] = true;
+ } //if
+ } //init
+};
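+
+// Usage sketch (illustrative only): the program is registered with the context under
+// program_name(), so the generated kernels can be fetched by name afterwards, e.g.
+//
+//   viennacl::ocl::context & ctx = viennacl::ocl::current_context();
+//   viennacl::linalg::opencl::kernels::coordinate_matrix<float>::init(ctx);
+//   viennacl::ocl::kernel & k = ctx.get_kernel(
+//     viennacl::linalg::opencl::kernels::coordinate_matrix<float>::program_name(), "vec_mul");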
+
+} // namespace kernels
+} // namespace opencl
+} // namespace linalg
+} // namespace viennacl
+#endif
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/ell_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/ell_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/ell_matrix.hpp
new file mode 100644
index 0000000..23c6af9
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/ell_matrix.hpp
@@ -0,0 +1,221 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_ELL_MATRIX_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_ELL_MATRIX_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+#include "viennacl/linalg/opencl/common.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/ell_matrix.hpp
+ * @brief OpenCL kernel file for ell_matrix operations */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+//////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
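+// The generated "vec_mul" kernel computes a sparse matrix-vector product for an ELL
+// matrix: each work item walks over the rows in a grid-stride loop, reads the fixed
+// items_per_row entries of its row (padding entries hold a zero value and are skipped),
+// and writes the accumulated dot product to the result vector; the vec_mul_alpha_beta
+// variant additionally scales with alpha and beta.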
+template<typename StringT>
+void generate_ell_vec_mul(StringT & source, std::string const & numeric_string, bool with_alpha_beta)
+{
+ if (with_alpha_beta)
+ source.append("__kernel void vec_mul_alpha_beta( \n");
+ else
+ source.append("__kernel void vec_mul( \n");
+ source.append(" __global const unsigned int * coords, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * x, \n");
+ source.append(" uint4 layout_x, \n");
+ if (with_alpha_beta) { source.append(" "); source.append(numeric_string); source.append(" alpha, \n"); }
+ source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
+ source.append(" uint4 layout_result, \n");
+ if (with_alpha_beta) { source.append(" "); source.append(numeric_string); source.append(" beta, \n"); }
+ source.append(" unsigned int row_num, \n");
+ source.append(" unsigned int col_num, \n");
+ source.append(" unsigned int internal_row_num, \n");
+ source.append(" unsigned int items_per_row, \n");
+ source.append(" unsigned int aligned_items_per_row) \n");
+ source.append("{ \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint glb_sz = get_global_size(0); \n");
+
+ source.append(" for (uint row_id = glb_id; row_id < row_num; row_id += glb_sz) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n");
+
+ source.append(" uint offset = row_id; \n");
+ source.append(" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" val = elements[offset]; \n");
+
+ source.append(" if (val != 0.0f) { \n");
+ source.append(" int col = coords[offset]; \n");
+ source.append(" sum += (x[col * layout_x.y + layout_x.x] * val); \n");
+ source.append(" } \n");
+
+ source.append(" } \n");
+
+ if (with_alpha_beta)
+ source.append(" result[row_id * layout_result.y + layout_result.x] = alpha * sum + ((beta != 0) ? beta * result[row_id * layout_result.y + layout_result.x] : 0); \n");
+ else
+ source.append(" result[row_id * layout_result.y + layout_result.x] = sum; \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+namespace detail
+{
+ template<typename StringT>
+ void generate_ell_matrix_dense_matrix_mul(StringT & source, std::string const & numeric_string,
+ bool B_transposed, bool B_row_major, bool C_row_major)
+ {
+ source.append("__kernel void ");
+ source.append(viennacl::linalg::opencl::detail::sparse_dense_matmult_kernel_name(B_transposed, B_row_major, C_row_major));
+ source.append("( \n");
+ source.append(" __global const unsigned int * sp_mat_coords, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * sp_mat_elems, \n");
+ source.append(" unsigned int sp_mat_row_num, \n");
+ source.append(" unsigned int sp_mat_col_num, \n");
+ source.append(" unsigned int sp_mat_internal_row_num, \n");
+ source.append(" unsigned int sp_mat_items_per_row, \n");
+ source.append(" unsigned int sp_mat_aligned_items_per_row, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append("* d_mat, \n");
+ source.append(" unsigned int d_mat_row_start, \n");
+ source.append(" unsigned int d_mat_col_start, \n");
+ source.append(" unsigned int d_mat_row_inc, \n");
+ source.append(" unsigned int d_mat_col_inc, \n");
+ source.append(" unsigned int d_mat_row_size, \n");
+ source.append(" unsigned int d_mat_col_size, \n");
+ source.append(" unsigned int d_mat_internal_rows, \n");
+ source.append(" unsigned int d_mat_internal_cols, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
+ source.append(" unsigned int result_row_start, \n");
+ source.append(" unsigned int result_col_start, \n");
+ source.append(" unsigned int result_row_inc, \n");
+ source.append(" unsigned int result_col_inc, \n");
+ source.append(" unsigned int result_row_size, \n");
+ source.append(" unsigned int result_col_size, \n");
+ source.append(" unsigned int result_internal_rows, \n");
+ source.append(" unsigned int result_internal_cols) { \n");
+
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint glb_sz = get_global_size(0); \n");
+
+ source.append(" for ( uint rc = glb_id; rc < (sp_mat_row_num * result_col_size); rc += glb_sz) { \n");
+ source.append(" uint row = rc % sp_mat_row_num; \n");
+ source.append(" uint col = rc / sp_mat_row_num; \n");
+
+ source.append(" uint offset = row; \n");
+ source.append(" "); source.append(numeric_string); source.append(" r = ("); source.append(numeric_string); source.append(")0; \n");
+
+ source.append(" for ( uint k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num) { \n");
+
+ source.append(" uint j = sp_mat_coords[offset]; \n");
+ source.append(" "); source.append(numeric_string); source.append(" x = sp_mat_elems[offset]; \n");
+
+ source.append(" if (x != ("); source.append(numeric_string); source.append(")0) { \n");
+ source.append(" "); source.append(numeric_string);
+ if (B_transposed && B_row_major)
+ source.append(" y = d_mat[ (d_mat_row_start + col * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + j * d_mat_col_inc ]; \n");
+ else if (B_transposed && !B_row_major)
+ source.append(" y = d_mat[ (d_mat_row_start + col * d_mat_row_inc) + (d_mat_col_start + j * d_mat_col_inc) * d_mat_internal_rows ]; \n");
+ else if (!B_transposed && B_row_major)
+ source.append(" y = d_mat[ (d_mat_row_start + j * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + col * d_mat_col_inc ]; \n");
+ else
+ source.append(" y = d_mat[ (d_mat_row_start + j * d_mat_row_inc) + (d_mat_col_start + col * d_mat_col_inc) * d_mat_internal_rows ]; \n");
+
+ source.append(" r += x*y; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ if (C_row_major)
+ source.append(" result[ (result_row_start + row * result_row_inc) * result_internal_cols + result_col_start + col * result_col_inc ] = r; \n");
+ else
+ source.append(" result[ (result_row_start + row * result_row_inc) + (result_col_start + col * result_col_inc) * result_internal_rows ] = r; \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+ }
+}
+
+template<typename StringT>
+void generate_ell_matrix_dense_matrix_multiplication(StringT & source, std::string const & numeric_string)
+{
+ detail::generate_ell_matrix_dense_matrix_mul(source, numeric_string, false, false, false);
+ detail::generate_ell_matrix_dense_matrix_mul(source, numeric_string, false, false, true);
+ detail::generate_ell_matrix_dense_matrix_mul(source, numeric_string, false, true, false);
+ detail::generate_ell_matrix_dense_matrix_mul(source, numeric_string, false, true, true);
+
+ detail::generate_ell_matrix_dense_matrix_mul(source, numeric_string, true, false, false);
+ detail::generate_ell_matrix_dense_matrix_mul(source, numeric_string, true, false, true);
+ detail::generate_ell_matrix_dense_matrix_mul(source, numeric_string, true, true, false);
+ detail::generate_ell_matrix_dense_matrix_mul(source, numeric_string, true, true, true);
+}
+
+//////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for ell_matrix. */
+template<typename NumericT>
+struct ell_matrix
+{
+ static std::string program_name()
+ {
+ return viennacl::ocl::type_to_string<NumericT>::apply() + "_ell_matrix";
+ }
+
+ static void init(viennacl::ocl::context & ctx)
+ {
+ static std::map<cl_context, bool> init_done;
+ if (!init_done[ctx.handle().get()])
+ {
+ viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+ std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+ std::string source;
+ source.reserve(1024);
+
+ viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+ // fully parameterized kernels:
+ generate_ell_vec_mul(source, numeric_string, true);
+ generate_ell_vec_mul(source, numeric_string, false);
+ generate_ell_matrix_dense_matrix_multiplication(source, numeric_string);
+
+ std::string prog_name = program_name();
+ #ifdef VIENNACL_BUILD_INFO
+ std::cout << "Creating program " << prog_name << std::endl;
+ #endif
+ ctx.add_program(source, prog_name);
+ init_done[ctx.handle().get()] = true;
+ } //if
+ } //init
+};
+
+} // namespace kernels
+} // namespace opencl
+} // namespace linalg
+} // namespace viennacl
+#endif
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/fft.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/fft.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/fft.hpp
new file mode 100644
index 0000000..1447bd1
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/fft.hpp
@@ -0,0 +1,311 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_FFT_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_FFT_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/fft.hpp
+ * @brief OpenCL kernel file for FFT operations */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+//////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+
+// Postprocessing phase of Bluestein algorithm
+template<typename StringT>
+void generate_fft_bluestein_post(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void bluestein_post(__global "); source.append(numeric_string); source.append("2 *Z, \n");
+ source.append(" __global "); source.append(numeric_string); source.append("2 *out, \n");
+ source.append(" unsigned int size) \n");
+ source.append("{ \n");
+ source.append(" unsigned int glb_id = get_global_id(0); \n");
+ source.append(" unsigned int glb_sz = get_global_size(0); \n");
+
+ source.append(" unsigned int double_size = size << 1; \n");
+ source.append(" "); source.append(numeric_string); source.append(" sn_a, cs_a; \n");
+ source.append(" const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
+
+ source.append(" for (unsigned int i = glb_id; i < size; i += glb_sz) { \n");
+ source.append(" unsigned int rm = i * i % (double_size); \n");
+ source.append(" "); source.append(numeric_string); source.append(" angle = ("); source.append(numeric_string); source.append(")rm / size * (-NUM_PI); \n");
+
+ source.append(" sn_a = sincos(angle, &cs_a); \n");
+
+ source.append(" "); source.append(numeric_string); source.append("2 b_i = ("); source.append(numeric_string); source.append("2)(cs_a, sn_a); \n");
+ source.append(" out[i] = ("); source.append(numeric_string); source.append("2)(Z[i].x * b_i.x - Z[i].y * b_i.y, Z[i].x * b_i.y + Z[i].y * b_i.x); \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+// Preprocessing phase of Bluestein algorithm
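+// (For orientation: the kernel multiplies each input value by the chirp
+// exp(-i*pi*k^2/size) and stores the conjugate chirp, mirrored, in B, so that a
+// transform of arbitrary length can be evaluated as a convolution of length ext_size.)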
+template<typename StringT>
+void generate_fft_bluestein_pre(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void bluestein_pre(__global "); source.append(numeric_string); source.append("2 *input, \n");
+ source.append(" __global "); source.append(numeric_string); source.append("2 *A, \n");
+ source.append(" __global "); source.append(numeric_string); source.append("2 *B, \n");
+ source.append(" unsigned int size, \n");
+ source.append(" unsigned int ext_size \n");
+ source.append(" ) { \n");
+ source.append(" unsigned int glb_id = get_global_id(0); \n");
+ source.append(" unsigned int glb_sz = get_global_size(0); \n");
+
+ source.append(" unsigned int double_size = size << 1; \n");
+
+ source.append(" "); source.append(numeric_string); source.append(" sn_a, cs_a; \n");
+ source.append(" const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
+
+ source.append(" for (unsigned int i = glb_id; i < size; i += glb_sz) { \n");
+ source.append(" unsigned int rm = i * i % (double_size); \n");
+ source.append(" "); source.append(numeric_string); source.append(" angle = ("); source.append(numeric_string); source.append(")rm / size * NUM_PI; \n");
+
+ source.append(" sn_a = sincos(-angle, &cs_a); \n");
+
+ source.append(" "); source.append(numeric_string); source.append("2 a_i = ("); source.append(numeric_string); source.append("2)(cs_a, sn_a); \n");
+ source.append(" "); source.append(numeric_string); source.append("2 b_i = ("); source.append(numeric_string); source.append("2)(cs_a, -sn_a); \n");
+
+ source.append(" A[i] = ("); source.append(numeric_string); source.append("2)(input[i].x * a_i.x - input[i].y * a_i.y, input[i].x * a_i.y + input[i].y * a_i.x); \n");
+ source.append(" B[i] = b_i; \n");
+
+ // guard avoids an out-of-bounds write for i == 0; the divergent branch should eventually be reworked
+ source.append(" if (i) \n");
+ source.append(" B[ext_size - i] = b_i; \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+/** @brief Extracts the real part of a complex vector */
+template<typename StringT>
+void generate_fft_complex_to_real(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void complex_to_real(__global "); source.append(numeric_string); source.append("2 *in, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *out, \n");
+ source.append(" unsigned int size) { \n");
+ source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n");
+ source.append(" out[i] = in[i].x; \n");
+ source.append("} \n");
+}
+
+/** @brief OpenCL kernel generation code for dividing a complex vector by a real scalar */
+template<typename StringT>
+void generate_fft_div_vec_scalar(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void fft_div_vec_scalar(__global "); source.append(numeric_string); source.append("2 *input1, \n");
+ source.append(" unsigned int size, \n");
+ source.append(" "); source.append(numeric_string); source.append(" factor) { \n");
+ source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n");
+ source.append(" input1[i] /= factor; \n");
+ source.append("} \n");
+}
+
+/** @brief Elementwise product of two complex vectors */
+template<typename StringT>
+void generate_fft_mult_vec(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void fft_mult_vec(__global const "); source.append(numeric_string); source.append("2 *input1, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append("2 *input2, \n");
+ source.append(" __global "); source.append(numeric_string); source.append("2 *output, \n");
+ source.append(" unsigned int size) { \n");
+ source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
+ source.append(" "); source.append(numeric_string); source.append("2 in1 = input1[i]; \n");
+ source.append(" "); source.append(numeric_string); source.append("2 in2 = input2[i]; \n");
+
+ source.append(" output[i] = ("); source.append(numeric_string); source.append("2)(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x); \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+/** @brief Embeds a real-valued vector into a complex one */
+template<typename StringT>
+void generate_fft_real_to_complex(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void real_to_complex(__global "); source.append(numeric_string); source.append(" *in, \n");
+ source.append(" __global "); source.append(numeric_string); source.append("2 *out, \n");
+ source.append(" unsigned int size) { \n");
+ source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
+ source.append(" "); source.append(numeric_string); source.append("2 val = 0; \n");
+ source.append(" val.x = in[i]; \n");
+ source.append(" out[i] = val; \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+/** @brief Reverses the entries in a vector */
+template<typename StringT>
+void generate_fft_reverse_inplace(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void reverse_inplace(__global "); source.append(numeric_string); source.append(" *vec, uint size) { \n");
+ source.append(" for (uint i = get_global_id(0); i < (size >> 1); i+=get_global_size(0)) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" val1 = vec[i]; \n");
+ source.append(" "); source.append(numeric_string); source.append(" val2 = vec[size - i - 1]; \n");
+
+ source.append(" vec[i] = val2; \n");
+ source.append(" vec[size - i - 1] = val1; \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+/** @brief Simplistic matrix transpose function */
+template<typename StringT>
+void generate_fft_transpose(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void transpose(__global "); source.append(numeric_string); source.append("2 *input, \n");
+ source.append(" __global "); source.append(numeric_string); source.append("2 *output, \n");
+ source.append(" unsigned int row_num, \n");
+ source.append(" unsigned int col_num) { \n");
+ source.append(" unsigned int size = row_num * col_num; \n");
+ source.append(" for (unsigned int i = get_global_id(0); i < size; i+= get_global_size(0)) { \n");
+ source.append(" unsigned int row = i / col_num; \n");
+ source.append(" unsigned int col = i - row*col_num; \n");
+
+ source.append(" unsigned int new_pos = col * row_num + row; \n");
+
+ source.append(" output[new_pos] = input[i]; \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+/** @brief Simplistic inplace matrix transpose function */
+template<typename StringT>
+void generate_fft_transpose_inplace(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void transpose_inplace(__global "); source.append(numeric_string); source.append("2* input, \n");
+ source.append(" unsigned int row_num, \n");
+ source.append(" unsigned int col_num) { \n");
+ source.append(" unsigned int size = row_num * col_num; \n");
+ source.append(" for (unsigned int i = get_global_id(0); i < size; i+= get_global_size(0)) { \n");
+ source.append(" unsigned int row = i / col_num; \n");
+ source.append(" unsigned int col = i - row*col_num; \n");
+
+ source.append(" unsigned int new_pos = col * row_num + row; \n");
+
+ source.append(" if (i < new_pos) { \n");
+ source.append(" "); source.append(numeric_string); source.append("2 val = input[i]; \n");
+ source.append(" input[i] = input[new_pos]; \n");
+ source.append(" input[new_pos] = val; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+/** @brief Computes the matrix-vector product with a Vandermonde matrix */
+template<typename StringT>
+void generate_fft_vandermonde_prod(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void vandermonde_prod(__global "); source.append(numeric_string); source.append(" *vander, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *vector, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *result, \n");
+ source.append(" uint size) { \n");
+ source.append(" for (uint i = get_global_id(0); i < size; i+= get_global_size(0)) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" mul = vander[i]; \n");
+ source.append(" "); source.append(numeric_string); source.append(" pwr = 1; \n");
+ source.append(" "); source.append(numeric_string); source.append(" val = 0; \n");
+
+ source.append(" for (uint j = 0; j < size; j++) { \n");
+ source.append(" val = val + pwr * vector[j]; \n");
+ source.append(" pwr *= mul; \n");
+ source.append(" } \n");
+
+ source.append(" result[i] = val; \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+/** @brief Zeros two complex vectors in a single kernel (avoids the overhead of two separate kernel launches) */
+template<typename StringT>
+void generate_fft_zero2(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void zero2(__global "); source.append(numeric_string); source.append("2 *input1, \n");
+ source.append(" __global "); source.append(numeric_string); source.append("2 *input2, \n");
+ source.append(" unsigned int size) { \n");
+ source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
+ source.append(" input1[i] = 0; \n");
+ source.append(" input2[i] = 0; \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+//////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for the fast Fourier transform. */
+template<typename NumericT>
+struct fft
+{
+ static std::string program_name()
+ {
+ return viennacl::ocl::type_to_string<NumericT>::apply() + "_fft";
+ }
+
+ static void init(viennacl::ocl::context & ctx)
+ {
+ static std::map<cl_context, bool> init_done;
+ if (!init_done[ctx.handle().get()])
+ {
+ viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+ std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+ std::string source;
+ source.reserve(8192);
+
+ viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+ // unary operations
+ if (numeric_string == "float" || numeric_string == "double")
+ {
+ generate_fft_bluestein_post(source, numeric_string);
+ generate_fft_bluestein_pre(source, numeric_string);
+ generate_fft_complex_to_real(source, numeric_string);
+ generate_fft_div_vec_scalar(source, numeric_string);
+ generate_fft_mult_vec(source, numeric_string);
+ generate_fft_real_to_complex(source, numeric_string);
+ generate_fft_reverse_inplace(source, numeric_string);
+ generate_fft_transpose(source, numeric_string);
+ generate_fft_transpose_inplace(source, numeric_string);
+ generate_fft_vandermonde_prod(source, numeric_string);
+ generate_fft_zero2(source, numeric_string);
+ }
+
+ std::string prog_name = program_name();
+ #ifdef VIENNACL_BUILD_INFO
+ std::cout << "Creating program " << prog_name << std::endl;
+ #endif
+ ctx.add_program(source, prog_name);
+ init_done[ctx.handle().get()] = true;
+ } //if
+ } //init
+};
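+
+// Note: these kernels are only generated for float/double contexts (see init() above);
+// they presumably back the OpenCL code paths of the user-facing FFT routines declared
+// in viennacl/fft.hpp and viennacl/linalg/fft_operations.hpp.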
+
+} // namespace kernels
+} // namespace opencl
+} // namespace linalg
+} // namespace viennacl
+#endif
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/hyb_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/hyb_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/hyb_matrix.hpp
new file mode 100644
index 0000000..83d1411
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/hyb_matrix.hpp
@@ -0,0 +1,240 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_HYB_MATRIX_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_HYB_MATRIX_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+#include "viennacl/linalg/opencl/common.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/hyb_matrix.hpp
+ * @brief OpenCL kernel file for hyb_matrix operations */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+//////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
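+// The generated "vec_mul" kernel computes a sparse matrix-vector product for a HYB
+// matrix: for each row, the regular part is read from the ELL arrays (ell_coords /
+// ell_elements, padded to items_per_row) and the irregular remainder from the CSR
+// arrays (csr_rows / csr_cols / csr_elements); the combined row sum is written to the
+// result vector, scaled by alpha and beta in the vec_mul_alpha_beta variant.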
+template<typename StringT>
+void generate_hyb_vec_mul(StringT & source, std::string const & numeric_string, bool with_alpha_beta)
+{
+ if (with_alpha_beta)
+ source.append("__kernel void vec_mul_alpha_beta( \n");
+ else
+ source.append("__kernel void vec_mul( \n");
+ source.append(" const __global int* ell_coords, \n");
+ source.append(" const __global "); source.append(numeric_string); source.append("* ell_elements, \n");
+ source.append(" const __global uint* csr_rows, \n");
+ source.append(" const __global uint* csr_cols, \n");
+ source.append(" const __global "); source.append(numeric_string); source.append("* csr_elements, \n");
+ source.append(" const __global "); source.append(numeric_string); source.append(" * x, \n");
+ source.append(" uint4 layout_x, \n");
+ if (with_alpha_beta) { source.append(" "); source.append(numeric_string); source.append(" alpha, \n"); }
+ source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
+ source.append(" uint4 layout_result, \n");
+ if (with_alpha_beta) { source.append(" "); source.append(numeric_string); source.append(" beta, \n"); }
+ source.append(" unsigned int row_num, \n");
+ source.append(" unsigned int internal_row_num, \n");
+ source.append(" unsigned int items_per_row, \n");
+ source.append(" unsigned int aligned_items_per_row) \n");
+ source.append("{ \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint glb_sz = get_global_size(0); \n");
+
+ source.append(" for (uint row_id = glb_id; row_id < row_num; row_id += glb_sz) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n");
+
+ source.append(" uint offset = row_id; \n");
+ source.append(" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" val = ell_elements[offset]; \n");
+
+ source.append(" if (val != ("); source.append(numeric_string); source.append(")0) { \n");
+ source.append(" int col = ell_coords[offset]; \n");
+ source.append(" sum += (x[col * layout_x.y + layout_x.x] * val); \n");
+ source.append(" } \n");
+
+ source.append(" } \n");
+
+ source.append(" uint col_begin = csr_rows[row_id]; \n");
+ source.append(" uint col_end = csr_rows[row_id + 1]; \n");
+
+ source.append(" for (uint item_id = col_begin; item_id < col_end; item_id++) { \n");
+ source.append(" sum += (x[csr_cols[item_id] * layout_x.y + layout_x.x] * csr_elements[item_id]); \n");
+ source.append(" } \n");
+
+ if (with_alpha_beta)
+ source.append(" result[row_id * layout_result.y + layout_result.x] = alpha * sum + ((beta != 0) ? beta * result[row_id * layout_result.y + layout_result.x] : 0); \n");
+ else
+ source.append(" result[row_id * layout_result.y + layout_result.x] = sum; \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+namespace detail
+{
+ template<typename StringT>
+ void generate_hyb_matrix_dense_matrix_mul(StringT & source, std::string const & numeric_string,
+ bool B_transposed, bool B_row_major, bool C_row_major)
+ {
+ source.append("__kernel void ");
+ source.append(viennacl::linalg::opencl::detail::sparse_dense_matmult_kernel_name(B_transposed, B_row_major, C_row_major));
+ source.append("( \n");
+ source.append(" const __global int* ell_coords, \n");
+ source.append(" const __global "); source.append(numeric_string); source.append("* ell_elements, \n");
+ source.append(" const __global uint* csr_rows, \n");
+ source.append(" const __global uint* csr_cols, \n");
+ source.append(" const __global "); source.append(numeric_string); source.append("* csr_elements, \n");
+ source.append(" unsigned int row_num, \n");
+ source.append(" unsigned int internal_row_num, \n");
+ source.append(" unsigned int items_per_row, \n");
+ source.append(" unsigned int aligned_items_per_row, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append("* d_mat, \n");
+ source.append(" unsigned int d_mat_row_start, \n");
+ source.append(" unsigned int d_mat_col_start, \n");
+ source.append(" unsigned int d_mat_row_inc, \n");
+ source.append(" unsigned int d_mat_col_inc, \n");
+ source.append(" unsigned int d_mat_row_size, \n");
+ source.append(" unsigned int d_mat_col_size, \n");
+ source.append(" unsigned int d_mat_internal_rows, \n");
+ source.append(" unsigned int d_mat_internal_cols, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
+ source.append(" unsigned int result_row_start, \n");
+ source.append(" unsigned int result_col_start, \n");
+ source.append(" unsigned int result_row_inc, \n");
+ source.append(" unsigned int result_col_inc, \n");
+ source.append(" unsigned int result_row_size, \n");
+ source.append(" unsigned int result_col_size, \n");
+ source.append(" unsigned int result_internal_rows, \n");
+ source.append(" unsigned int result_internal_cols) { \n");
+
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint glb_sz = get_global_size(0); \n");
+
+ source.append(" for (uint result_col = 0; result_col < result_col_size; ++result_col) { \n");
+ source.append(" for (uint row_id = glb_id; row_id < row_num; row_id += glb_sz) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n");
+
+ source.append(" uint offset = row_id; \n");
+ source.append(" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" val = ell_elements[offset]; \n");
+
+ source.append(" if (val != ("); source.append(numeric_string); source.append(")0) { \n");
+ source.append(" int col = ell_coords[offset]; \n");
+ if (B_transposed && B_row_major)
+ source.append(" sum += d_mat[ (d_mat_row_start + result_col * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + col * d_mat_col_inc ] * val; \n");
+ else if (B_transposed && !B_row_major)
+ source.append(" sum += d_mat[ (d_mat_row_start + result_col * d_mat_row_inc) + (d_mat_col_start + col * d_mat_col_inc) * d_mat_internal_rows ] * val; \n");
+ else if (!B_transposed && B_row_major)
+ source.append(" sum += d_mat[ (d_mat_row_start + col * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + result_col * d_mat_col_inc ] * val; \n");
+ else
+ source.append(" sum += d_mat[ (d_mat_row_start + col * d_mat_row_inc) + (d_mat_col_start + result_col * d_mat_col_inc) * d_mat_internal_rows ] * val; \n");
+ source.append(" } \n");
+
+ source.append(" } \n");
+
+ source.append(" uint col_begin = csr_rows[row_id]; \n");
+ source.append(" uint col_end = csr_rows[row_id + 1]; \n");
+
+ source.append(" for (uint item_id = col_begin; item_id < col_end; item_id++) { \n");
+ if (B_transposed && B_row_major)
+ source.append(" sum += d_mat[ (d_mat_row_start + result_col * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + csr_cols[item_id] * d_mat_col_inc ] * csr_elements[item_id]; \n");
+ else if (B_transposed && !B_row_major)
+ source.append(" sum += d_mat[ (d_mat_row_start + result_col * d_mat_row_inc) + (d_mat_col_start + csr_cols[item_id] * d_mat_col_inc) * d_mat_internal_rows ] * csr_elements[item_id]; \n");
+ else if (!B_transposed && B_row_major)
+ source.append(" sum += d_mat[ (d_mat_row_start + csr_cols[item_id] * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + result_col * d_mat_col_inc ] * csr_elements[item_id]; \n");
+ else
+ source.append(" sum += d_mat[ (d_mat_row_start + csr_cols[item_id] * d_mat_row_inc) + (d_mat_col_start + result_col * d_mat_col_inc) * d_mat_internal_rows ] * csr_elements[item_id]; \n");
+ source.append(" } \n");
+
+ if (C_row_major)
+ source.append(" result[ (result_row_start + row_id * result_row_inc) * result_internal_cols + result_col_start + result_col * result_col_inc ] = sum; \n");
+ else
+ source.append(" result[ (result_row_start + row_id * result_row_inc) + (result_col_start + result_col * result_col_inc) * result_internal_rows ] = sum; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append("} \n");
+ }
+}
+
+template<typename StringT>
+void generate_hyb_matrix_dense_matrix_multiplication(StringT & source, std::string const & numeric_string)
+{
+ detail::generate_hyb_matrix_dense_matrix_mul(source, numeric_string, false, false, false);
+ detail::generate_hyb_matrix_dense_matrix_mul(source, numeric_string, false, false, true);
+ detail::generate_hyb_matrix_dense_matrix_mul(source, numeric_string, false, true, false);
+ detail::generate_hyb_matrix_dense_matrix_mul(source, numeric_string, false, true, true);
+
+ detail::generate_hyb_matrix_dense_matrix_mul(source, numeric_string, true, false, false);
+ detail::generate_hyb_matrix_dense_matrix_mul(source, numeric_string, true, false, true);
+ detail::generate_hyb_matrix_dense_matrix_mul(source, numeric_string, true, true, false);
+ detail::generate_hyb_matrix_dense_matrix_mul(source, numeric_string, true, true, true);
+}
+
+//////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for hyb_matrix. */
+template<typename NumericT>
+struct hyb_matrix
+{
+ static std::string program_name()
+ {
+ return viennacl::ocl::type_to_string<NumericT>::apply() + "_hyb_matrix";
+ }
+
+ static void init(viennacl::ocl::context & ctx)
+ {
+ static std::map<cl_context, bool> init_done;
+ if (!init_done[ctx.handle().get()])
+ {
+ viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+ std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+ std::string source;
+ source.reserve(1024);
+
+ viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+ generate_hyb_vec_mul(source, numeric_string, true);
+ generate_hyb_vec_mul(source, numeric_string, false);
+ generate_hyb_matrix_dense_matrix_multiplication(source, numeric_string);
+
+ std::string prog_name = program_name();
+ #ifdef VIENNACL_BUILD_INFO
+ std::cout << "Creating program " << prog_name << std::endl;
+ #endif
+ ctx.add_program(source, prog_name);
+ init_done[ctx.handle().get()] = true;
+ } //if
+ } //init
+};
+
+} // namespace kernels
+} // namespace opencl
+} // namespace linalg
+} // namespace viennacl
+#endif
+
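For reference, the kernels generated above operate on ViennaCL's hybrid (ELL + CSR) sparse format: a dense ELL block, stored column-major and padded to internal_row_num rows, holds up to items_per_row entries per row, while the CSR arrays hold the overflow entries. The plain C++ sketch below reproduces the access pattern of the generated matrix-vector kernel (the with_alpha_beta == false case). It is an illustrative host-side analogue only, not part of the patch; the names simply mirror the kernel arguments.

#include <cstddef>
#include <vector>

// Host-side analogue of the generated hyb matrix-vector product (illustrative only).
void hyb_spmv_reference(std::size_t row_num, std::size_t internal_row_num, std::size_t items_per_row,
                        std::vector<int>      const & ell_coords,    // column index per ELL slot
                        std::vector<double>   const & ell_elements,  // ELL values, column-major, zero-padded
                        std::vector<unsigned> const & csr_rows,      // CSR row pointers for overflow entries
                        std::vector<unsigned> const & csr_cols,
                        std::vector<double>   const & csr_elements,
                        std::vector<double>   const & x,
                        std::vector<double>         & result)
{
  for (std::size_t row_id = 0; row_id < row_num; ++row_id)
  {
    double sum = 0;

    // ELL part: one slot per "column" of the padded block.
    std::size_t offset = row_id;
    for (std::size_t item_id = 0; item_id < items_per_row; ++item_id, offset += internal_row_num)
    {
      double val = ell_elements[offset];
      if (val != 0)
        sum += x[std::size_t(ell_coords[offset])] * val;
    }

    // CSR part: remaining entries of this row.
    for (unsigned item_id = csr_rows[row_id]; item_id < csr_rows[row_id + 1]; ++item_id)
      sum += x[csr_cols[item_id]] * csr_elements[item_id];

    result[row_id] = sum;
  }
}
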
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/ilu.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/ilu.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/ilu.hpp
new file mode 100644
index 0000000..bef778c
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/ilu.hpp
@@ -0,0 +1,505 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_ILU_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_ILU_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/ilu.hpp
+ * @brief OpenCL kernel file for incomplete LU factorization preconditioners */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+template<typename StringT>
+void generate_ilu_level_scheduling_substitute(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void level_scheduling_substitute( \n");
+ source.append(" __global const unsigned int * row_index_array, \n");
+ source.append(" __global const unsigned int * row_indices, \n");
+ source.append(" __global const unsigned int * column_indices, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * vec, \n");
+ source.append(" unsigned int size) \n");
+ source.append("{ \n");
+ source.append(" for (unsigned int row = get_global_id(0); \n");
+ source.append(" row < size; \n");
+ source.append(" row += get_global_size(0)) \n");
+ source.append(" { \n");
+ source.append(" unsigned int eq_row = row_index_array[row]; \n");
+ source.append(" "); source.append(numeric_string); source.append(" vec_entry = vec[eq_row]; \n");
+ source.append(" unsigned int row_end = row_indices[row+1]; \n");
+
+ source.append(" for (unsigned int j = row_indices[row]; j < row_end; ++j) \n");
+ source.append(" vec_entry -= vec[column_indices[j]] * elements[j]; \n");
+
+ source.append(" vec[eq_row] = vec_entry; \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+///////////// ICC ///////////////
+
+
+template<typename StringT>
+void generate_icc_extract_L_1(StringT & source)
+{
+ source.append("__kernel void extract_L_1( \n");
+ source.append(" __global unsigned int const *A_row_indices, \n");
+ source.append(" __global unsigned int const *A_col_indices, \n");
+ source.append(" unsigned int A_size1, \n");
+ source.append(" __global unsigned int *L_row_indices) { \n");
+
+ source.append(" for (unsigned int row = get_global_id(0); \n");
+ source.append(" row < A_size1; \n");
+ source.append(" row += get_global_size(0)) \n");
+ source.append(" { \n");
+ source.append(" unsigned int row_begin = A_row_indices[row]; \n");
+ source.append(" unsigned int row_end = A_row_indices[row+1]; \n");
+
+ source.append(" unsigned int num_entries_L = 0; \n");
+ source.append(" for (unsigned int j=row_begin; j<row_end; ++j) { \n");
+ source.append(" unsigned int col = A_col_indices[j]; \n");
+ source.append(" if (col <= row) ++num_entries_L; \n");
+ source.append(" } \n");
+
+ source.append(" L_row_indices[row] = num_entries_L; \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+template<typename StringT>
+void generate_icc_extract_L_2(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void extract_L_2( \n");
+ source.append(" __global unsigned int const *A_row_indices, \n");
+ source.append(" __global unsigned int const *A_col_indices, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" const *A_elements, \n");
+ source.append(" unsigned int A_size1, \n");
+ source.append(" __global unsigned int const *L_row_indices, \n");
+ source.append(" __global unsigned int *L_col_indices, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *L_elements) { \n");
+
+ source.append(" for (unsigned int row = get_global_id(0); \n");
+ source.append(" row < A_size1; \n");
+ source.append(" row += get_global_size(0)) \n");
+ source.append(" { \n");
+ source.append(" unsigned int row_begin = A_row_indices[row]; \n");
+ source.append(" unsigned int row_end = A_row_indices[row+1]; \n");
+
+ source.append(" unsigned int index_L = L_row_indices[row]; \n");
+ source.append(" for (unsigned int j=row_begin; j<row_end; ++j) { \n");
+ source.append(" unsigned int col = A_col_indices[j]; \n");
+ source.append(" "); source.append(numeric_string); source.append(" value = A_elements[j]; \n");
+
+ source.append(" if (col <= row) { \n");
+ source.append(" L_col_indices[index_L] = col; \n");
+ source.append(" L_elements[index_L] = value; \n");
+ source.append(" ++index_L; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+
+template<typename StringT>
+void generate_icc_chow_patel_sweep_kernel(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void icc_chow_patel_sweep_kernel( \n");
+ source.append(" __global unsigned int const *L_row_indices, \n");
+ source.append(" __global unsigned int const *L_col_indices, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *L_elements, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" const *L_backup, \n");
+ source.append(" unsigned int L_size1, \n");
+
+ source.append(" __global "); source.append(numeric_string); source.append(" const *aij_L) { \n");
+
+ source.append(" for (unsigned int row = get_global_id(0); \n");
+ source.append(" row < L_size1; \n");
+ source.append(" row += get_global_size(0)) \n");
+ source.append(" { \n");
+
+ //
+ // Update L:
+ //
+ source.append(" unsigned int row_Li_start = L_row_indices[row]; \n");
+ source.append(" unsigned int row_Li_end = L_row_indices[row + 1]; \n");
+
+ source.append(" for (unsigned int i = row_Li_start; i < row_Li_end; ++i) { \n");
+ source.append(" unsigned int col = L_col_indices[i]; \n");
+
+ source.append(" unsigned int row_Lj_start = L_row_indices[col]; \n");
+ source.append(" unsigned int row_Lj_end = L_row_indices[col + 1]; \n");
+
+ source.append(" unsigned int index_Lj = row_Lj_start; \n");
+ source.append(" unsigned int col_Lj = L_col_indices[index_Lj]; \n");
+
+ source.append(" "); source.append(numeric_string); source.append(" s = aij_L[i]; \n");
+ source.append(" for (unsigned int index_Li = row_Li_start; index_Li < i; ++index_Li) { \n");
+ source.append(" unsigned int col_Li = L_col_indices[index_Li]; \n");
+
+ source.append(" while (col_Lj < col_Li) { \n");
+ source.append(" ++index_Lj; \n");
+ source.append(" col_Lj = L_col_indices[index_Lj]; \n");
+ source.append(" } \n");
+
+ source.append(" if (col_Lj == col_Li) \n");
+ source.append(" s -= L_backup[index_Li] * L_backup[index_Lj]; \n");
+ source.append(" } \n");
+
+ // update l_ij:
+ source.append(" L_elements[i] = (row == col) ? sqrt(s) : (s / L_backup[row_Lj_end - 1]); \n");
+ source.append(" } \n");
+
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+
+///////////// ILU ///////////////
+
+template<typename StringT>
+void generate_ilu_extract_LU_1(StringT & source)
+{
+ source.append("__kernel void extract_LU_1( \n");
+ source.append(" __global unsigned int const *A_row_indices, \n");
+ source.append(" __global unsigned int const *A_col_indices, \n");
+ source.append(" unsigned int A_size1, \n");
+ source.append(" __global unsigned int *L_row_indices, \n");
+ source.append(" __global unsigned int *U_row_indices) { \n");
+
+ source.append(" for (unsigned int row = get_global_id(0); \n");
+ source.append(" row < A_size1; \n");
+ source.append(" row += get_global_size(0)) \n");
+ source.append(" { \n");
+ source.append(" unsigned int row_begin = A_row_indices[row]; \n");
+ source.append(" unsigned int row_end = A_row_indices[row+1]; \n");
+
+ source.append(" unsigned int num_entries_L = 0; \n");
+ source.append(" unsigned int num_entries_U = 0; \n");
+ source.append(" for (unsigned int j=row_begin; j<row_end; ++j) { \n");
+ source.append(" unsigned int col = A_col_indices[j]; \n");
+ source.append(" if (col <= row) ++num_entries_L; \n");
+ source.append(" if (col >= row) ++num_entries_U; \n");
+ source.append(" } \n");
+
+ source.append(" L_row_indices[row] = num_entries_L; \n");
+ source.append(" U_row_indices[row] = num_entries_U; \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+template<typename StringT>
+void generate_ilu_extract_LU_2(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void extract_LU_2( \n");
+ source.append(" __global unsigned int const *A_row_indices, \n");
+ source.append(" __global unsigned int const *A_col_indices, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" const *A_elements, \n");
+ source.append(" unsigned int A_size1, \n");
+ source.append(" __global unsigned int const *L_row_indices, \n");
+ source.append(" __global unsigned int *L_col_indices, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *L_elements, \n");
+ source.append(" __global unsigned int const *U_row_indices, \n");
+ source.append(" __global unsigned int *U_col_indices, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *U_elements) { \n");
+
+ source.append(" for (unsigned int row = get_global_id(0); \n");
+ source.append(" row < A_size1; \n");
+ source.append(" row += get_global_size(0)) \n");
+ source.append(" { \n");
+ source.append(" unsigned int row_begin = A_row_indices[row]; \n");
+ source.append(" unsigned int row_end = A_row_indices[row+1]; \n");
+
+ source.append(" unsigned int index_L = L_row_indices[row]; \n");
+ source.append(" unsigned int index_U = U_row_indices[row]; \n");
+ source.append(" for (unsigned int j=row_begin; j<row_end; ++j) { \n");
+ source.append(" unsigned int col = A_col_indices[j]; \n");
+ source.append(" "); source.append(numeric_string); source.append(" value = A_elements[j]; \n");
+
+ source.append(" if (col <= row) { \n");
+ source.append(" L_col_indices[index_L] = col; \n");
+ source.append(" L_elements[index_L] = value; \n");
+ source.append(" ++index_L; \n");
+ source.append(" } \n");
+ source.append(" if (col >= row) { \n");
+ source.append(" U_col_indices[index_U] = col; \n");
+ source.append(" U_elements[index_U] = value; \n");
+ source.append(" ++index_U; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+template<typename StringT>
+void generate_ilu_scale_kernel_1(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void ilu_scale_kernel_1( \n");
+ source.append(" __global unsigned int const *A_row_indices, \n");
+ source.append(" __global unsigned int const *A_col_indices, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" const *A_elements, \n");
+ source.append(" unsigned int A_size1, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *D_elements) { \n");
+
+ source.append(" for (unsigned int row = get_global_id(0); \n");
+ source.append(" row < A_size1; \n");
+ source.append(" row += get_global_size(0)) \n");
+ source.append(" { \n");
+ source.append(" unsigned int row_begin = A_row_indices[row]; \n");
+ source.append(" unsigned int row_end = A_row_indices[row+1]; \n");
+
+ source.append(" for (unsigned int j=row_begin; j<row_end; ++j) { \n");
+ source.append(" unsigned int col = A_col_indices[j]; \n");
+
+ source.append(" if (col == row) { \n");
+ source.append(" D_elements[row] = 1 / sqrt(fabs(A_elements[j])); \n");
+ source.append(" break; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+template<typename StringT>
+void generate_ilu_scale_kernel_2(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void ilu_scale_kernel_2( \n");
+ source.append(" __global unsigned int const *R_row_indices, \n");
+ source.append(" __global unsigned int const *R_col_indices, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *R_elements, \n");
+ source.append(" unsigned int R_size1, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" const *D_elements) { \n");
+
+ source.append(" for (unsigned int row = get_global_id(0); \n");
+ source.append(" row < R_size1; \n");
+ source.append(" row += get_global_size(0)) \n");
+ source.append(" { \n");
+ source.append(" unsigned int row_begin = R_row_indices[row]; \n");
+ source.append(" unsigned int row_end = R_row_indices[row+1]; \n");
+
+ source.append(" "); source.append(numeric_string); source.append(" D_row = D_elements[row]; \n");
+ source.append(" for (unsigned int j=row_begin; j<row_end; ++j) \n");
+ source.append(" R_elements[j] *= D_row * D_elements[R_col_indices[j]]; \n");
+
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+template<typename StringT>
+void generate_ilu_chow_patel_sweep_kernel(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void ilu_chow_patel_sweep_kernel( \n");
+ source.append(" __global unsigned int const *L_row_indices, \n");
+ source.append(" __global unsigned int const *L_col_indices, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *L_elements, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" const *L_backup, \n");
+ source.append(" unsigned int L_size1, \n");
+
+ source.append(" __global "); source.append(numeric_string); source.append(" const *aij_L, \n");
+
+ source.append(" __global unsigned int const *U_trans_row_indices, \n");
+ source.append(" __global unsigned int const *U_trans_col_indices, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *U_trans_elements, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" const *U_trans_backup, \n");
+
+ source.append(" __global "); source.append(numeric_string); source.append(" const *aij_U_trans) { \n");
+
+ source.append(" for (unsigned int row = get_global_id(0); \n");
+ source.append(" row < L_size1; \n");
+ source.append(" row += get_global_size(0)) \n");
+ source.append(" { \n");
+
+ //
+ // Update L:
+ //
+ source.append(" unsigned int row_L_start = L_row_indices[row]; \n");
+ source.append(" unsigned int row_L_end = L_row_indices[row + 1]; \n");
+
+ source.append(" for (unsigned int j = row_L_start; j < row_L_end; ++j) { \n");
+ source.append(" unsigned int col = L_col_indices[j]; \n");
+
+ source.append(" if (col == row) continue; \n");
+
+ source.append(" unsigned int row_U_start = U_trans_row_indices[col]; \n");
+ source.append(" unsigned int row_U_end = U_trans_row_indices[col + 1]; \n");
+
+ source.append(" unsigned int index_U = row_U_start; \n");
+ source.append(" unsigned int col_U = (index_U < row_U_end) ? U_trans_col_indices[index_U] : L_size1; \n");
+
+ source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n");
+ source.append(" for (unsigned int k = row_L_start; k < j; ++k) { \n");
+ source.append(" unsigned int col_L = L_col_indices[k]; \n");
+
+ source.append(" while (col_U < col_L) { \n");
+ source.append(" ++index_U; \n");
+ source.append(" col_U = U_trans_col_indices[index_U]; \n");
+ source.append(" } \n");
+
+ source.append(" if (col_U == col_L) \n");
+ source.append(" sum += L_backup[k] * U_trans_backup[index_U]; \n");
+ source.append(" } \n");
+
+ // update l_ij:
+ source.append(" L_elements[j] = (aij_L[j] - sum) / U_trans_backup[row_U_end - 1]; \n");
+ source.append(" } \n");
+
+ //
+ // Update U:
+ //
+ source.append(" unsigned int row_U_start = U_trans_row_indices[row]; \n");
+ source.append(" unsigned int row_U_end = U_trans_row_indices[row + 1]; \n");
+
+ source.append(" for (unsigned int j = row_U_start; j < row_U_end; ++j) { \n");
+ source.append(" unsigned int col = U_trans_col_indices[j]; \n");
+
+ source.append(" row_L_start = L_row_indices[col]; \n");
+ source.append(" row_L_end = L_row_indices[col + 1]; \n");
+
+ // compute \sum_{k=1}^{j-1} l_ik u_kj
+ source.append(" unsigned int index_L = row_L_start; \n");
+ source.append(" unsigned int col_L = (index_L < row_L_end) ? L_col_indices[index_L] : L_size1; \n");
+ source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n");
+ source.append(" for (unsigned int k = row_U_start; k < j; ++k) { \n");
+ source.append(" unsigned int col_U = U_trans_col_indices[k]; \n");
+
+ // find element in L:
+ source.append(" while (col_L < col_U) { \n");
+ source.append(" ++index_L; \n");
+ source.append(" col_L = L_col_indices[index_L]; \n");
+ source.append(" } \n");
+
+ source.append(" if (col_U == col_L) \n");
+ source.append(" sum += L_backup[index_L] * U_trans_backup[k]; \n");
+ source.append(" } \n");
+
+ // update U_ij:
+ source.append(" U_trans_elements[j] = aij_U_trans[j] - sum; \n");
+ source.append(" } \n");
+
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+
+template<typename StringT>
+void generate_ilu_form_neumann_matrix_kernel(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void ilu_form_neumann_matrix_kernel( \n");
+ source.append(" __global unsigned int const *R_row_indices, \n");
+ source.append(" __global unsigned int const *R_col_indices, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *R_elements, \n");
+ source.append(" unsigned int R_size1, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *D_elements) { \n");
+
+ source.append(" for (unsigned int row = get_global_id(0); \n");
+ source.append(" row < R_size1; \n");
+ source.append(" row += get_global_size(0)) \n");
+ source.append(" { \n");
+ source.append(" unsigned int row_begin = R_row_indices[row]; \n");
+ source.append(" unsigned int row_end = R_row_indices[row+1]; \n");
+
+ // Part 1: Extract and set diagonal entry
+ source.append(" "); source.append(numeric_string); source.append(" diag = D_elements[row]; \n");
+ source.append(" for (unsigned int j=row_begin; j<row_end; ++j) { \n");
+ source.append(" unsigned int col = R_col_indices[j]; \n");
+ source.append(" if (col == row) { \n");
+ source.append(" diag = R_elements[j]; \n");
+ source.append(" R_elements[j] = 0; \n");
+ source.append(" break; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append(" D_elements[row] = diag; \n");
+
+ // Part 2: Scale
+ source.append(" for (unsigned int j=row_begin; j<row_end; ++j) \n");
+ source.append(" R_elements[j] /= -diag; \n");
+
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for incomplete LU factorization preconditioners. */
+template<class NumericT>
+struct ilu
+{
+ static std::string program_name()
+ {
+ return viennacl::ocl::type_to_string<NumericT>::apply() + "_ilu";
+ }
+
+ static void init(viennacl::ocl::context & ctx)
+ {
+ static std::map<cl_context, bool> init_done;
+ if (!init_done[ctx.handle().get()])
+ {
+ viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+ std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+ std::string source;
+ source.reserve(1024);
+
+ viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+ // only generate for floating points (forces error for integers)
+ if (numeric_string == "float" || numeric_string == "double")
+ {
+ generate_ilu_level_scheduling_substitute(source, numeric_string);
+
+ generate_icc_extract_L_1(source);
+ generate_icc_extract_L_2(source, numeric_string);
+ generate_icc_chow_patel_sweep_kernel(source, numeric_string);
+
+ generate_ilu_extract_LU_1(source);
+ generate_ilu_extract_LU_2(source, numeric_string);
+ generate_ilu_scale_kernel_1(source, numeric_string);
+ generate_ilu_scale_kernel_2(source, numeric_string);
+ generate_ilu_chow_patel_sweep_kernel(source, numeric_string);
+ generate_ilu_form_neumann_matrix_kernel(source, numeric_string);
+ }
+
+ std::string prog_name = program_name();
+ #ifdef VIENNACL_BUILD_INFO
+ std::cout << "Creating program " << prog_name << std::endl;
+ #endif
+ ctx.add_program(source, prog_name);
+ init_done[ctx.handle().get()] = true;
+ } //if
+ } //init
+};
+
+} // namespace kernels
+} // namespace opencl
+} // namespace linalg
+} // namespace viennacl
+#endif
+
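The extract_L_* and extract_LU_* kernels above follow the usual two-pass pattern for splitting a CSR matrix into triangular factors: a first pass counts the qualifying entries per row, the counts are turned into row pointers by an exclusive scan, and a second pass scatters column indices and values. The CPU sketch below (hypothetical names, L factor only, illustrative rather than part of the patch) shows the same pattern end to end.

#include <vector>

// Two-pass extraction of L (entries with col <= row) from a CSR matrix A.
// Mirrors extract_LU_1 -> exclusive scan -> extract_LU_2; illustrative only.
void extract_L_reference(std::vector<unsigned> const & A_row_indices,
                         std::vector<unsigned> const & A_col_indices,
                         std::vector<double>   const & A_elements,
                         unsigned A_size1,
                         std::vector<unsigned>       & L_row_indices,  // resized to A_size1 + 1
                         std::vector<unsigned>       & L_col_indices,
                         std::vector<double>         & L_elements)
{
  // Pass 1: count entries of L in each row (cf. extract_LU_1).
  L_row_indices.assign(A_size1 + 1, 0);
  for (unsigned row = 0; row < A_size1; ++row)
    for (unsigned j = A_row_indices[row]; j < A_row_indices[row + 1]; ++j)
      if (A_col_indices[j] <= row)
        ++L_row_indices[row];

  // Exclusive scan turns the per-row counts into row pointers.
  unsigned running = 0;
  for (unsigned row = 0; row < A_size1; ++row)
  {
    unsigned count = L_row_indices[row];
    L_row_indices[row] = running;
    running += count;
  }
  L_row_indices[A_size1] = running;

  // Pass 2: scatter column indices and values (cf. extract_LU_2).
  L_col_indices.resize(running);
  L_elements.resize(running);
  for (unsigned row = 0; row < A_size1; ++row)
  {
    unsigned index_L = L_row_indices[row];
    for (unsigned j = A_row_indices[row]; j < A_row_indices[row + 1]; ++j)
      if (A_col_indices[j] <= row)
      {
        L_col_indices[index_L] = A_col_indices[j];
        L_elements[index_L]    = A_elements[j];
        ++index_L;
      }
  }
}
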
[35/51] [partial] mahout git commit: (nojira) add native-viennaCL
module to codebase. closes apache/mahout#241
Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations.hpp
new file mode 100644
index 0000000..912d24d
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations.hpp
@@ -0,0 +1,2725 @@
+#ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/matrix_operations.hpp
+ @brief Implementations of dense matrix related operations, including matrix-vector products, using CUDA.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+
+#include "viennacl/linalg/cuda/common.hpp"
+
+#include "viennacl/linalg/cuda/vector_operations.hpp"
+#include "viennacl/linalg/cuda/matrix_operations_row.hpp"
+#include "viennacl/linalg/cuda/matrix_operations_col.hpp"
+#include "viennacl/linalg/cuda/matrix_operations_prod.hpp"
+#include "viennacl/linalg/cuda/matrix_operations_prod.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+//
+// Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here!
+//
+
+template<typename DestNumericT, typename SrcNumericT>
+void convert(matrix_base<DestNumericT> & mat1, matrix_base<SrcNumericT> const & mat2)
+{
+ assert(mat1.row_major() == mat2.row_major() && bool("Conversion between mixed matrix layouts not supported yet!"));
+
+ if (mat1.row_major())
+ {
+ convert_row_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+ static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+ viennacl::cuda_arg(mat2),
+ static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("convert_row_kernel");
+ }
+ else
+ {
+ convert_col_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+ static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+ viennacl::cuda_arg(mat2),
+ static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("convert_col_kernel");
+ }
+}
+
+template<typename NumericT, typename SizeT, typename DistanceT>
+void trans(matrix_expression<const matrix_base<NumericT, SizeT, DistanceT>,const matrix_base<NumericT, SizeT, DistanceT>, op_trans> const & proxy,
+ matrix_base<NumericT> & temp_trans)
+{
+ trans_kernel<<<128,128>>>(viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(proxy.lhs().start1()), static_cast<unsigned int>(proxy.lhs().start2()),
+ static_cast<unsigned int>(proxy.lhs().internal_size1()), static_cast<unsigned int>(proxy.lhs().internal_size2()),
+ static_cast<unsigned int>(proxy.lhs().size1()), static_cast<unsigned int>(proxy.lhs().size2()),
+ static_cast<unsigned int>(proxy.lhs().stride1()), static_cast<unsigned int>(proxy.lhs().stride2()),
+
+ viennacl::cuda_arg(temp_trans),
+ static_cast<unsigned int>(temp_trans.start1()), static_cast<unsigned int>(temp_trans.start2()),
+ static_cast<unsigned int>(temp_trans.internal_size1()), static_cast<unsigned int>(temp_trans.internal_size2()),
+ static_cast<unsigned int>(temp_trans.stride1()), static_cast<unsigned int>(temp_trans.stride2()),
+ static_cast<bool>(proxy.lhs().row_major()));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("trans_kernel");
+}
+
+
+template<typename NumericT, typename ScalarT>
+void am(matrix_base<NumericT> & mat1,
+ matrix_base<NumericT> const & mat2, ScalarT const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
+{
+ assert(mat1.row_major() == mat2.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type;
+
+ unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+ value_type temporary_alpha = 0;
+ if (viennacl::is_cpu_scalar<ScalarT>::value)
+ temporary_alpha = alpha;
+
+ if (mat1.row_major())
+ {
+ am_row_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+ static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+ viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+ options_alpha,
+ viennacl::cuda_arg(mat2),
+ static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("am_row_kernel");
+ }
+ else
+ {
+ am_col_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+ static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+ viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+ options_alpha,
+ viennacl::cuda_arg(mat2),
+ static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("am_col_kernel");
+ }
+}
+
+
+template<typename NumericT, typename ScalarT1, typename ScalarT2>
+void ambm(matrix_base<NumericT> & mat1,
+ matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+ matrix_base<NumericT> const & mat3, ScalarT2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
+{
+ assert(mat1.row_major() == mat2.row_major() && mat1.row_major() == mat3.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type;
+
+ unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+ value_type temporary_alpha = 0;
+ if (viennacl::is_cpu_scalar<ScalarT1>::value)
+ temporary_alpha = alpha;
+
+
+ unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);
+
+ value_type temporary_beta = 0;
+ if (viennacl::is_cpu_scalar<ScalarT2>::value)
+ temporary_beta = beta;
+
+
+ if (mat1.row_major())
+ {
+ ambm_row_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+ static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+ viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+ options_alpha,
+ viennacl::cuda_arg(mat2),
+ static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
+
+ viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+ options_beta,
+ viennacl::cuda_arg(mat3),
+ static_cast<unsigned int>(viennacl::traits::start1(mat3)), static_cast<unsigned int>(viennacl::traits::start2(mat3)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat3)), static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_row_kernel");
+ }
+ else
+ {
+ ambm_col_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+ static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+ viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+ options_alpha,
+ viennacl::cuda_arg(mat2),
+ static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
+
+ viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+ options_beta,
+ viennacl::cuda_arg(mat3),
+ static_cast<unsigned int>(viennacl::traits::start1(mat3)), static_cast<unsigned int>(viennacl::traits::start2(mat3)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat3)), static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_col_kernel");
+ }
+
+}
+
+
+template<typename NumericT, typename ScalarT1, typename ScalarT2>
+void ambm_m(matrix_base<NumericT> & mat1,
+ matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+ matrix_base<NumericT> const & mat3, ScalarT2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
+{
+ assert(mat1.row_major() == mat2.row_major() && mat1.row_major() == mat3.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type;
+
+ unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+ value_type temporary_alpha = 0;
+ if (viennacl::is_cpu_scalar<ScalarT1>::value)
+ temporary_alpha = alpha;
+
+
+ unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);
+
+ value_type temporary_beta = 0;
+ if (viennacl::is_cpu_scalar<ScalarT2>::value)
+ temporary_beta = beta;
+
+
+ if (mat1.row_major())
+ {
+ ambm_m_row_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+ static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+ viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+ options_alpha,
+ viennacl::cuda_arg(mat2),
+ static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
+
+ viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+ options_beta,
+ viennacl::cuda_arg(mat3),
+ static_cast<unsigned int>(viennacl::traits::start1(mat3)), static_cast<unsigned int>(viennacl::traits::start2(mat3)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat3)), static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_m_row_kernel");
+ }
+ else
+ {
+ ambm_m_col_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+ static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+ viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+ options_alpha,
+ viennacl::cuda_arg(mat2),
+ static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
+
+ viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+ options_beta,
+ viennacl::cuda_arg(mat3),
+ static_cast<unsigned int>(viennacl::traits::start1(mat3)), static_cast<unsigned int>(viennacl::traits::start2(mat3)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat3)), static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_m_col_kernel");
+ }
+
+}
+
+
+
+
+template<typename NumericT>
+void matrix_assign(matrix_base<NumericT> & mat, NumericT s, bool clear = false)
+{
+ typedef NumericT value_type;
+ value_type alpha = s;
+
+ unsigned int s1 = clear ? viennacl::traits::internal_size1(mat) : viennacl::traits::size1(mat);
+ unsigned int s2 = clear ? viennacl::traits::internal_size2(mat) : viennacl::traits::size2(mat);
+
+ if (mat.row_major())
+ {
+
+ matrix_row_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
+ static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
+ s1, s2,
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
+ alpha);
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_assign_kernel");
+ }
+ else
+ {
+ matrix_col_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
+ static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
+ s1, s2,
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
+ alpha);
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_assign_kernel");
+ }
+}
+
+template<typename NumericT>
+void matrix_diagonal_assign(matrix_base<NumericT> & mat, NumericT s)
+{
+ typedef NumericT value_type;
+ value_type alpha = s;
+
+ if (mat.row_major())
+ {
+ matrix_row_diagonal_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
+ static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
+ static_cast<unsigned int>(viennacl::traits::size1(mat)), static_cast<unsigned int>(viennacl::traits::size2(mat)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
+ alpha);
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_diagonal_assign_kernel");
+ }
+ else
+ {
+ matrix_col_diagonal_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
+ static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
+ static_cast<unsigned int>(viennacl::traits::size1(mat)), static_cast<unsigned int>(viennacl::traits::size2(mat)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
+ alpha);
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_diagonal_assign_kernel");
+ }
+}
+
+
+template<typename NumericT>
+void matrix_diag_from_vector(const vector_base<NumericT> & vec, int k, matrix_base<NumericT> & mat)
+{
+ typedef NumericT value_type;
+
+ // Step 1: assign zero matrix:
+ matrix_assign(mat, NumericT(0));
+
+ // Step 2: Assign diagonal:
+ unsigned int options_alpha = 0;
+
+ vcl_size_t mat_start = 0;
+ vcl_size_t mat_stride = 0;
+ vcl_size_t mat_size = viennacl::traits::size(vec);
+ if (mat.row_major())
+ {
+ vcl_size_t first_row_index = 0;
+ vcl_size_t first_col_index = 0;
+ if (k < 0)
+ first_row_index = vcl_size_t(-k);
+ else
+ first_col_index = vcl_size_t(k);
+ mat_start = (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat)
+ + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat);
+ mat_stride = viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat);
+ }
+ else
+ {
+ vcl_size_t first_row_index = 0;
+ vcl_size_t first_col_index = 0;
+ if (k < 0)
+ first_row_index = vcl_size_t(-k);
+ else
+ first_col_index = vcl_size_t(k);
+ mat_start = viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)
+ + (viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat);
+ mat_stride = viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat);
+ }
+
+ av_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
+ static_cast<unsigned int>(mat_start),
+ static_cast<unsigned int>(mat_stride),
+ static_cast<unsigned int>(mat_size),
+
+ viennacl::cuda_arg<value_type>(NumericT(1)),
+ options_alpha,
+ viennacl::cuda_arg(vec),
+ static_cast<unsigned int>(viennacl::traits::start(vec)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec)) );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
+}
+
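To make the diagonal index arithmetic above concrete (an illustrative check, not part of the patch): for a row-major matrix with zero offsets, unit strides and internal_size2 == 4, the first super-diagonal (k == 1) starts at linear index 1 and advances by internal_size2 + 1 == 5, so the av_kernel touches indices 1, 6, 11. A minimal host-side sketch:

#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
  // Assumed toy layout: 4x4 row-major matrix, zero starts, unit strides.
  std::size_t internal_size2 = 4;
  int k = 1;                                                         // first super-diagonal

  std::size_t first_row = (k < 0) ? std::size_t(-k) : 0;
  std::size_t first_col = (k < 0) ? 0 : std::size_t(k);
  std::size_t mat_start  = first_row * internal_size2 + first_col;   // == 1
  std::size_t mat_stride = internal_size2 + 1;                       // == 5

  std::vector<double> A(4 * internal_size2, 0.0);
  std::vector<double> v = {7.0, 8.0, 9.0};
  for (std::size_t i = 0; i < v.size(); ++i)
    A[mat_start + i * mat_stride] = v[i];                            // same pattern as av_kernel above

  std::cout << A[1] << ' ' << A[6] << ' ' << A[11] << '\n';          // prints: 7 8 9
  return 0;
}
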
+template<typename NumericT>
+void matrix_diag_to_vector(matrix_base<NumericT> const & mat, int k, vector_base<NumericT> & vec)
+{
+ typedef NumericT value_type;
+
+ unsigned int options_alpha = 0;
+
+ vcl_size_t mat_start = 0;
+ vcl_size_t mat_stride = 0;
+ if (mat.row_major())
+ {
+ vcl_size_t first_row_index = 0;
+ vcl_size_t first_col_index = 0;
+ if (k < 0)
+ first_row_index = vcl_size_t(-k);
+ else
+ first_col_index = vcl_size_t(k);
+ mat_start = (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat)
+ + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat);
+ mat_stride = viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat);
+ }
+ else
+ {
+ vcl_size_t first_row_index = 0;
+ vcl_size_t first_col_index = 0;
+ if (k < 0)
+ first_row_index = vcl_size_t(-k);
+ else
+ first_col_index = vcl_size_t(k);
+ mat_start = viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)
+ + (viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat);
+ mat_stride = viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat);
+ }
+
+ av_kernel<<<128, 128>>>(viennacl::cuda_arg(vec),
+ static_cast<unsigned int>(viennacl::traits::start(vec)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec)),
+ static_cast<unsigned int>(viennacl::traits::size(vec)),
+
+ viennacl::cuda_arg<value_type>(NumericT(1)),
+ options_alpha,
+ viennacl::cuda_arg(mat),
+ static_cast<unsigned int>(mat_start),
+ static_cast<unsigned int>(mat_stride));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
+}
+
+template<typename NumericT>
+void matrix_row(matrix_base<NumericT> const & mat, unsigned int i, vector_base<NumericT> & vec)
+{
+ typedef NumericT value_type;
+
+ unsigned int options_alpha = 0;
+
+ vcl_size_t mat_start = 0;
+ vcl_size_t mat_stride = 0;
+ if (mat.row_major())
+ {
+ mat_start = (viennacl::traits::start1(mat) + i * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat) + viennacl::traits::start2(mat);
+ mat_stride = viennacl::traits::stride2(mat);
+ }
+ else
+ {
+ mat_start = viennacl::traits::start1(mat) + i * viennacl::traits::stride1(mat) + viennacl::traits::start2(mat) * viennacl::traits::internal_size1(mat);
+ mat_stride = viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat);
+ }
+
+ av_kernel<<<128, 128>>>(viennacl::cuda_arg(vec),
+ static_cast<unsigned int>(viennacl::traits::start(vec)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec)),
+ static_cast<unsigned int>(viennacl::traits::size(vec)),
+
+ viennacl::cuda_arg<value_type>(NumericT(1)),
+ options_alpha,
+ viennacl::cuda_arg(mat),
+ static_cast<unsigned int>(mat_start),
+ static_cast<unsigned int>(mat_stride));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
+}
+
+template<typename NumericT>
+void matrix_column(const matrix_base<NumericT> & mat, unsigned int j, vector_base<NumericT> & vec)
+{
+ typedef NumericT value_type;
+
+ unsigned int options_alpha = 0;
+
+ vcl_size_t mat_start = 0;
+ vcl_size_t mat_stride = 0;
+ if (mat.row_major())
+ {
+ mat_start = viennacl::traits::start1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::start2(mat) + j * viennacl::traits::stride2(mat);
+ mat_stride = viennacl::traits::stride2(mat) * viennacl::traits::internal_size2(mat);
+ }
+ else
+ {
+ mat_start = viennacl::traits::start1(mat) + (viennacl::traits::start2(mat) + j * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat);
+ mat_stride = viennacl::traits::stride2(mat);
+ }
+
+ av_kernel<<<128, 128>>>(viennacl::cuda_arg(vec),
+ static_cast<unsigned int>(viennacl::traits::start(vec)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec)),
+ static_cast<unsigned int>(viennacl::traits::size(vec)),
+
+ viennacl::cuda_arg<value_type>(NumericT(1)),
+ options_alpha,
+ viennacl::cuda_arg(mat),
+ static_cast<unsigned int>(mat_start),
+ static_cast<unsigned int>(mat_stride));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
+}
+
+
+//
+///////////////////////// binary element-wise operations /////////////////////////////////
+//
+
+
+template<typename NumericT, typename SizeT, typename OpT>
+void element_op(matrix_base<NumericT, SizeT> & A,
+ matrix_expression<const matrix_base<NumericT, SizeT>, const matrix_base<NumericT, SizeT>, op_element_binary<OpT> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type;
+
+ unsigned int op_type = 2; //0: product, 1: division, 2: power
+ if (viennacl::is_division<OpT>::value)
+ op_type = 1;
+ else if (viennacl::is_product<OpT>::value)
+ op_type = 0;
+
+ if (A.row_major())
+ {
+ element_op_int_row_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
+
+ viennacl::cuda_arg(proxy.rhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
+
+ op_type
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel");
+ }
+ else
+ {
+ element_op_int_col_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
+
+ viennacl::cuda_arg(proxy.rhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
+
+ op_type
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel");
+ }
+}
+
+template<typename SizeT, typename OpT>
+void element_op(matrix_base<float, SizeT> & A,
+ matrix_expression<const matrix_base<float, SizeT>, const matrix_base<float, SizeT>, op_element_binary<OpT> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef float value_type;
+
+ unsigned int op_type = 2; //0: product, 1: division, 2: power
+ if (viennacl::is_division<OpT>::value)
+ op_type = 1;
+ else if (viennacl::is_product<OpT>::value)
+ op_type = 0;
+
+ if (A.row_major())
+ {
+ element_op_row_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
+
+ viennacl::cuda_arg(proxy.rhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
+
+ op_type
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel");
+ }
+ else
+ {
+ element_op_col_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
+
+ viennacl::cuda_arg(proxy.rhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
+
+ op_type
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel");
+ }
+}
+
+template<typename SizeT, typename OpT>
+void element_op(matrix_base<double, SizeT> & A,
+ matrix_expression<const matrix_base<double, SizeT>, const matrix_base<double, SizeT>, op_element_binary<OpT> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef double value_type;
+
+ unsigned int op_type = 2; //0: product, 1: division, 2: power
+ if (viennacl::is_division<OpT>::value)
+ op_type = 1;
+ else if (viennacl::is_product<OpT>::value)
+ op_type = 0;
+
+ if (A.row_major())
+ {
+ element_op_row_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
+
+ viennacl::cuda_arg(proxy.rhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
+
+ op_type
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel");
+ }
+ else
+ {
+ element_op_col_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
+
+ viennacl::cuda_arg(proxy.rhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
+
+ op_type
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel");
+ }
+}
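
[Editorial note: the three element_op() overloads above are the CUDA backends for ViennaCL's binary element-wise matrix operations. A minimal host-side usage sketch follows, assuming ViennaCL's usual free functions element_prod()/element_div()/element_pow() (names and exact namespace taken from the public ViennaCL API, not from this patch); it is illustrative only.]

// Editorial sketch (not part of the patch): user code that typically reaches
// the binary element_op() overloads above when the matrices live in CUDA memory.
#include "viennacl/matrix.hpp"
#include "viennacl/linalg/matrix_operations.hpp"

void binary_elementwise_example()
{
  viennacl::matrix<double> A(64, 64), B(64, 64), C(64, 64);
  // ... fill B and C ...
  A = viennacl::linalg::element_prod(B, C);  // dispatches with op_type == 0 (product)
  A = viennacl::linalg::element_div(B, C);   // dispatches with op_type == 1 (division)
  A = viennacl::linalg::element_pow(B, C);   // dispatches with op_type == 2 (power)
}
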
+
+//
+///////////////////////// unary element-wise operations /////////////////////////////////
+//
+
+// Note: Due to CUDA vs C-proprocessor interference (concatenation seems to be broken in at least CUDA 4.2),
+// we could not find a more 'automatic' way of generating the overloads below...
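
[Editorial note: for orientation, a short sketch of the user-level calls that end up in the hand-written unary overloads below, assuming ViennaCL's usual free functions element_exp()/element_sqrt()/element_fabs() for dense matrices (the exact namespace is an assumption, not stated in this patch).]

// Editorial sketch (not part of the patch): typical entry points into the
// unary element_op() overloads written out explicitly below.
#include "viennacl/matrix.hpp"
#include "viennacl/linalg/matrix_operations.hpp"

void unary_elementwise_example()
{
  viennacl::matrix<double> A(32, 32), B(32, 32);
  // ... fill B ...
  A = viennacl::linalg::element_exp(B);   // proxy carries op_element_unary<op_exp>
  A = viennacl::linalg::element_sqrt(B);  // proxy carries op_element_unary<op_sqrt>
  A = viennacl::linalg::element_fabs(B);  // proxy carries op_element_unary<op_fabs>
}
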
+
+// abs
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_abs> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type;
+
+ if (A.row_major())
+ {
+ matrix_row_element_abs_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_abs_kernel");
+ }
+ else
+ {
+ matrix_col_element_abs_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_abs_kernel");
+ }
+}
+
+
+// acos
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_acos> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type;
+
+ if (A.row_major())
+ {
+ matrix_row_element_acos_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_acos_kernel");
+ }
+ else
+ {
+ matrix_col_element_acos_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_acos_kernel");
+ }
+}
+
+
+// asin
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_asin> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type;
+
+ if (A.row_major())
+ {
+ matrix_row_element_asin_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_asin_kernel");
+ }
+ else
+ {
+ matrix_col_element_asin_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sin_kernel");
+ }
+}
+
+
+// atan
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_atan> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type;
+
+ if (A.row_major())
+ {
+ matrix_row_element_atan_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_atan_kernel");
+ }
+ else
+ {
+ matrix_col_element_atan_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_atan_kernel");
+ }
+}
+
+
+// ceil
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_ceil> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type;
+
+ if (A.row_major())
+ {
+ matrix_row_element_ceil_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_ceil_kernel");
+ }
+ else
+ {
+ matrix_col_element_ceil_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_ceil_kernel");
+ }
+}
+
+
+// cos
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_cos> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type;
+
+ if (A.row_major())
+ {
+ matrix_row_element_cos_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_cos_kernel");
+ }
+ else
+ {
+ matrix_col_element_cos_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_cos_kernel");
+ }
+}
+
+
+// cosh
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_cosh> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type;
+
+ if (A.row_major())
+ {
+ matrix_row_element_cosh_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_cosh_kernel");
+ }
+ else
+ {
+ matrix_col_element_cosh_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_cosh_kernel");
+ }
+}
+
+
+// exp
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_exp> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type;
+
+ if (A.row_major())
+ {
+ matrix_row_element_exp_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_exp_kernel");
+ }
+ else
+ {
+ matrix_col_element_exp_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_exp_kernel");
+ }
+}
+
+
+// fabs
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_fabs> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type;
+
+ if (A.row_major())
+ {
+ matrix_row_element_fabs_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_fabs_kernel");
+ }
+ else
+ {
+ matrix_col_element_fabs_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_fabs_kernel");
+ }
+}
+
+
+// floor
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_floor> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type;
+
+ if (A.row_major())
+ {
+ matrix_row_element_floor_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_floor_kernel");
+ }
+ else
+ {
+ matrix_col_element_floor_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_floor_kernel");
+ }
+}
+
+
+// log
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_log> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type;
+
+ if (A.row_major())
+ {
+ matrix_row_element_log_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_log_kernel");
+ }
+ else
+ {
+ matrix_col_element_log_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_log_kernel");
+ }
+}
+
+
+// log10
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_log10> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type;
+
+ if (A.row_major())
+ {
+ matrix_row_element_log10_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_log10_kernel");
+ }
+ else
+ {
+ matrix_col_element_log10_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_log10_kernel");
+ }
+}
+
+
+// sin
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_sin> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type;
+
+ if (A.row_major())
+ {
+ matrix_row_element_sin_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_sin_kernel");
+ }
+ else
+ {
+ matrix_col_element_sin_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sin_kernel");
+ }
+}
+
+
+// sinh
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_sinh> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type;
+
+ if (A.row_major())
+ {
+ matrix_row_element_sinh_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_sinh_kernel");
+ }
+ else
+ {
+ matrix_col_element_sinh_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sinh_kernel");
+ }
+}
+
+
+// sqrt
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_sqrt> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type;
+
+ if (A.row_major())
+ {
+ matrix_row_element_sqrt_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned i
<TRUNCATED>
[34/51] [partial] mahout git commit: (nojira) add native-viennaCL
module to codebase. closes apache/mahout#241
Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_col.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_col.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_col.hpp
new file mode 100644
index 0000000..44684ce
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_col.hpp
@@ -0,0 +1,1847 @@
+#ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_COL_HPP_
+#define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_COL_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/matrix_operations_col.hpp
+ @brief Implementations of column-major dense matrix related operations, including matrix-vector products, using CUDA.
+*/
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
+template<typename DestNumericT, typename SrcNumericT>
+__global__ void convert_col_kernel(DestNumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const SrcNumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1];
+}
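
[Editorial note: convert_col_kernel above shows the column-major addressing used by every kernel in this file: element (row, col) sits at (row*inc1 + start1) + (col*inc2 + start2)*internal_size1. A small host-side sketch of that formula follows; the helper name linear_index_col_major is hypothetical.]

// Editorial sketch (not part of the patch): column-major index computation
// mirroring the index expression in convert_col_kernel.
#include <cstddef>

inline std::size_t linear_index_col_major(std::size_t row, std::size_t col,
                                          std::size_t start1, std::size_t inc1,
                                          std::size_t start2, std::size_t inc2,
                                          std::size_t internal_size1)
{
  // (row * inc1 + start1) walks within a column; the column offset is scaled
  // by the padded leading dimension internal_size1.
  return (row * inc1 + start1) + (col * inc2 + start2) * internal_size1;
}
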
+
+//
+// am
+//
+
+// alpha on CPU
+template<typename NumericT>
+__global__ void am_col_kernel(NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ NumericT fac2,
+ unsigned int options2,
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ NumericT alpha = fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ if (options2 & (1 << 1))
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha;
+ }
+ else
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha;
+ }
+}
+
+// alpha on GPU
+template<typename NumericT>
+__global__ void am_col_kernel(NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * fac2,
+ unsigned int options2,
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ NumericT alpha = *fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+ if (options2 & (1 << 1))
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha;
+ }
+ else
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha;
+ }
+}
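
[Editorial note: the am/ambm kernels read options2 (and options3) as a small bitfield: bit 0 requests negating the scalar, bit 1 requests dividing by it instead of multiplying. A hedged host-side sketch of packing those flags follows; encode_scalar_options is a hypothetical helper, not a ViennaCL function.]

// Editorial sketch (not part of the patch): packing the options bitfield
// consumed by am_col_kernel / ambm_col_kernel above.
inline unsigned int encode_scalar_options(bool flip_sign, bool reciprocal)
{
  unsigned int options = 0;
  if (flip_sign)  options |= 1u << 0;  // kernel negates alpha/beta
  if (reciprocal) options |= 1u << 1;  // kernel divides instead of multiplying
  return options;
}
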
+
+
+//
+// ambm
+//
+
+// alpha and beta on CPU
+template<typename NumericT>
+__global__ void ambm_col_kernel(NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ NumericT fac2,
+ unsigned int options2,
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2,
+
+ NumericT fac3,
+ unsigned int options3,
+ const NumericT * C,
+ unsigned int C_start1, unsigned int C_start2,
+ unsigned int C_inc1, unsigned int C_inc2,
+ unsigned int C_internal_size1, unsigned int C_internal_size2)
+{
+ NumericT alpha = fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+
+ NumericT beta = fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+ if (options2 & (1 << 1))
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+ }
+ else
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+ }
+ }
+ else
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+ }
+ else
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+ }
+ }
+}
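
[Editorial note: this ambm_col_kernel overload and the variants that follow compute A = B*alpha + C*beta, with the per-scalar sign/reciprocal options described above. A brief sketch of the kind of expression that typically lowers to such a kernel follows, assuming ViennaCL's standard operator overloads for dense matrices.]

// Editorial sketch (not part of the patch): a matrix expression that is
// typically evaluated by an ambm-style kernel.
#include "viennacl/matrix.hpp"

void ambm_example()
{
  viennacl::matrix<double> A(64, 64), B(64, 64), C(64, 64);
  double alpha = 2.0, beta = -0.5;
  // ... fill B and C ...
  A = alpha * B + beta * C;  // A = B*alpha + C*beta
}
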
+
+
+// alpha on CPU, beta on GPU
+template<typename NumericT>
+__global__ void ambm_col_kernel(NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ NumericT fac2,
+ unsigned int options2,
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2,
+
+ const NumericT * fac3,
+ unsigned int options3,
+ const NumericT * C,
+ unsigned int C_start1, unsigned int C_start2,
+ unsigned int C_inc1, unsigned int C_inc2,
+ unsigned int C_internal_size1, unsigned int C_internal_size2)
+{
+ NumericT alpha = fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+
+ NumericT beta = *fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+ if (options2 & (1 << 1))
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+ }
+ else
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+ }
+ }
+ else
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+ }
+ else
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+ }
+ }
+}
+
+// alpha on GPU, beta on CPU
+template<typename NumericT>
+__global__ void ambm_col_kernel(NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * fac2,
+ unsigned int options2,
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2,
+
+ NumericT fac3,
+ unsigned int options3,
+ const NumericT * C,
+ unsigned int C_start1, unsigned int C_start2,
+ unsigned int C_inc1, unsigned int C_inc2,
+ unsigned int C_internal_size1, unsigned int C_internal_size2)
+{
+ NumericT alpha = *fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+
+ NumericT beta = fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+ if (options2 & (1 << 1))
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+ }
+ else
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+ }
+ }
+ else
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+ }
+ else
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+ }
+ }
+}
+
+
+// alpha and beta on GPU
+template<typename NumericT>
+__global__ void ambm_col_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * fac2,
+ unsigned int options2,
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2,
+
+ const NumericT * fac3,
+ unsigned int options3,
+ const NumericT * C,
+ unsigned int C_start1, unsigned int C_start2,
+ unsigned int C_inc1, unsigned int C_inc2,
+ unsigned int C_internal_size1, unsigned int C_internal_size2)
+{
+ NumericT alpha = *fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+
+ NumericT beta = *fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+ if (options2 & (1 << 1))
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+ }
+ else
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+ }
+ }
+ else
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+ }
+ else
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+ }
+ }
+}
+
+
+//
+// ambm_m
+//
+
+// alpha and beta on CPU
+template<typename NumericT>
+__global__ void ambm_m_col_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ NumericT fac2,
+ unsigned int options2,
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2,
+
+ NumericT fac3,
+ unsigned int options3,
+ const NumericT * C,
+ unsigned int C_start1, unsigned int C_start2,
+ unsigned int C_inc1, unsigned int C_inc2,
+ unsigned int C_internal_size1, unsigned int C_internal_size2)
+{
+ NumericT alpha = fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+
+ NumericT beta = fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+ if (options2 & (1 << 1))
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+ }
+ else
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+ }
+ }
+ else
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+ }
+ else
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+ }
+ }
+}
+
+
+// alpha on CPU, beta on GPU
+template<typename NumericT>
+__global__ void ambm_m_col_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ NumericT fac2,
+ unsigned int options2,
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2,
+
+ const NumericT * fac3,
+ unsigned int options3,
+ const NumericT * C,
+ unsigned int C_start1, unsigned int C_start2,
+ unsigned int C_inc1, unsigned int C_inc2,
+ unsigned int C_internal_size1, unsigned int C_internal_size2)
+{
+ NumericT alpha = fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+
+ NumericT beta = *fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+ if (options2 & (1 << 1))
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+ }
+ else
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+ }
+ }
+ else
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+ }
+ else
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+ }
+ }
+}
+
+// alpha on GPU, beta on CPU
+template<typename NumericT>
+__global__ void ambm_m_col_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * fac2,
+ unsigned int options2,
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2,
+
+ NumericT fac3,
+ unsigned int options3,
+ const NumericT * C,
+ unsigned int C_start1, unsigned int C_start2,
+ unsigned int C_inc1, unsigned int C_inc2,
+ unsigned int C_internal_size1, unsigned int C_internal_size2)
+{
+ NumericT alpha = *fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+
+ NumericT beta = fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+ if (options2 & (1 << 1))
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+ }
+ else
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+ }
+ }
+ else
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+ }
+ else
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+ }
+ }
+}
+
+
+// alpha and beta on GPU
+template<typename NumericT>
+__global__ void ambm_m_col_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * fac2,
+ unsigned int options2,
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2,
+
+ const NumericT * fac3,
+ unsigned int options3,
+ const NumericT * C,
+ unsigned int C_start1, unsigned int C_start2,
+ unsigned int C_inc1, unsigned int C_inc2,
+ unsigned int C_internal_size1, unsigned int C_internal_size2)
+{
+ NumericT alpha = *fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+
+ NumericT beta = *fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+ if (options2 & (1 << 1))
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+ }
+ else
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+ }
+ }
+ else
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
+ }
+ else
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
+ + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
+ }
+ }
+}
+
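For reference (not part of the diff): the ambm/ambm_m kernels above decode their flag words the same way, bit 0 negates the scalar and bit 1 switches from multiplication to division by it. The sketch below restates that decoding in plain host-side C++; make_options and effective_factor are hypothetical helper names used only for illustration.

  #include <cassert>

  // Encode the flag word the way the kernels above read it:
  // bit 0 -> negate the scalar, bit 1 -> divide by it instead of multiplying.
  inline unsigned int make_options(bool negate, bool reciprocal)
  {
    return (negate ? 1u : 0u) | (reciprocal ? 2u : 0u);
  }

  // Effective factor applied to B (or C), mirroring the kernel branches.
  inline double effective_factor(double fac, unsigned int options)
  {
    double s = (options & (1u << 0)) ? -fac : fac;
    return (options & (1u << 1)) ? 1.0 / s : s;
  }

  int main()
  {
    assert(effective_factor(2.0, make_options(true, true)) == -0.5);
    return 0;
  }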
+
+
+//
+// assignments
+//
+
+template<typename NumericT>
+__global__ void matrix_col_assign_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+ NumericT alpha)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = alpha;
+}
+
+
+template<typename NumericT>
+__global__ void matrix_col_diagonal_assign_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+ NumericT alpha)
+{
+ unsigned int gid = (blockIdx.x * blockDim.x + threadIdx.x);
+
+ for (unsigned int row = gid; row < A_size1; row += blockDim.x * gridDim.x)
+ A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1] = alpha;
+}
+
+//
+// binary element-wise operations
+//
+
+template<typename NumericT>
+__global__ void element_op_col_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2,
+
+ const NumericT * C,
+ unsigned int C_start1, unsigned int C_start2,
+ unsigned int C_inc1, unsigned int C_inc2,
+ unsigned int C_internal_size1, unsigned int C_internal_size2,
+
+ unsigned int op_type) //0: product, 1: division, 2: pow
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+ if (op_type == 2)
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ = pow(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1],
+ C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1]);
+ }
+ else if (op_type == 1)
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
+ / C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
+ }
+ else if (op_type == 0)
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
+ * C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
+ }
+}
+
+template<typename NumericT>
+__global__ void element_op_int_col_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2,
+
+ const NumericT * C,
+ unsigned int C_start1, unsigned int C_start2,
+ unsigned int C_inc1, unsigned int C_inc2,
+ unsigned int C_internal_size1, unsigned int C_internal_size2,
+
+ unsigned int op_type) //0: product, 1: division, 2: pow
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+
+ if (op_type == 1)
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
+ / C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
+ }
+ else if (op_type == 0)
+ {
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
+ = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
+ * C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
+ }
+}
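For reference (not part of the diff): the op_type dispatch in the two element-wise kernels above reduces to the scalar rule sketched below, assuming float data (the kernels themselves are templated on NumericT, and the integer variant simply omits the pow case).

  #include <cmath>

  // Scalar reference for the op_type encoding used above.
  inline float element_op_reference(float b, float c, unsigned int op_type)
  {
    if (op_type == 2) return std::pow(b, c);  // element-wise power
    if (op_type == 1) return b / c;           // element-wise division
    return b * c;                             // op_type == 0: element-wise product
  }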
+
+
+//
+// unary element-wise operations
+//
+
+// abs
+template<typename NumericT>
+__global__ void matrix_col_element_abs_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = abs(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// acos
+template<typename NumericT>
+__global__ void matrix_col_element_acos_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = acos(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// asin
+template<typename NumericT>
+__global__ void matrix_col_element_asin_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = asin(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// atan
+template<typename NumericT>
+__global__ void matrix_col_element_atan_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = atan(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// ceil
+template<typename NumericT>
+__global__ void matrix_col_element_ceil_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = ceil(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// cos
+template<typename NumericT>
+__global__ void matrix_col_element_cos_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = cos(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// cosh
+template<typename NumericT>
+__global__ void matrix_col_element_cosh_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = cosh(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// exp
+template<typename NumericT>
+__global__ void matrix_col_element_exp_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = exp(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// fabs
+template<typename NumericT>
+__global__ void matrix_col_element_fabs_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = fabs(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// floor
+template<typename NumericT>
+__global__ void matrix_col_element_floor_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = floor(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// log
+template<typename NumericT>
+__global__ void matrix_col_element_log_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = log(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// log10
+template<typename NumericT>
+__global__ void matrix_col_element_log10_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = log10(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// sin
+template<typename NumericT>
+__global__ void matrix_col_element_sin_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sin(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// sinh
+template<typename NumericT>
+__global__ void matrix_col_element_sinh_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sinh(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// sqrt
+template<typename NumericT>
+__global__ void matrix_col_element_sqrt_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sqrt(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// tan
+template<typename NumericT>
+__global__ void matrix_col_element_tan_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = tan(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+// tanh
+template<typename NumericT>
+__global__ void matrix_col_element_tanh_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
+ for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = tanh(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
+}
+
+
+
+//
+// matrix-vector product
+//
+
+template<typename NumericT>
+__global__ void vec_mul_col_kernel(
+ const NumericT * A,
+ unsigned int A_row_start,
+ unsigned int A_col_start,
+ unsigned int A_row_inc,
+ unsigned int A_col_inc,
+ unsigned int A_row_size,
+ unsigned int A_col_size,
+ unsigned int A_internal_rows,
+ unsigned int A_internal_cols,
+ const NumericT * v,
+ unsigned int v_start,
+ unsigned int v_inc,
+ unsigned int v_size,
+ NumericT * result,
+ unsigned int result_start,
+ unsigned int result_inc,
+ unsigned int result_size)
+{
+
+ for (unsigned int row = blockIdx.x * blockDim.x + threadIdx.x; row < A_row_size; row += gridDim.x * blockDim.x)
+ {
+ NumericT dot_prod = 0;
+ for (unsigned int col = 0; col < A_col_size; ++col)
+ dot_prod += A[(row * A_row_inc + A_row_start) + (col * A_col_inc + A_col_start) * A_internal_rows] * v[v_start + v_inc * col];
+ result[row * result_inc + result_start] = dot_prod;
+ }
+}
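For reference (not part of the diff): each thread in vec_mul_col_kernel computes one full row dot product using the column-major submatrix indexing shown above. A serial CPU sketch of the same indexing follows; the vector and result strides are dropped for brevity, and all names are local to this illustration.

  #include <vector>

  // Serial reference: result = A * v for a column-major submatrix.
  template<typename NumericT>
  void vec_mul_col_reference(const std::vector<NumericT> &A,
                             unsigned int row_start, unsigned int col_start,
                             unsigned int row_inc,   unsigned int col_inc,
                             unsigned int rows,      unsigned int cols,
                             unsigned int internal_rows,
                             const std::vector<NumericT> &v,
                             std::vector<NumericT> &result)
  {
    for (unsigned int row = 0; row < rows; ++row)
    {
      NumericT dot = 0;
      for (unsigned int col = 0; col < cols; ++col)
        dot += A[(row * row_inc + row_start) + (col * col_inc + col_start) * internal_rows] * v[col];
      result[row] = dot;
    }
  }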
+
+
+template<typename NumericT>
+__global__ void trans_vec_mul_col_kernel(
+ const NumericT * A,
+ unsigned int A_row_start,
+ unsigned int A_col_start,
+ unsigned int A_row_inc,
+ unsigned int A_col_inc,
+ unsigned int A_row_size,
+ unsigned int A_col_size,
+ unsigned int A_internal_rows,
+ unsigned int A_internal_cols,
+ const NumericT * v,
+ unsigned int v_start,
+ unsigned int v_inc,
+ unsigned int v_size,
+ NumericT * result,
+ unsigned int result_start,
+ unsigned int result_inc,
+ unsigned int result_size)
+{
+ __shared__ NumericT work[128];
+
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+ unsigned int lid = threadIdx.x;
+
+ for (unsigned int row = row_gid; row < A_col_size; row += gridDim.x)
+ {
+ NumericT dot_prod = 0;
+ for (unsigned int col = col_gid; col < A_row_size; col += blockDim.x)
+ dot_prod += A[(row * A_col_inc + A_col_start) * A_internal_rows + col * A_row_inc + A_row_start] * v[v_start + v_inc * col];
+ work[lid] = dot_prod;
+
+ for (unsigned int stride = blockDim.x/2; stride>0; stride>>=1){
+ __syncthreads();
+ if (lid < stride)
+ work[lid] += work[lid+stride];
+ }
+
+ if (lid == 0)
+ result[row * result_inc + result_start] = work[0];
+ }
+}
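For reference (not part of the diff): the transposed product above lets the threads of a block accumulate partial dot products into the shared work buffer and then combines them with a halving tree reduction; that reduction only covers every element if the block size is a power of two no larger than the buffer (128 here), which appears to be the assumed launch configuration. A serial sketch of the same reduction:

  #include <cstddef>

  // Tree reduction over a buffer of length n, written serially.
  // Assumes n is a power of two, matching the kernel's stride loop.
  template<typename T>
  T tree_reduce(T *work, std::size_t n)
  {
    for (std::size_t stride = n / 2; stride > 0; stride >>= 1)
      for (std::size_t lid = 0; lid < stride; ++lid)
        work[lid] += work[lid + stride];
    return work[0];
  }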
+
+
+//
+// matrix-matrix products
+//
+
+
+
+
+//
+// scaled rank-1-update
+//
+
+// alpha on CPU
+template<typename NumericT>
+__global__ void scaled_rank1_update_col_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ NumericT val,
+ unsigned int options2,
+
+ const NumericT * vec1,
+ unsigned int start1,
+ unsigned int inc1,
+ unsigned int size1,
+
+ const NumericT * vec2,
+ unsigned int start2,
+ unsigned int inc2,
+ unsigned int size2)
+{
+ NumericT alpha = val;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+ if (options2 & (1 << 1))
+ alpha = NumericT(1) / alpha;
+
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ {
+ NumericT tmp = alpha * vec1[row * inc1 + start1];
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2];
+ }
+}
+
+
+// alpha on GPU
+template<typename NumericT>
+__global__ void scaled_rank1_update_col_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * val,
+ unsigned int options2,
+
+ const NumericT * vec1,
+ unsigned int start1,
+ unsigned int inc1,
+ unsigned int size1,
+
+ const NumericT * vec2,
+ unsigned int start2,
+ unsigned int inc2,
+ unsigned int size2)
+{
+ NumericT alpha = *val;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+ if (options2 & (1 << 1))
+ alpha = NumericT(1) / alpha;
+
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ {
+ NumericT tmp = alpha * vec1[row * inc1 + start1];
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2];
+ }
+}
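For reference (not part of the diff): both kernels above perform the scaled rank-1 update A += alpha * vec1 * vec2^T, with alpha optionally negated or inverted via the options2 bits. A serial sketch for a column-major matrix without offsets or strides, names local to this illustration:

  // Serial reference: A += alpha * vec1 * vec2^T, column-major storage.
  template<typename NumericT>
  void rank1_update_reference(NumericT *A, unsigned int rows, unsigned int cols,
                              unsigned int internal_rows,
                              NumericT alpha,
                              const NumericT *vec1, const NumericT *vec2)
  {
    for (unsigned int row = 0; row < rows; ++row)
    {
      NumericT tmp = alpha * vec1[row];
      for (unsigned int col = 0; col < cols; ++col)
        A[row + col * internal_rows] += tmp * vec2[col];
    }
  }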
+
+
+template <typename T>
+__global__ void bidiag_pack_row_major_kernel(
+ T * A,
+ T * D,
+ T * S,
+ unsigned int size1,
+ unsigned int size2,
+ unsigned int stride)
+{
+ unsigned int size = min(size1, size2);
+ if(blockIdx.x * blockDim.x + threadIdx.x == 0)
+ S[0] = 0;
+
+ for(unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+ i < size;
+ i += gridDim.x * blockDim.x)
+ {
+ D[i] = A[i*stride + i];
+ S[i+1] = (i + 1 < size2) ? A[i*stride + (i + 1)] : 0;
+ }
+}
+
+template <typename T>
+__global__ void bidiag_pack_column_major_kernel(
+ T * A,
+ T * D,
+ T * S,
+ unsigned int size1,
+ unsigned int size2,
+ unsigned int stride)
+{
+ unsigned int size = min(size1, size2);
+ if(blockIdx.x * blockDim.x + threadIdx.x == 0)
+ S[0] = 0;
+
+ for(unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+ i < size;
+ i += gridDim.x * blockDim.x)
+ {
+ D[i] = A[i*stride + i];
+ S[i+1] = (i + 1 < size2) ? A[i + (i + 1) * stride] : 0;
+ }
+}
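For reference (not part of the diff): the two bidiag_pack kernels above extract the diagonal into D and the superdiagonal into S (shifted by one, with S[0] = 0) from a row-major or column-major matrix respectively. The serial sketch below mirrors the row-major variant:

  // Serial reference for bidiag_pack (row-major layout with the given stride).
  template<typename T>
  void bidiag_pack_reference(const T *A, T *D, T *S,
                             unsigned int size1, unsigned int size2, unsigned int stride)
  {
    unsigned int size = size1 < size2 ? size1 : size2;
    S[0] = 0;
    for (unsigned int i = 0; i < size; ++i)
    {
      D[i]     = A[i * stride + i];
      S[i + 1] = (i + 1 < size2) ? A[i * stride + (i + 1)] : T(0);
    }
  }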
+
+
+
+template<typename T>
+__global__ void copy_col_row_major_kernel(
+ T * A,
+ T * V,
+ unsigned int row_start,
+ unsigned int col_start,
+ unsigned int size,
+ unsigned int stride)
+{
+ unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
+ unsigned int sz = gridDim.x * blockDim.x;
+
+ for(unsigned int i = row_start + x; i < size; i += sz)
+ {
+ V[i - row_start] = A[i * stride + col_start];
+ }
+}
+
+template<typename T>
+__global__ void copy_col_column_major_kernel(
+ T * A,
+ T * V,
+ unsigned int row_start,
+ unsigned int col_start,
+ unsigned int size,
+ unsigned int stride)
+{
+ unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
+ unsigned int sz = gridDim.x * blockDim.x;
+
+ for(unsigned int i = row_start + x; i < size; i += sz)
+ {
+ V[i - row_start] = A[i + col_start * stride];
+ }
+}
+
+template<typename T>
+__global__ void copy_row_row_major_kernel(
+ T * A,
+ T * V,
+ unsigned int row_start,
+ unsigned int col_start,
+ unsigned int size,
+ unsigned int stride)
+{
+ unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
+ unsigned int sz = gridDim.x * blockDim.x;
+
+ for(unsigned int i = col_start + x; i < size; i += sz)
+ {
+ V[i - col_start] = A[row_start * stride + i];
+ }
+
+}
+
+template<typename T>
+__global__ void copy_row_column_major_kernel(
+ T * A,
+ T * V,
+ unsigned int row_start,
+ unsigned int col_start,
+ unsigned int size,
+ unsigned int stride)
+{
+ unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
+ unsigned int sz = gridDim.x * blockDim.x;
+
+ for(unsigned int i = col_start + x; i < size; i += sz)
+ {
+ V[i - col_start] = A[row_start + i * stride];
+ }
+
+}
+
+
+
+template<typename T>
+__global__ void house_update_A_left_row_major_kernel(
+ T * A,
+ T * V, //householder vector
+ unsigned int row_start,
+ unsigned int col_start,
+ unsigned int size1,
+ unsigned int size2,
+ unsigned int stride)
+{
+ T ss = 0;
+
+ for(unsigned int i = blockIdx.x * blockDim.x + threadIdx.x + col_start;
+ i < size2;
+ i += gridDim.x * blockDim.x)
+ {
+ ss = 0;
+ for(unsigned int j = row_start; j < size1; j++)
+ ss = ss +(V[j] * A[j * stride + i]);
+
+ for(unsigned int j = row_start; j < size1; j++)
+ A[j * stride + i] = A[j * stride + i] - (2 * V[j] * ss);
+ }
+}
+
+template<typename T>
+__global__ void house_update_A_left_column_major_kernel(
+ T * A,
+ T * V, //householder vector
+ unsigned int row_start,
+ unsigned int col_start,
+ unsigned int size1,
+ unsigned int size2,
+ unsigned int stride)
+{
+ T ss = 0;
+
+ for(unsigned int i = blockIdx.x * blockDim.x + threadIdx.x + col_start;
+ i < size2;
+ i += gridDim.x * blockDim.x)
+ {
+ ss = 0;
+ for(unsigned int j = row_start; j < size1; j++)
+ ss = ss +(V[j] * A[j + i * stride]);
+
+ for(unsigned int j = row_start; j < size1; j++)
+ A[j + i * stride] = A[j + i * stride] - (2 * V[j] * ss);
+ }
+}
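For reference (not part of the diff): the two kernels above apply a Householder reflector from the left, column by column, over rows from row_start onward; the fixed factor of 2 corresponds to a reflector built from a unit-norm vector V, which the surrounding QR/SVD driver code is presumably responsible for providing. A serial sketch for a single column:

  // Serial reference: apply (I - 2 v v^T) to one column of A over [row_start, size1).
  // Assumes v has unit norm over that range.
  template<typename T>
  void reflect_column(T *column, const T *v, unsigned int row_start, unsigned int size1)
  {
    T ss = 0;
    for (unsigned int j = row_start; j < size1; ++j)
      ss += v[j] * column[j];
    for (unsigned int j = row_start; j < size1; ++j)
      column[j] -= 2 * v[j] * ss;
  }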
+
+
+
+template<typename T>
+__global__ void house_update_A_right_row_major_kernel(
+ T * A,
+ T * V, //householder vector
+ unsigned int row_start,
+ unsigned int col_start,
+ unsigned int size1,
+ unsigned int size2,
+ unsigned int stride)
+{
+ __shared__ T sums[128];
+ T ss = 0;
+
+ for(unsigned int i = blockIdx.x + row_start; i < size1; i+= gridDim.x)
+ {
+ ss = 0;
+ for(unsigned int j = threadIdx.x; j < size2; j+= blockDim.x)
+ ss = ss + (V[j] * A[i * stride + j]);
+ sums[threadIdx.x] = ss;
+
+ __syncthreads();
+ col_reduce_lcl_array(sums, threadIdx.x, blockDim.x);
+ __syncthreads();
+
+ T sum_Av = sums[0];
+
+ for(unsigned int j = threadIdx.x; j < size2; j+= blockDim.x)
+ A[i * stride + j] = A[i * stride + j] - (2 * V[j] * sum_Av);
+ }
+}
+
+template<typename T>
+__global__ void house_update_A_right_column_major_kernel(
+ T * A,
+ T * V, //householder vector
+ unsigned int row_start,
+ unsigned int col_start,
+ unsigned int size1,
+ unsigned int size2,
+ unsigned int stride)
+{
+ __shared__ T sums[128];
+ T ss = 0;
+
+ for(unsigned int i = blockIdx.x + row_start; i < size1; i+= gridDim.x)
+ {
+ ss = 0;
+ for(unsigned int j = threadIdx.x; j < size2; j+= blockDim.x)
+ ss = ss + (V[j] * A[i + j * stride]);
+ sums[threadIdx.x] = ss;
+
+ __syncthreads();
+ col_reduce_lcl_array(sums, threadIdx.x, blockDim.x);
+ __syncthreads();
+
+ T sum_Av = sums[0];
+
+ for(unsigned int j = threadIdx.x; j < size2; j+= blockDim.x)
+ A[i + j * stride] = A[i + j * stride] - (2 * V[j] * sum_Av);
+ }
+}
+
+
+
+template<typename T>
+__device__ void col_reduce_lcl_array(
+ T * sums,
+ unsigned int th_Idx,
+ unsigned int bl_Dim)
+{
+ unsigned int step = bl_Dim >> 1;
+
+ while(step > 0)
+ {
+ if(th_Idx < step)
+ sums[th_Idx] += sums[th_Idx + step];
+ step >>= 1;
+ __syncthreads();
+ }
+}
+
+
+template <typename T>
+__global__ void house_update_QL_row_major_kernel(
+ T * QL,
+ T * V,
+ unsigned int size1,
+ unsigned int strideQ)
+{
+ __shared__ T sums[128];
+ T ss = 0;
+ for(unsigned int i = blockIdx.x; i < size1; i += gridDim.x)
+ {
+ ss = 0;
+ for(unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
+ ss = ss + (V[j] * QL[i * strideQ + j]);
+ sums[threadIdx.x] = ss;
+
+ __syncthreads();
+ col_reduce_lcl_array(sums, threadIdx.x, blockDim.x);
+ __syncthreads();
+
+ T sum_Qv = sums[0];
+
+ for(unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
+ QL[i * strideQ + j] = QL[i * strideQ + j] - (2 * V[j] * sum_Qv);
+ }
+}
+
+template <typename T>
+__global__ void house_update_QL_column_major_kernel(
+ T * QL,
+ T * V,
+ unsigned int size1,
+ unsigned int strideQ)
+{
+ __shared__ T sums[128];
+ T ss = 0;
+ for(unsigned int i = blockIdx.x; i < size1; i += gridDim.x)
+ {
+ ss = 0;
+ for(unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
+ ss = ss + (V[j] * QL[i + j * strideQ]);
+ sums[threadIdx.x] = ss;
+
+ __syncthreads();
+ col_reduce_lcl_array(sums, threadIdx.x, blockDim.x);
+ __syncthreads();
+
+ T sum_Qv = sums[0];
+
+ for(unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
+ QL[i + j * strideQ] = QL[i + j * strideQ] - (2 * V[j] * sum_Qv);
+ }
+}
+
+
+template <typename T>
+__global__ void givens_next_row_major_kernel(
+ T * matr,
+ T * cs,
+ T * ss,
+ unsigned int size,
+ unsigned int stride,
+ unsigned int start_i,
+ unsigned int end_i)
+{
+ unsigned int j = blockIdx.x * blockDim.x + threadIdx.x;
+ __shared__ T cs_lcl[256];
+ __shared__ T ss_lcl[256];
+
+ T x = (j < size) ? matr[(end_i + 1) + j * stride] : 0;
+
+ unsigned int elems_num = end_i - start_i + 1;
+ unsigned int block_num = (elems_num + blockDim.x - 1) / blockDim.x;
+
+ for(unsigned int block_id = 0; block_id < block_num; block_id++)
+ {
+ unsigned int to = min(elems_num - block_id * blockDim.x, blockDim.x);
+
+ if(threadIdx.x < to)
+ {
+ cs_lcl[threadIdx.x] = cs[end_i - (threadIdx.x + block_id * blockDim.x)];
+ ss_lcl[threadIdx.x] = ss[end_i - (threadIdx.x + block_id * blockDim.x)];
+ }
+ __syncthreads();
+ if(j < size)
+ {
+ for(unsigned int ind = 0; ind < to; ind++)
+ {
+ unsigned int i = end_i - (ind + block_id * blockDim.x);
+ T z = matr[i + j * stride];
+ T cs_val = cs_lcl[ind];
+ T ss_val = ss_lcl[ind];
+ matr[(i + 1) + j * stride] = x * cs_val + z * ss_val;
+ x = -x * ss_val + z * cs_val;
+ }
+ }
+ __syncthreads();
+ }
+ if(j < size)
+ matr[(start_i) + j * stride] = x;
+}
+
+template <typename T>
+__global__ void givens_next_column_major_kernel(
+ T * matr,
+ T * cs,
+ T * ss,
+ unsigned int size,
+ unsigned int stride,
+ unsigned int start_i,
+ unsigned int end_i)
+{
+ unsigned int j = blockIdx.x * blockDim.x + threadIdx.x;
+ __shared__ T cs_lcl[256];
+ __shared__ T ss_lcl[256];
+
+ T x = (j < size) ? matr[(end_i + 1) *stride + j] : 0;
+
+ unsigned int elems_num = end_i - start_i + 1;
+ unsigned int block_num = (elems_num + blockDim.x - 1) / blockDim.x;
+
+ for(unsigned int block_id = 0; block_id < block_num; block_id++)
+ {
+ unsigned int to = min(elems_num - block_id * blockDim.x, blockDim.x);
+
+ if(threadIdx.x < to)
+ {
+ cs_lcl[threadIdx.x] = cs[end_i - (threadIdx.x + block_id * blockDim.x)];
+ ss_lcl[threadIdx.x] = ss[end_i - (threadIdx.x + block_id * blockDim.x)];
+ }
+ __syncthreads();
+ if(j < size)
+ {
+ for(unsigned int ind = 0; ind < to; ind++)
+ {
+ unsigned int i = end_i - (ind + block_id * blockDim.x);
+ T z = matr[i *stride + j];
+ T cs_val = cs_lcl[ind];
+ T ss_val = ss_lcl[ind];
+ matr[(i + 1) * stride + j] = x * cs_val + z * ss_val;
+ x = -x * ss_val + z * cs_val;
+ }
+ }
+ __syncthreads();
+ }
+ if(j < size)
+ matr[(start_i) * stride + j] = x;
+}
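For reference (not part of the diff): the two givens_next kernels above sweep a chain of cached Givens rotations over each column, rotating the row pair (i, i+1) for i running from end_i down to start_i while carrying the updated row-i value to the next pair. The per-pair rotation they apply is sketched below; names are local to this illustration.

  // Serial reference: one plane rotation on the entry pair (row i, row i+1)
  // of a single column, with cached coefficients (c, s).
  template<typename T>
  void apply_givens_pair(T &row_i, T &row_ip1, T c, T s)
  {
    T below =  row_ip1 * c + row_i * s;  // new value of row i+1
    T above = -row_ip1 * s + row_i * c;  // new value of row i, carried to the next pair
    row_ip1 = below;
    row_i   = above;
  }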
+
+
+
+
+
+} // namespace cuda
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp
new file mode 100644
index 0000000..24bcf96
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp
@@ -0,0 +1,761 @@
+#ifndef VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_SOLVE_HPP_
+#define VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_SOLVE_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp
+ @brief Implementations of direct triangular solvers for sparse matrices using CUDA
+*/
+
+#include "viennacl/forwards.h"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+//
+// Compressed matrix
+//
+
+//
+// non-transposed
+//
+
+template<typename NumericT>
+__global__ void csr_unit_lu_forward_kernel(
+ const unsigned int * row_indices,
+ const unsigned int * column_indices,
+ const NumericT * elements,
+ NumericT * vector,
+ unsigned int size)
+{
+ __shared__ unsigned int col_index_buffer[128];
+ __shared__ NumericT element_buffer[128];
+ __shared__ NumericT vector_buffer[128];
+
+ unsigned int nnz = row_indices[size];
+ unsigned int current_row = 0;
+ unsigned int row_at_window_start = 0;
+ NumericT current_vector_entry = vector[0];
+ unsigned int loop_end = (nnz / blockDim.x + 1) * blockDim.x;
+ unsigned int next_row = row_indices[1];
+
+ for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
+ {
+ //load into shared memory (coalesced access):
+ if (i < nnz)
+ {
+ element_buffer[threadIdx.x] = elements[i];
+ unsigned int tmp = column_indices[i];
+ col_index_buffer[threadIdx.x] = tmp;
+ vector_buffer[threadIdx.x] = vector[tmp];
+ }
+
+ __syncthreads();
+
+ //now a single thread does the remaining work in shared memory:
+ if (threadIdx.x == 0)
+ {
+ // traverse through all the loaded data:
+ for (unsigned int k=0; k<blockDim.x; ++k)
+ {
+ if (current_row < size && i+k == next_row) //current row is finished. Write back result
+ {
+ vector[current_row] = current_vector_entry;
+ ++current_row;
+ if (current_row < size) //load next row's data
+ {
+ next_row = row_indices[current_row+1];
+ current_vector_entry = vector[current_row];
+ }
+ }
+
+ if (current_row < size && col_index_buffer[k] < current_row) //substitute
+ {
+ if (col_index_buffer[k] < row_at_window_start) //use recently computed results
+ current_vector_entry -= element_buffer[k] * vector_buffer[k];
+ else if (col_index_buffer[k] < current_row) //use buffered data
+ current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
+ }
+
+ } // for k
+
+ row_at_window_start = current_row;
+ } // if (get_local_id(0) == 0)
+
+ __syncthreads();
+ } //for i
+}
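For reference (not part of the diff): the kernel above performs in-place forward substitution for a unit lower triangular CSR matrix, staging windows of nonzeros in shared memory and letting a single thread walk them in order. The serial equivalent is the usual CSR sweep sketched below; it relies only on the strictly lower entries and never divides, because the diagonal is implicitly one.

  // Serial reference: in-place forward substitution, unit lower triangular CSR matrix.
  template<typename NumericT>
  void csr_unit_lower_solve_reference(const unsigned int *row_indices,
                                      const unsigned int *column_indices,
                                      const NumericT *elements,
                                      NumericT *vector, unsigned int size)
  {
    for (unsigned int row = 0; row < size; ++row)
    {
      NumericT value = vector[row];
      for (unsigned int k = row_indices[row]; k < row_indices[row + 1]; ++k)
        if (column_indices[k] < row)            // strictly lower part only
          value -= elements[k] * vector[column_indices[k]];
      vector[row] = value;                      // unit diagonal: no division needed
    }
  }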
+
+
+
+template<typename NumericT>
+__global__ void csr_lu_forward_kernel(
+ const unsigned int * row_indices,
+ const unsigned int * column_indices,
+ const NumericT * elements,
+ NumericT * vector,
+ unsigned int size)
+{
+ __shared__ unsigned int col_index_buffer[128];
+ __shared__ NumericT element_buffer[128];
+ __shared__ NumericT vector_buffer[128];
+
+ unsigned int nnz = row_indices[size];
+ unsigned int current_row = 0;
+ unsigned int row_at_window_start = 0;
+ NumericT current_vector_entry = vector[0];
+ NumericT diagonal_entry = 0;
+ unsigned int loop_end = (nnz / blockDim.x + 1) * blockDim.x;
+ unsigned int next_row = row_indices[1];
+
+ for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
+ {
+ //load into shared memory (coalesced access):
+ if (i < nnz)
+ {
+ element_buffer[threadIdx.x] = elements[i];
+ unsigned int tmp = column_indices[i];
+ col_index_buffer[threadIdx.x] = tmp;
+ vector_buffer[threadIdx.x] = vector[tmp];
+ }
+
+ __syncthreads();
+
+ //now a single thread does the remaining work in shared memory:
+ if (threadIdx.x == 0)
+ {
+ // traverse through all the loaded data:
+ for (unsigned int k=0; k<blockDim.x; ++k)
+ {
+ if (current_row < size && i+k == next_row) //current row is finished. Write back result
+ {
+ vector[current_row] = current_vector_entry / diagonal_entry;
+ ++current_row;
+ if (current_row < size) //load next row's data
+ {
+ next_row = row_indices[current_row+1];
+ current_vector_entry = vector[current_row];
+ }
+ }
+
+ if (current_row < size && col_index_buffer[k] < current_row) //substitute
+ {
+ if (col_index_buffer[k] < row_at_window_start) //entry was already final before this window was buffered: use buffered copy
+ current_vector_entry -= element_buffer[k] * vector_buffer[k];
+ else if (col_index_buffer[k] < current_row) //entry was updated within this window: re-read current value from global memory
+ current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
+ }
+ else if (col_index_buffer[k] == current_row)
+ diagonal_entry = element_buffer[k];
+
+ } // for k
+
+ row_at_window_start = current_row;
+ } // if (get_local_id(0) == 0)
+
+ __syncthreads();
+ } //for i
+}
+
+
+template<typename NumericT>
+__global__ void csr_unit_lu_backward_kernel(
+ const unsigned int * row_indices,
+ const unsigned int * column_indices,
+ const NumericT * elements,
+ NumericT * vector,
+ unsigned int size)
+{
+ __shared__ unsigned int col_index_buffer[128];
+ __shared__ NumericT element_buffer[128];
+ __shared__ NumericT vector_buffer[128];
+
+ unsigned int nnz = row_indices[size];
+ unsigned int current_row = size-1;
+ unsigned int row_at_window_start = size-1;
+ NumericT current_vector_entry = vector[size-1];
+ unsigned int loop_end = ( (nnz - 1) / blockDim.x) * blockDim.x;
+ unsigned int next_row = row_indices[size-1];
+
+ unsigned int i = loop_end + threadIdx.x;
+ while (1)
+ {
+ //load into shared memory (coalesced access):
+ if (i < nnz)
+ {
+ element_buffer[threadIdx.x] = elements[i];
+ unsigned int tmp = column_indices[i];
+ col_index_buffer[threadIdx.x] = tmp;
+ vector_buffer[threadIdx.x] = vector[tmp];
+ }
+
+ __syncthreads();
+
+ //now a single thread does the remaining work in shared memory:
+ if (threadIdx.x == 0)
+ {
+ // traverse through all the loaded data from back to front:
+ for (unsigned int k2=0; k2<blockDim.x; ++k2)
+ {
+ unsigned int k = (blockDim.x - k2) - 1;
+
+ if (i+k >= nnz)
+ continue;
+
+ if (col_index_buffer[k] > row_at_window_start) //entry was already final before this window was buffered: use buffered copy
+ current_vector_entry -= element_buffer[k] * vector_buffer[k];
+ else if (col_index_buffer[k] > current_row) //entry was updated within this window: re-read current value from global memory
+ current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
+
+ if (i+k == next_row) //current row is finished. Write back result
+ {
+ vector[current_row] = current_vector_entry;
+ if (current_row > 0) //load next row's data
+ {
+ --current_row;
+ next_row = row_indices[current_row];
+ current_vector_entry = vector[current_row];
+ }
+ }
+
+
+ } // for k
+
+ row_at_window_start = current_row;
+ } // if (get_local_id(0) == 0)
+
+ __syncthreads();
+
+ if (i < blockDim.x)
+ break;
+
+ i -= blockDim.x;
+ } //for i
+}
+
+
+
+template<typename NumericT>
+__global__ void csr_lu_backward_kernel(
+ const unsigned int * row_indices,
+ const unsigned int * column_indices,
+ const NumericT * elements,
+ NumericT * vector,
+ unsigned int size)
+{
+ __shared__ unsigned int col_index_buffer[128];
+ __shared__ NumericT element_buffer[128];
+ __shared__ NumericT vector_buffer[128];
+
+ unsigned int nnz = row_indices[size];
+ unsigned int current_row = size-1;
+ unsigned int row_at_window_start = size-1;
+ NumericT current_vector_entry = vector[size-1];
+ NumericT diagonal_entry;
+ unsigned int loop_end = ( (nnz - 1) / blockDim.x) * blockDim.x;
+ unsigned int next_row = row_indices[size-1];
+
+ unsigned int i = loop_end + threadIdx.x;
+ while (1)
+ {
+ //load into shared memory (coalesced access):
+ if (i < nnz)
+ {
+ element_buffer[threadIdx.x] = elements[i];
+ unsigned int tmp = column_indices[i];
+ col_index_buffer[threadIdx.x] = tmp;
+ vector_buffer[threadIdx.x] = vector[tmp];
+ }
+
+ __syncthreads();
+
+ //now a single thread does the remaining work in shared memory:
+ if (threadIdx.x == 0)
+ {
+ // traverse through all the loaded data from back to front:
+ for (unsigned int k2=0; k2<blockDim.x; ++k2)
+ {
+ unsigned int k = (blockDim.x - k2) - 1;
+
+ if (i+k >= nnz)
+ continue;
+
+ if (col_index_buffer[k] > row_at_window_start) //entry was already final before this window was buffered: use buffered copy
+ current_vector_entry -= element_buffer[k] * vector_buffer[k];
+ else if (col_index_buffer[k] > current_row) //entry was updated within this window: re-read current value from global memory
+ current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
+ else if (col_index_buffer[k] == current_row)
+ diagonal_entry = element_buffer[k];
+
+ if (i+k == next_row) //current row is finished. Write back result
+ {
+ vector[current_row] = current_vector_entry / diagonal_entry;
+ if (current_row > 0) //load next row's data
+ {
+ --current_row;
+ next_row = row_indices[current_row];
+ current_vector_entry = vector[current_row];
+ }
+ }
+
+
+ } // for k
+
+ row_at_window_start = current_row;
+ } // if (get_local_id(0) == 0)
+
+ __syncthreads();
+
+ if (i < blockDim.x)
+ break;
+
+ i -= blockDim.x;
+ } //for i
+}
+
+
+
+//
+// transposed
+//
+
+
+template<typename NumericT>
+__global__ void csr_trans_lu_forward_kernel2(
+ const unsigned int * row_indices,
+ const unsigned int * column_indices,
+ const NumericT * elements,
+ NumericT * vector,
+ unsigned int size)
+{
+ for (unsigned int row = 0; row < size; ++row)
+ {
+ NumericT result_entry = vector[row];
+
+ unsigned int row_start = row_indices[row];
+ unsigned int row_stop = row_indices[row + 1];
+ for (unsigned int entry_index = row_start + threadIdx.x; entry_index < row_stop; entry_index += blockDim.x)
+ {
+ unsigned int col_index = column_indices[entry_index];
+ if (col_index > row)
+ vector[col_index] -= result_entry * elements[entry_index];
+ }
+
+ __syncthreads();
+ }
+}
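
This kernel parallelizes only the inner scatter loop over the threads of a block; its sequential structure is column-oriented substitution, where each finished entry is immediately propagated into all later entries through the stored row. A scalar sketch of that structure (illustrative only, mirroring the kernel's arguments):

    template<typename NumericT>
    void csr_trans_forward_scatter_reference(const unsigned int * row_indices,
                                             const unsigned int * column_indices,
                                             const NumericT * elements,
                                             NumericT * vector,
                                             unsigned int size)
    {
      for (unsigned int row = 0; row < size; ++row)
      {
        NumericT result_entry = vector[row];      // final once all previous rows have been processed
        for (unsigned int j = row_indices[row]; j < row_indices[row+1]; ++j)
          if (column_indices[j] > row)            // scatter into entries that are not yet final
            vector[column_indices[j]] -= result_entry * elements[j];
      }
    }
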
+
+template<typename NumericT>
+__global__ void csr_trans_unit_lu_forward_kernel(
+ const unsigned int * row_indices,
+ const unsigned int * column_indices,
+ const NumericT * elements,
+ NumericT * vector,
+ unsigned int size)
+{
+ __shared__ unsigned int row_index_lookahead[256];
+ __shared__ unsigned int row_index_buffer[256];
+
+ unsigned int row_index;
+ unsigned int col_index;
+ NumericT matrix_entry;
+ unsigned int nnz = row_indices[size];
+ unsigned int row_at_window_start = 0;
+ unsigned int row_at_window_end = 0;
+ unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;
+
+ for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
+ {
+ col_index = (i < nnz) ? column_indices[i] : 0;
+ matrix_entry = (i < nnz) ? elements[i] : 0;
+ row_index_lookahead[threadIdx.x] = (row_at_window_start + threadIdx.x < size) ? row_indices[row_at_window_start + threadIdx.x] : nnz;
+
+ __syncthreads();
+
+ if (i < nnz)
+ {
+ unsigned int row_index_inc = 0;
+ while (i >= row_index_lookahead[row_index_inc + 1])
+ ++row_index_inc;
+ row_index = row_at_window_start + row_index_inc;
+ row_index_buffer[threadIdx.x] = row_index;
+ }
+ else
+ {
+ row_index = size+1;
+ row_index_buffer[threadIdx.x] = size - 1;
+ }
+
+ __syncthreads();
+
+ row_at_window_start = row_index_buffer[0];
+ row_at_window_end = row_index_buffer[blockDim.x - 1];
+
+ //forward elimination
+ for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row)
+ {
+ NumericT result_entry = vector[row];
+
+ if ( (row_index == row) && (col_index > row) )
+ vector[col_index] -= result_entry * matrix_entry;
+
+ __syncthreads();
+ }
+
+ row_at_window_start = row_at_window_end;
+ }
+
+}
+
+template<typename NumericT>
+__global__ void csr_trans_lu_forward_kernel(
+ const unsigned int * row_indices,
+ const unsigned int * column_indices,
+ const NumericT * elements,
+ const NumericT * diagonal_entries,
+ NumericT * vector,
+ unsigned int size)
+{
+ __shared__ unsigned int row_index_lookahead[256];
+ __shared__ unsigned int row_index_buffer[256];
+
+ unsigned int row_index;
+ unsigned int col_index;
+ NumericT matrix_entry;
+ unsigned int nnz = row_indices[size];
+ unsigned int row_at_window_start = 0;
+ unsigned int row_at_window_end = 0;
+ unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;
+
+ for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
+ {
+ col_index = (i < nnz) ? column_indices[i] : 0;
+ matrix_entry = (i < nnz) ? elements[i] : 0;
+ row_index_lookahead[threadIdx.x] = (row_at_window_start + threadIdx.x < size) ? row_indices[row_at_window_start + threadIdx.x] : nnz;
+
+ __syncthreads();
+
+ if (i < nnz)
+ {
+ unsigned int row_index_inc = 0;
+ while (i >= row_index_lookahead[row_index_inc + 1])
+ ++row_index_inc;
+ row_index = row_at_window_start + row_index_inc;
+ row_index_buffer[threadIdx.x] = row_index;
+ }
+ else
+ {
+ row_index = size+1;
+ row_index_buffer[threadIdx.x] = size - 1;
+ }
+
+ __syncthreads();
+
+ row_at_window_start = row_index_buffer[0];
+ row_at_window_end = row_index_buffer[blockDim.x - 1];
+
+ //forward elimination
+ for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row)
+ {
+ NumericT result_entry = vector[row] / diagonal_entries[row];
+
+ if ( (row_index == row) && (col_index > row) )
+ vector[col_index] -= result_entry * matrix_entry;
+
+ __syncthreads();
+ }
+
+ row_at_window_start = row_at_window_end;
+ }
+
+ // final step: Divide vector by diagonal entries:
+ for (unsigned int i = threadIdx.x; i < size; i += blockDim.x)
+ vector[i] /= diagonal_entries[i];
+
+}
+
+
+template<typename NumericT>
+__global__ void csr_trans_unit_lu_backward_kernel(
+ const unsigned int * row_indices,
+ const unsigned int * column_indices,
+ const NumericT * elements,
+ NumericT * vector,
+ unsigned int size)
+{
+ __shared__ unsigned int row_index_lookahead[256];
+ __shared__ unsigned int row_index_buffer[256];
+
+ unsigned int row_index;
+ unsigned int col_index;
+ NumericT matrix_entry;
+ unsigned int nnz = row_indices[size];
+ unsigned int row_at_window_start = size;
+ unsigned int row_at_window_end;
+ unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;
+
+ for (unsigned int i2 = threadIdx.x; i2 < loop_end; i2 += blockDim.x)
+ {
+ unsigned int i = (nnz - i2) - 1;
+ col_index = (i2 < nnz) ? column_indices[i] : 0;
+ matrix_entry = (i2 < nnz) ? elements[i] : 0;
+ row_index_lookahead[threadIdx.x] = (row_at_window_start >= threadIdx.x) ? row_indices[row_at_window_start - threadIdx.x] : 0;
+
+ __syncthreads();
+
+ if (i2 < nnz)
+ {
+ unsigned int row_index_dec = 0;
+ while (row_index_lookahead[row_index_dec] > i)
+ ++row_index_dec;
+ row_index = row_at_window_start - row_index_dec;
+ row_index_buffer[threadIdx.x] = row_index;
+ }
+ else
+ {
+ row_index = size+1;
+ row_index_buffer[threadIdx.x] = 0;
+ }
+
+ __syncthreads();
+
+ row_at_window_start = row_index_buffer[0];
+ row_at_window_end = row_index_buffer[blockDim.x - 1];
+
+ //backward elimination
+ for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2)
+ {
+ unsigned int row = row_at_window_start - row2;
+ NumericT result_entry = vector[row];
+
+ if ( (row_index == row) && (col_index < row) )
+ vector[col_index] -= result_entry * matrix_entry;
+
+ __syncthreads();
+ }
+
+ row_at_window_start = row_at_window_end;
+ }
+
+}
+
+
+
+template<typename NumericT>
+__global__ void csr_trans_lu_backward_kernel2(
+ const unsigned int * row_indices,
+ const unsigned int * column_indices,
+ const NumericT * elements,
+ const NumericT * diagonal_entries,
+ NumericT * vector,
+ unsigned int size)
+{
+ NumericT result_entry = 0;
+
+ //backward elimination, using U and D:
+ for (unsigned int row2 = 0; row2 < size; ++row2)
+ {
+ unsigned int row = (size - row2) - 1;
+ result_entry = vector[row] / diagonal_entries[row];
+
+ unsigned int row_start = row_indices[row];
+ unsigned int row_stop = row_indices[row + 1];
+ for (unsigned int entry_index = row_start + threadIdx.x; entry_index < row_stop; ++entry_index)
+ {
+ unsigned int col_index = column_indices[entry_index];
+ if (col_index < row)
+ vector[col_index] -= result_entry * elements[entry_index];
+ }
+
+ __syncthreads();
+
+ if (threadIdx.x == 0)
+ vector[row] = result_entry;
+ }
+}
+
+
+template<typename NumericT>
+__global__ void csr_trans_lu_backward_kernel(
+ const unsigned int * row_indices,
+ const unsigned int * column_indices,
+ const NumericT * elements,
+ const NumericT * diagonal_entries,
+ NumericT * vector,
+ unsigned int size)
+{
+ __shared__ unsigned int row_index_lookahead[256];
+ __shared__ unsigned int row_index_buffer[256];
+
+ unsigned int row_index;
+ unsigned int col_index;
+ NumericT matrix_entry;
+ unsigned int nnz = row_indices[size];
+ unsigned int row_at_window_start = size;
+ unsigned int row_at_window_end;
+ unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;
+
+ for (unsigned int i2 = threadIdx.x; i2 < loop_end; i2 += blockDim.x)
+ {
+ unsigned int i = (nnz - i2) - 1;
+ col_index = (i2 < nnz) ? column_indices[i] : 0;
+ matrix_entry = (i2 < nnz) ? elements[i] : 0;
+ row_index_lookahead[threadIdx.x] = (row_at_window_start >= threadIdx.x) ? row_indices[row_at_window_start - threadIdx.x] : 0;
+
+ __syncthreads();
+
+ if (i2 < nnz)
+ {
+ unsigned int row_index_dec = 0;
+ while (row_index_lookahead[row_index_dec] > i)
+ ++row_index_dec;
+ row_index = row_at_window_start - row_index_dec;
+ row_index_buffer[threadIdx.x] = row_index;
+ }
+ else
+ {
+ row_index = size+1;
+ row_index_buffer[threadIdx.x] = 0;
+ }
+
+ __syncthreads();
+
+ row_at_window_start = row_index_buffer[0];
+ row_at_window_end = row_index_buffer[blockDim.x - 1];
+
+ //backward elimination
+ for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2)
+ {
+ unsigned int row = row_at_window_start - row2;
+ NumericT result_entry = vector[row] / diagonal_entries[row];
+
+ if ( (row_index == row) && (col_index < row) )
+ vector[col_index] -= result_entry * matrix_entry;
+
+ __syncthreads();
+ }
+
+ row_at_window_start = row_at_window_end;
+ }
+
+
+ // final step: Divide vector by diagonal entries:
+ for (unsigned int i = threadIdx.x; i < size; i += blockDim.x)
+ vector[i] /= diagonal_entries[i];
+
+}
+
+
+template<typename NumericT>
+__global__ void csr_block_trans_unit_lu_forward(
+ const unsigned int * row_jumper_L, //L part (note that L is transposed in memory)
+ const unsigned int * column_indices_L,
+ const NumericT * elements_L,
+ const unsigned int * block_offsets,
+ NumericT * result,
+ unsigned int size)
+{
+ unsigned int col_start = block_offsets[2*blockIdx.x];
+ unsigned int col_stop = block_offsets[2*blockIdx.x+1];
+ unsigned int row_start = row_jumper_L[col_start];
+ unsigned int row_stop;
+ NumericT result_entry = 0;
+
+ if (col_start >= col_stop)
+ return;
+
+ //forward elimination, using L:
+ for (unsigned int col = col_start; col < col_stop; ++col)
+ {
+ result_entry = result[col];
+ row_stop = row_jumper_L[col + 1];
+ for (unsigned int buffer_index = row_start + threadIdx.x; buffer_index < row_stop; buffer_index += blockDim.x)
+ result[column_indices_L[buffer_index]] -= result_entry * elements_L[buffer_index];
+ row_start = row_stop; //for next iteration (avoid unnecessary loads from GPU RAM)
+ __syncthreads();
+ }
+
+}
+
+
+template<typename NumericT>
+__global__ void csr_block_trans_lu_backward(
+ const unsigned int * row_jumper_U, //U part (note that U is transposed in memory)
+ const unsigned int * column_indices_U,
+ const NumericT * elements_U,
+ const NumericT * diagonal_U,
+ const unsigned int * block_offsets,
+ NumericT * result,
+ unsigned int size)
+{
+ unsigned int col_start = block_offsets[2*blockIdx.x];
+ unsigned int col_stop = block_offsets[2*blockIdx.x+1];
+ unsigned int row_start;
+ unsigned int row_stop;
+ NumericT result_entry = 0;
+
+ if (col_start >= col_stop)
+ return;
+
+ //backward elimination, using U and diagonal_U
+ for (unsigned int iter = 0; iter < col_stop - col_start; ++iter)
+ {
+ unsigned int col = (col_stop - iter) - 1;
+ result_entry = result[col] / diagonal_U[col];
+ row_start = row_jumper_U[col];
+ row_stop = row_jumper_U[col + 1];
+ for (unsigned int buffer_index = row_start + threadIdx.x; buffer_index < row_stop; buffer_index += blockDim.x)
+ result[column_indices_U[buffer_index]] -= result_entry * elements_U[buffer_index];
+ __syncthreads();
+ }
+
+ //divide result vector by diagonal:
+ for (unsigned int col = col_start + threadIdx.x; col < col_stop; col += blockDim.x)
+ result[col] /= diagonal_U[col];
+}
+
+
+
+//
+// Coordinate Matrix
+//
+
+
+
+
+//
+// ELL Matrix
+//
+
+
+
+//
+// Hybrid Matrix
+//
+
+
+
+} // namespace cuda
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/spgemm.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/spgemm.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/spgemm.hpp
new file mode 100644
index 0000000..5551cda
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/spgemm.hpp
@@ -0,0 +1,793 @@
+#ifndef VIENNACL_LINALG_CUDA_SPGEMM_HPP_
+#define VIENNACL_LINALG_CUDA_SPGEMM_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/spgemm.hpp
+ @brief Implementations of sparse matrix-sparse matrix multiplication (SpGEMM) for compressed_matrix using CUDA
+*/
+
+#include <stdexcept>
+
+#include <thrust/scan.h>
+#include <thrust/device_ptr.h>
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/cuda/common.hpp"
+
+#include "viennacl/tools/timer.hpp"
+
+#include "viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
+/** @brief Loads a value from the specified address. On CUDA architecture 3.5 and above the load goes through the read-only data cache (__ldg), so repeated reads can be served from cache */
+template<typename NumericT>
+static inline __device__ NumericT load_and_cache(const NumericT *address)
+{
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+ return __ldg(address);
+#else
+ return *address;
+#endif
+}
+
+
+//
+// Stage 1: Obtain upper bound for number of elements per row in C:
+//
+template<typename IndexT>
+__device__ IndexT round_to_next_power_of_2(IndexT val)
+{
+ if (val > 32)
+ return 64; // just to indicate that we need to split/factor the matrix!
+ else if (val > 16)
+ return 32;
+ else if (val > 8)
+ return 16;
+ else if (val > 4)
+ return 8;
+ else if (val > 2)
+ return 4;
+ else if (val > 1)
+ return 2;
+ else
+ return 1;
+}
+
+template<typename IndexT>
+__global__ void compressed_matrix_gemm_stage_1(
+ const IndexT * A_row_indices,
+ const IndexT * A_col_indices,
+ IndexT A_size1,
+ const IndexT * B_row_indices,
+ IndexT *subwarpsize_per_group,
+ IndexT *max_nnz_row_A_per_group,
+ IndexT *max_nnz_row_B_per_group)
+{
+ unsigned int subwarpsize_in_thread = 0;
+ unsigned int max_nnz_row_A = 0;
+ unsigned int max_nnz_row_B = 0;
+
+ unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1;
+ unsigned int row_per_group_end = min(A_size1, rows_per_group * (blockIdx.x + 1));
+
+ for (unsigned int row = rows_per_group * blockIdx.x + threadIdx.x; row < row_per_group_end; row += blockDim.x)
+ {
+ unsigned int A_row_start = A_row_indices[row];
+ unsigned int A_row_end = A_row_indices[row+1];
+ unsigned int row_num = A_row_end - A_row_start;
+ if (row_num > 32) // too many rows in B need to be merged for a single pass
+ {
+ unsigned int subwarp_sqrt = (unsigned int)sqrt(double(row_num)) + 1;
+ subwarpsize_in_thread = max(subwarp_sqrt, subwarpsize_in_thread);
+ }
+ else
+ subwarpsize_in_thread = max(A_row_end - A_row_start, subwarpsize_in_thread);
+ max_nnz_row_A = max(max_nnz_row_A, row_num);
+ for (unsigned int j = A_row_start; j < A_row_end; ++j)
+ {
+ unsigned int col = A_col_indices[j];
+ unsigned int row_len_B = B_row_indices[col + 1] - B_row_indices[col];
+ max_nnz_row_B = max(row_len_B, max_nnz_row_B);
+ }
+ }
+
+ // reduction to obtain maximum in thread block
+ __shared__ unsigned int shared_subwarpsize[256];
+ __shared__ unsigned int shared_max_nnz_row_A[256];
+ __shared__ unsigned int shared_max_nnz_row_B[256];
+
+ shared_subwarpsize[threadIdx.x] = subwarpsize_in_thread;
+ shared_max_nnz_row_A[threadIdx.x] = max_nnz_row_A;
+ shared_max_nnz_row_B[threadIdx.x] = max_nnz_row_B;
+ for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ {
+ shared_subwarpsize[threadIdx.x] = max( shared_subwarpsize[threadIdx.x], shared_subwarpsize[threadIdx.x + stride]);
+ shared_max_nnz_row_A[threadIdx.x] = max(shared_max_nnz_row_A[threadIdx.x], shared_max_nnz_row_A[threadIdx.x + stride]);
+ shared_max_nnz_row_B[threadIdx.x] = max(shared_max_nnz_row_B[threadIdx.x], shared_max_nnz_row_B[threadIdx.x + stride]);
+ }
+ }
+
+ if (threadIdx.x == 0)
+ {
+ subwarpsize_per_group[blockIdx.x] = round_to_next_power_of_2(shared_subwarpsize[0]);
+ max_nnz_row_A_per_group[blockIdx.x] = shared_max_nnz_row_A[0];
+ max_nnz_row_B_per_group[blockIdx.x] = shared_max_nnz_row_B[0];
+ }
+}
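
To make the subwarp-size heuristic concrete (numbers chosen for illustration only): a row of A with 100 nonzeros exceeds the 32-entry limit, so the candidate becomes sqrt(100) + 1 = 11, which round_to_next_power_of_2 turns into 16. Stage 2 then performs ceil(100/16) = 7 partial merges into the scratchpad followed by one final merge over those 7 intermediate rows, so both merge levels fit within a 16-thread subwarp. Rows with at most 32 nonzeros are merged in a single pass using the next power of two of their length.
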
+
+//
+// Stage 2: Determine sparsity pattern of C
+//
+inline __device__ unsigned int merge_subwarp_symbolic(unsigned int row_B_start, unsigned int row_B_end, unsigned int const *B_col_indices, unsigned int B_size2, unsigned int subwarpsize)
+{
+ unsigned int current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2;
+
+ unsigned int num_nnz = 0;
+ while (1)
+ {
+ // determine current minimum (warp shuffle)
+ unsigned int min_index = current_front_index;
+ for (unsigned int i = subwarpsize/2; i >= 1; i /= 2)
+ min_index = min(min_index, __shfl_xor((int)min_index, (int)i));
+
+ if (min_index == B_size2)
+ break;
+
+ // update front:
+ current_front_index = (current_front_index == min_index) ? ((++row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2)
+ : current_front_index;
+ ++num_nnz;
+ }
+
+ return num_nnz;
+}
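
Stripped of the warp shuffles, the symbolic merge above is a plain k-way merge that counts the distinct column indices in the union of the participating rows of B. A scalar sketch of the same computation (illustrative only; B_size2 again serves as the "exhausted" token):

    inline unsigned int merge_symbolic_reference(const unsigned int **front,      // front[i]: current position in sorted range i
                                                 const unsigned int **front_end,  // front_end[i]: end of sorted range i
                                                 unsigned int num_fronts,
                                                 unsigned int B_size2)
    {
      unsigned int num_nnz = 0;
      while (true)
      {
        unsigned int min_index = B_size2;
        for (unsigned int i = 0; i < num_fronts; ++i)
          if (front[i] < front_end[i] && *front[i] < min_index)
            min_index = *front[i];
        if (min_index == B_size2)                  // all ranges exhausted
          break;
        for (unsigned int i = 0; i < num_fronts; ++i)
          if (front[i] < front_end[i] && *front[i] == min_index)
            ++front[i];                            // advance every range positioned at the minimum
        ++num_nnz;                                 // the minimum contributes one column to the union
      }
      return num_nnz;
    }
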
+
+inline __device__ unsigned int merge_subwarp_symbolic_double(unsigned int row_B_start, unsigned int row_B_end, unsigned int const *B_col_indices, unsigned int B_size2,
+ unsigned int *output_array, unsigned int id_in_warp, unsigned int subwarpsize)
+{
+ unsigned int current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2;
+
+ unsigned int num_nnz = 0;
+ unsigned int index_buffer = 0;
+ unsigned int buffer_size = 0;
+ while (1)
+ {
+ // determine current minimum (warp shuffle)
+ unsigned int min_index = current_front_index;
+ for (unsigned int i = subwarpsize/2; i >= 1; i /= 2)
+ min_index = min(min_index, __shfl_xor((int)min_index, (int)i));
+
+ if (min_index == B_size2)
+ break;
+
+ // update front:
+ current_front_index = (current_front_index == min_index) ? ((++row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2)
+ : current_front_index;
+
+ // write output
+ index_buffer = (id_in_warp == buffer_size) ? min_index : index_buffer;
+ ++buffer_size;
+
+ if (buffer_size == subwarpsize) // register buffer full?
+ {
+ output_array[id_in_warp] = index_buffer;
+ output_array += subwarpsize;
+ buffer_size = 0;
+ }
+
+ ++num_nnz;
+ }
+
+ // write remaining entries from register buffer:
+ if (id_in_warp < buffer_size)
+ output_array[id_in_warp] = index_buffer;
+
+ return num_nnz;
+}
+
+template<typename IndexT>
+__global__ void compressed_matrix_gemm_stage_2(
+ const IndexT * A_row_indices,
+ const IndexT * A_col_indices,
+ IndexT A_size1,
+ const IndexT * B_row_indices,
+ const IndexT * B_col_indices,
+ IndexT B_size2,
+ IndexT * C_row_indices,
+ unsigned int *subwarpsize_array,
+ unsigned int *max_row_size_A,
+ unsigned int *max_row_size_B,
+ unsigned int *scratchpad_offsets,
+ unsigned int *scratchpad_indices)
+{
+ unsigned int subwarpsize = subwarpsize_array[blockIdx.x];
+
+ unsigned int num_warps = blockDim.x / subwarpsize;
+ unsigned int warp_id = threadIdx.x / subwarpsize;
+ unsigned int id_in_warp = threadIdx.x % subwarpsize;
+
+ unsigned int scratchpad_rowlength = max_row_size_B[blockIdx.x] * subwarpsize;
+ unsigned int scratchpad_rows_per_warp = max_row_size_A[blockIdx.x] / subwarpsize + 1;
+ unsigned int *subwarp_scratchpad_start = scratchpad_indices + scratchpad_offsets[blockIdx.x] + warp_id * scratchpad_rows_per_warp * scratchpad_rowlength;
+
+ unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1;
+ unsigned int row_per_group_end = min(A_size1, rows_per_group * (blockIdx.x + 1));
+
+ for (unsigned int row = rows_per_group * blockIdx.x + warp_id; row < row_per_group_end; row += num_warps)
+ {
+ unsigned int row_A_start = A_row_indices[row];
+ unsigned int row_A_end = A_row_indices[row+1];
+
+ if (row_A_end - row_A_start > subwarpsize)
+ {
+ unsigned int final_merge_start = 0;
+ unsigned int final_merge_end = 0;
+
+ // merge to temporary scratchpad memory:
+ unsigned int *subwarp_scratchpad = subwarp_scratchpad_start;
+ unsigned int iter = 0;
+ while (row_A_end > row_A_start)
+ {
+ unsigned int my_row_B = row_A_start + id_in_warp;
+ unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0;
+ unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index) : 0;
+ unsigned int row_B_end = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0;
+
+ unsigned int nnz_in_merge = merge_subwarp_symbolic_double(row_B_start, row_B_end, B_col_indices, B_size2,
+ subwarp_scratchpad, id_in_warp, subwarpsize);
+
+ final_merge_start = (iter == id_in_warp) ? subwarp_scratchpad - scratchpad_indices : final_merge_start;
+ final_merge_end = (iter == id_in_warp) ? final_merge_start + nnz_in_merge : final_merge_end;
+ ++iter;
+
+ row_A_start += subwarpsize;
+ subwarp_scratchpad += scratchpad_rowlength; // write to next row in scratchpad
+ }
+
+ // final merge:
+ unsigned int num_nnz = merge_subwarp_symbolic(final_merge_start, final_merge_end, scratchpad_indices, B_size2, subwarpsize);
+
+ if (id_in_warp == 0)
+ C_row_indices[row] = num_nnz;
+ }
+ else
+ {
+ // single merge
+ unsigned int my_row_B = row_A_start + id_in_warp;
+ unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0;
+ unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index) : 0;
+ unsigned int row_B_end = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0;
+
+ unsigned int num_nnz = merge_subwarp_symbolic(row_B_start, row_B_end, B_col_indices, B_size2, subwarpsize);
+
+ if (id_in_warp == 0)
+ C_row_indices[row] = num_nnz;
+ }
+ }
+
+}
+
+
+//
+// Stage 3: Fill C with values
+//
+template<typename NumericT>
+__device__ unsigned int merge_subwarp_numeric(NumericT scaling_factor,
+ unsigned int input_start, unsigned int input_end, const unsigned int *input_indices, const NumericT *input_values, unsigned int invalid_token,
+ unsigned int *output_indices, NumericT *output_values,
+ unsigned int id_in_warp, unsigned int subwarpsize)
+{
+ unsigned int current_front_index = (input_start < input_end) ? load_and_cache(input_indices + input_start) : invalid_token;
+ NumericT current_front_value = (input_start < input_end) ? load_and_cache(input_values + input_start) : 0;
+
+ unsigned int index_buffer = 0;
+ NumericT value_buffer = 0;
+ unsigned int buffer_size = 0;
+ unsigned int nnz_written = 0;
+ while (1)
+ {
+ // determine current minimum:
+ unsigned int min_index = current_front_index;
+ for (unsigned int i = subwarpsize/2; i >= 1; i /= 2)
+ min_index = min(min_index, __shfl_xor((int)min_index, (int)i));
+
+ if (min_index == invalid_token) // done
+ break;
+
+ // compute entry in C:
+ NumericT output_value = (current_front_index == min_index) ? scaling_factor * current_front_value : 0;
+ for (unsigned int i = subwarpsize/2; i >= 1; i /= 2)
+ output_value += __shfl_xor((int)output_value, (int)i);
+
+ // update front:
+ if (current_front_index == min_index)
+ {
+ ++input_start;
+ current_front_index = (input_start < input_end) ? load_and_cache(input_indices + input_start) : invalid_token;
+ current_front_value = (input_start < input_end) ? load_and_cache(input_values + input_start) : 0;
+ }
+
+ // write current front to register buffer:
+ index_buffer = (id_in_warp == buffer_size) ? min_index : index_buffer;
+ value_buffer = (id_in_warp == buffer_size) ? output_value : value_buffer;
+ ++buffer_size;
+
+ // flush register buffer via a coalesced write once full:
+ if (buffer_size == subwarpsize)
+ {
+ output_indices[id_in_warp] = index_buffer; output_indices += subwarpsize;
+ output_values[id_in_warp] = value_buffer; output_values += subwarpsize;
+ buffer_size = 0;
+ }
+
+ ++nnz_written;
+ }
+
+ // write remaining entries in register buffer to C:
+ if (id_in_warp < buffer_size)
+ {
+ output_indices[id_in_warp] = index_buffer;
+ output_values[id_in_warp] = value_buffer;
+ }
+
+ return nnz_written;
+}
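
A small worked example of the numeric merge (values chosen for illustration): suppose row i of A holds a_{i,1} = 2 and a_{i,3} = 5, row 1 of B holds the entries (col 0: 1.0, col 2: 3.0) and row 3 of B holds (col 2: 4.0, col 5: 1.0). The symbolic stage reports the pattern {0, 2, 5}; the numeric merge then produces C(i,0) = 2*1.0 = 2.0, C(i,2) = 2*3.0 + 5*4.0 = 26.0 and C(i,5) = 5*1.0 = 5.0, with the contributions to the shared column 2 accumulated by the subwarp-wide reduction above.
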
+
+template<typename IndexT, typename NumericT>
+__global__ void compressed_matrix_gemm_stage_3(
+ const IndexT * A_row_indices,
+ const IndexT * A_col_indices,
+ const NumericT * A_elements,
+ IndexT A_size1,
+ const IndexT * B_row_indices,
+ const IndexT * B_col_indices,
+ const NumericT * B_elements,
+ IndexT B_size2,
+ IndexT const * C_row_indices,
+ IndexT * C_col_indices,
+ NumericT * C_elements,
+ unsigned int *subwarpsize_array,
+ unsigned int *max_row_size_A,
+ unsigned int *max_row_size_B,
+ unsigned int *scratchpad_offsets,
+ unsigned int *scratchpad_indices,
+ NumericT *scratchpad_values)
+{
+ unsigned int subwarpsize = subwarpsize_array[blockIdx.x];
+
+ unsigned int num_warps = blockDim.x / subwarpsize;
+ unsigned int warp_id = threadIdx.x / subwarpsize;
+ unsigned int id_in_warp = threadIdx.x % subwarpsize;
+
+ unsigned int scratchpad_rowlength = max_row_size_B[blockIdx.x] * subwarpsize;
+ unsigned int scratchpad_rows_per_warp = max_row_size_A[blockIdx.x] / subwarpsize + 1;
+ unsigned int subwarp_scratchpad_shift = scratchpad_offsets[blockIdx.x] + warp_id * scratchpad_rows_per_warp * scratchpad_rowlength;
+
+ unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1;
+ unsigned int row_per_group_end = min(A_size1, rows_per_group * (blockIdx.x + 1));
+
+ for (unsigned int row = rows_per_group * blockIdx.x + warp_id; row < row_per_group_end; row += num_warps)
+ {
+ unsigned int row_A_start = A_row_indices[row];
+ unsigned int row_A_end = A_row_indices[row+1];
+
+ if (row_A_end - row_A_start > subwarpsize)
+ {
+ // first merge stage:
+ unsigned int final_merge_start = 0;
+ unsigned int final_merge_end = 0;
+ unsigned int iter = 0;
+ unsigned int *scratchpad_indices_ptr = scratchpad_indices + subwarp_scratchpad_shift;
+ NumericT *scratchpad_values_ptr = scratchpad_values + subwarp_scratchpad_shift;
+
+ while (row_A_start < row_A_end)
+ {
+ unsigned int my_row_B = row_A_start + id_in_warp;
+ unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0;
+ unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index) : 0;
+ unsigned int row_B_end = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0;
+ NumericT val_A = (my_row_B < row_A_end) ? A_elements[my_row_B] : 0;
+
+ unsigned int nnz_written = merge_subwarp_numeric(val_A,
+ row_B_start, row_B_end, B_col_indices, B_elements, B_size2,
+ scratchpad_indices_ptr, scratchpad_values_ptr,
+ id_in_warp, subwarpsize);
+
+ if (iter == id_in_warp)
+ {
+ final_merge_start = scratchpad_indices_ptr - scratchpad_indices;
+ final_merge_end = final_merge_start + nnz_written;
+ }
+ ++iter;
+
+ row_A_start += subwarpsize;
+ scratchpad_indices_ptr += scratchpad_rowlength;
+ scratchpad_values_ptr += scratchpad_rowlength;
+ }
+
+ // second merge stage:
+ unsigned int index_in_C = C_row_indices[row];
+ merge_subwarp_numeric(NumericT(1),
+ final_merge_start, final_merge_end, scratchpad_indices, scratchpad_values, B_size2,
+ C_col_indices + index_in_C, C_elements + index_in_C,
+ id_in_warp, subwarpsize);
+ }
+ else
+ {
+ unsigned int my_row_B = row_A_start + id_in_warp;
+ unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0;
+ unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index) : 0;
+ unsigned int row_B_end = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0;
+ NumericT val_A = (my_row_B < row_A_end) ? A_elements[my_row_B] : 0;
+
+ unsigned int index_in_C = C_row_indices[row];
+
+ merge_subwarp_numeric(val_A,
+ row_B_start, row_B_end, B_col_indices, B_elements, B_size2,
+ C_col_indices + index_in_C, C_elements + index_in_C,
+ id_in_warp, subwarpsize);
+ }
+ }
+
+}
+
+
+
+
+//
+// Decomposition kernels:
+//
+template<typename IndexT>
+__global__ void compressed_matrix_gemm_decompose_1(
+ const IndexT * A_row_indices,
+ IndexT A_size1,
+ IndexT max_per_row,
+ IndexT *chunks_per_row)
+{
+ for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_size1; i += blockDim.x * gridDim.x)
+ {
+ IndexT num_entries = A_row_indices[i+1] - A_row_indices[i];
+ chunks_per_row[i] = (num_entries < max_per_row) ? 1 : ((num_entries - 1)/ max_per_row + 1);
+ }
+}
+
+
+template<typename IndexT, typename NumericT>
+__global__ void compressed_matrix_gemm_A2(
+ IndexT * A2_row_indices,
+ IndexT * A2_col_indices,
+ NumericT * A2_elements,
+ IndexT A2_size1,
+ IndexT *new_row_buffer)
+{
+ for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A2_size1; i += blockDim.x * gridDim.x)
+ {
+ unsigned int index_start = new_row_buffer[i];
+ unsigned int index_stop = new_row_buffer[i+1];
+
+ A2_row_indices[i] = index_start;
+
+ for (IndexT j = index_start; j < index_stop; ++j)
+ {
+ A2_col_indices[j] = j;
+ A2_elements[j] = NumericT(1);
+ }
+ }
+
+ // write last entry in row_buffer with global thread 0:
+ if (threadIdx.x == 0 && blockIdx.x == 0)
+ A2_row_indices[A2_size1] = new_row_buffer[A2_size1];
+}
+
+template<typename IndexT, typename NumericT>
+__global__ void compressed_matrix_gemm_G1(
+ IndexT * G1_row_indices,
+ IndexT * G1_col_indices,
+ NumericT * G1_elements,
+ IndexT G1_size1,
+ IndexT const *A_row_indices,
+ IndexT const *A_col_indices,
+ NumericT const *A_elements,
+ IndexT A_size1,
+ IndexT A_nnz,
+ IndexT max_per_row,
+ IndexT *new_row_buffer)
+{
+ // Part 1: Copy column indices and entries:
+ for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_nnz; i += blockDim.x * gridDim.x)
+ {
+ G1_col_indices[i] = A_col_indices[i];
+ G1_elements[i] = A_elements[i];
+ }
+
+ // Part 2: Derive new row indices:
+ for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_size1; i += blockDim.x * gridDim.x)
+ {
+ unsigned int old_start = A_row_indices[i];
+ unsigned int new_start = new_row_buffer[i];
+ unsigned int row_chunks = new_row_buffer[i+1] - new_start;
+
+ for (IndexT j=0; j<row_chunks; ++j)
+ G1_row_indices[new_start + j] = old_start + j * max_per_row;
+ }
+
+ // write last entry in row_buffer with global thread 0:
+ if (threadIdx.x == 0 && blockIdx.x == 0)
+ G1_row_indices[G1_size1] = A_row_indices[A_size1];
+}
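
The decomposition kernels above implement the row splitting used when a row of A is too long for a single merge pass: A is factored as A = A2 * G1, where G1 carries the original nonzeros of A but each long row is cut into chunks of at most max_per_row entries (each chunk becoming a row of its own), and A2 is a selector matrix of ones that sums the chunk rows back together. For example, with max_per_row = 2, a row of A holding 5 nonzeros turns into ceil(5/2) = 3 consecutive rows of G1, and the matching row of A2 holds three ones selecting exactly those rows, so the product A2 * G1 reproduces the original row while every row of either factor stays short enough for a single merge pass.
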
+
+
+
+/** @brief Carries out sparse_matrix-sparse_matrix multiplication for CSR matrices
+*
+* Implementation of the convenience expression C = prod(A, B);
+* Based on computing C(i, :) = A(i, :) * B via merging the respective rows of B
+*
+* @param A Left factor
+* @param B Right factor
+* @param C Result matrix
+*/
+template<class NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
+ viennacl::compressed_matrix<NumericT, AlignmentV> const & B,
+ viennacl::compressed_matrix<NumericT, AlignmentV> & C)
+{
+ C.resize(A.size1(), B.size2(), false);
+
+ unsigned int blocknum = 256;
+ unsigned int threadnum = 128;
+
+ viennacl::vector<unsigned int> subwarp_sizes(blocknum, viennacl::traits::context(A)); // subwarp size to be used within each work group
+ viennacl::vector<unsigned int> max_nnz_row_A(blocknum, viennacl::traits::context(A)); // maximum number of nonzeros per row of A encountered in each work group
+ viennacl::vector<unsigned int> max_nnz_row_B(blocknum, viennacl::traits::context(A)); // maximum number of nonzeros per row of B encountered in each work group
+
+#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
+ viennacl::tools::timer timer;
+#endif
+
+ //
+ // Stage 1: Determine upper bound for number of nonzeros
+ //
+#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
+ cudaDeviceSynchronize();
+ timer.start();
+#endif
+
+ compressed_matrix_gemm_stage_1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ static_cast<unsigned int>(A.size1()),
+ viennacl::cuda_arg<unsigned int>(B.handle1()),
+ viennacl::cuda_arg(subwarp_sizes),
+ viennacl::cuda_arg(max_nnz_row_A),
+ viennacl::cuda_arg(max_nnz_row_B)
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_1");
+#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
+ cudaDeviceSynchronize();
+ std::cout << "Stage 1 device: " << timer.get() << std::endl;
+ timer.start();
+#endif
+
+ subwarp_sizes.switch_memory_context(viennacl::context(MAIN_MEMORY));
+ unsigned int * subwarp_sizes_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(subwarp_sizes.handle());
+
+ max_nnz_row_A.switch_memory_context(viennacl::context(MAIN_MEMORY));
+ unsigned int const * max_nnz_row_A_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(max_nnz_row_A.handle());
+
+ max_nnz_row_B.switch_memory_context(viennacl::context(MAIN_MEMORY));
+ unsigned int const * max_nnz_row_B_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(max_nnz_row_B.handle());
+
+ //std::cout << "Subwarp sizes: " << subwarp_sizes << std::endl;
+
+ viennacl::vector<unsigned int> scratchpad_offsets(blocknum, viennacl::context(MAIN_MEMORY)); // start offset of the scratchpad region assigned to each work group
+ unsigned int * scratchpad_offsets_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(scratchpad_offsets.handle());
+
+ unsigned int max_subwarp_size = 0;
+ unsigned int A_max_nnz_per_row = 0;
+ unsigned int scratchpad_offset = 0;
+ //std::cout << "Scratchpad offsets: " << std::endl;
+ for (std::size_t i=0; i<subwarp_sizes.size(); ++i)
+ {
+ max_subwarp_size = std::max(max_subwarp_size, subwarp_sizes_ptr[i]);
+ A_max_nnz_per_row = std::max(A_max_nnz_per_row, max_nnz_row_A_ptr[i]);
+
+ scratchpad_offsets_ptr[i] = scratchpad_offset;
+ //std::cout << scratchpad_offset << " (with " << (max_nnz_row_A_ptr[i] / subwarp_sizes_ptr[i] + 1) << " warp reloads per group at " << max_nnz_row_A_ptr[i] << " max rows, "
+ // << upper_bound_nonzeros_per_row_C_ptr[i] << " row length, "
+ // << (256 / subwarp_sizes_ptr[i]) << " warps per group " << std::endl;
+ unsigned int max_warp_reloads = max_nnz_row_A_ptr[i] / subwarp_sizes_ptr[i] + 1;
+ unsigned int max_row_length_after_warp_merge = subwarp_sizes_ptr[i] * max_nnz_row_B_ptr[i];
+ unsigned int warps_in_group = threadnum / subwarp_sizes_ptr[i];
+ scratchpad_offset += max_warp_reloads
+ * max_row_length_after_warp_merge
+ * warps_in_group;
+ }
+ //std::cout << "Scratchpad memory for indices: " << scratchpad_offset << " entries (" << scratchpad_offset * sizeof(unsigned int) * 1e-6 << " MB)" << std::endl;
+
+ if (max_subwarp_size > 32)
+ {
+ // determine augmented size:
+ unsigned int max_entries_in_G = 1024;
+ if (A_max_nnz_per_row <= 512*512)
+ max_entries_in_G = 512;
+ if (A_max_nnz_per_row <= 256*256)
+ max_entries_in_G = 256;
+ if (A_max_nnz_per_row <= 128*128)
+ max_entries_in_G = 128;
+ if (A_max_nnz_per_row <= 64*64)
+ max_entries_in_G = 64;
+
+ viennacl::vector<unsigned int> exclusive_scan_helper(A.size1() + 1, viennacl::traits::context(A));
+ compressed_matrix_gemm_decompose_1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+ static_cast<unsigned int>(A.size1()),
+ static_cast<unsigned int>(max_entries_in_G),
+ viennacl::cuda_arg(exclusive_scan_helper)
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_decompose_1");
+
+ thrust::exclusive_scan(thrust::device_ptr<unsigned int>(viennacl::cuda_arg(exclusive_scan_helper)),
+ thrust::device_ptr<unsigned int>(viennacl::cuda_arg(exclusive_scan_helper) + exclusive_scan_helper.size()),
+ thrust::device_ptr<unsigned int>(viennacl::cuda_arg(exclusive_scan_helper)));
+
+ unsigned int augmented_size = exclusive_scan_helper[A.size1()];
+
+ // split A = A2 * G1
+ viennacl::compressed_matrix<NumericT, AlignmentV> A2(A.size1(), augmented_size, augmented_size, viennacl::traits::context(A));
+ viennacl::compressed_matrix<NumericT, AlignmentV> G1(augmented_size, A.size2(), A.nnz(), viennacl::traits::context(A));
+
+ // fill A2:
+ compressed_matrix_gemm_A2<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A2.handle1()),
+ viennacl::cuda_arg<unsigned int>(A2.handle2()),
+ viennacl::cuda_arg<NumericT>(A2.handle()),
+ static_cast<unsigned int>(A2.size1()),
+ viennacl::cuda_arg(exclusive_scan_helper)
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_A2");
+
+ // fill G1:
+ compressed_matrix_gemm_G1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(G1.handle1()),
+ viennacl::cuda_arg<unsigned int>(G1.handle2()),
+ viennacl::cuda_arg<NumericT>(G1.handle()),
+ static_cast<unsigned int>(G1.size1()),
+ viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ viennacl::cuda_arg<NumericT>(A.handle()),
+ static_cast<unsigned int>(A.size1()),
+ static_cast<unsigned int>(A.nnz()),
+ static_cast<unsigned int>(max_entries_in_G),
+ viennacl::cuda_arg(exclusive_scan_helper)
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_G1");
+
+ // compute tmp = G1 * B;
+ // C = A2 * tmp;
+ viennacl::compressed_matrix<NumericT, AlignmentV> tmp(G1.size1(), B.size2(), 0, viennacl::traits::context(A));
+ prod_impl(G1, B, tmp); // this runs a standard RMerge without decomposition of G1
+ prod_impl(A2, tmp, C); // this may split A2 again
+ return;
+ }
+
+ subwarp_sizes.switch_memory_context(viennacl::traits::context(A));
+ max_nnz_row_A.switch_memory_context(viennacl::traits::context(A));
+ max_nnz_row_B.switch_memory_context(viennacl::traits::context(A));
+ scratchpad_offsets.switch_memory_context(viennacl::traits::context(A));
+
+ viennacl::vector<unsigned int> scratchpad_indices(scratchpad_offset, viennacl::traits::context(A)); // scratchpad holding the column indices of intermediate merge results
+
+#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
+ std::cout << "Intermediate host stage: " << timer.get() << std::endl;
+ timer.start();
+#endif
+
+ //
+ // Stage 2: Determine pattern of C
+ //
+
+ compressed_matrix_gemm_stage_2<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ static_cast<unsigned int>(A.size1()),
+ viennacl::cuda_arg<unsigned int>(B.handle1()),
+ viennacl::cuda_arg<unsigned int>(B.handle2()),
+ static_cast<unsigned int>(B.size2()),
+ viennacl::cuda_arg<unsigned int>(C.handle1()),
+ viennacl::cuda_arg(subwarp_sizes),
+ viennacl::cuda_arg(max_nnz_row_A),
+ viennacl::cuda_arg(max_nnz_row_B),
+ viennacl::cuda_arg(scratchpad_offsets),
+ viennacl::cuda_arg(scratchpad_indices)
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_2");
+#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
+ cudaDeviceSynchronize();
+ std::cout << "Stage 2: " << timer.get() << std::endl;
+ timer.start();
+#endif
+
+
+ // exclusive scan on C.handle1(): converts the per-row nonzero counts into row offsets and yields the total amount of memory to reserve for C
+ viennacl::backend::typesafe_host_array<unsigned int> row_buffer(C.handle1(), C.size1() + 1);
+ viennacl::backend::memory_read(C.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
+ unsigned int current_offset = 0;
+ for (std::size_t i=0; i<C.size1(); ++i)
+ {
+ unsigned int tmp = row_buffer[i];
+ row_buffer.set(i, current_offset);
+ current_offset += tmp;
+ }
+ row_buffer.set(C.size1(), current_offset);
+ viennacl::backend::memory_write(C.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
+
+
+ //
+ // Stage 3: Compute entries in C
+ //
+ C.reserve(current_offset, false);
+
+ viennacl::vector<NumericT> scratchpad_values(scratchpad_offset, viennacl::traits::context(A)); // scratchpad holding the values of intermediate merge results
+
+#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
+ std::cout << "Intermediate stage 2->3: " << timer.get() << std::endl;
+ timer.start();
+#endif
+
+ compressed_matrix_gemm_stage_3<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ viennacl::cuda_arg<NumericT>(A.handle()),
+ static_cast<unsigned int>(A.size1()),
+ viennacl::cuda_arg<unsigned int>(B.handle1()),
+ viennacl::cuda_arg<unsigned int>(B.handle2()),
+ viennacl::cuda_arg<NumericT>(B.handle()),
+ static_cast<unsigned int>(B.size2()),
+ viennacl::cuda_arg<unsigned int>(C.handle1()),
+ viennacl::cuda_arg<unsigned int>(C.handle2()),
+ viennacl::cuda_arg<NumericT>(C.handle()),
+ viennacl::cuda_arg(subwarp_sizes),
+ viennacl::cuda_arg(max_nnz_row_A),
+ viennacl::cuda_arg(max_nnz_row_B),
+ viennacl::cuda_arg(scratchpad_offsets),
+ viennacl::cuda_arg(scratchpad_indices),
+ viennacl::cuda_arg(scratchpad_values)
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_3");
+#ifdef VIENNACL_WITH_SPGEMM_CUDA_TIMINGS
+ cudaDeviceSynchronize();
+ std::cout << "Stage 3: " << timer.get() << std::endl;
+ std::cout << "----------" << std::endl;
+#endif
+
+}
+
+} // namespace cuda
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
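
End-to-end, this backend is reached through ViennaCL's generic sparse product expression; a minimal usage sketch (assuming a CUDA-enabled build; filling of the input matrices is omitted):

    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/linalg/prod.hpp"

    void spgemm_example()
    {
      viennacl::compressed_matrix<double> A(1000, 1000), B(1000, 1000), C;
      // ... fill A and B, e.g. via viennacl::copy() from host-side data ...
      C = viennacl::linalg::prod(A, B);   // dispatches to the prod_impl() above when the CUDA backend is active
    }
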
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/spgemm_vector.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/spgemm_vector.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/spgemm_vector.hpp
new file mode 100644
index 0000000..56e3c14
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/spgemm_vector.hpp
@@ -0,0 +1,705 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_SPGEMM_VECTOR_HPP_
+#define VIENNACL_LINALG_HOST_BASED_SPGEMM_VECTOR_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/spgemm_vector.hpp
+ @brief Row-merge routines for sparse matrix-sparse matrix multiplication (SpGEMM) on the CPU, optionally accelerated with AVX2.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/linalg/host_based/common.hpp"
+
+
+#ifdef VIENNACL_WITH_AVX2
+#include "immintrin.h"
+#endif
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+
+
+
+#ifdef VIENNACL_WITH_AVX2
+inline
+unsigned int row_C_scan_symbolic_vector_AVX2(int const *row_indices_B_begin, int const *row_indices_B_end,
+ int const *B_row_buffer, int const *B_col_buffer, int B_size2,
+ int *row_C_vector_output)
+{
+ __m256i avx_all_ones = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
+ __m256i avx_all_bsize2 = _mm256_set_epi32(B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2);
+
+ __m256i avx_row_indices_offsets = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+ __m256i avx_load_mask = _mm256_sub_epi32(avx_row_indices_offsets, _mm256_set1_epi32(row_indices_B_end - row_indices_B_begin));
+ __m256i avx_load_mask2 = avx_load_mask;
+
+ __m256i avx_row_indices = _mm256_set1_epi32(0);
+ avx_row_indices = _mm256_mask_i32gather_epi32(avx_row_indices, row_indices_B_begin, avx_row_indices_offsets, avx_load_mask, 4);
+ avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+ __m256i avx_row_start = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer, avx_row_indices, avx_load_mask, 4);
+ avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+ __m256i avx_row_end = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer+1, avx_row_indices, avx_load_mask, 4);
+
+ avx_load_mask = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
+ __m256i avx_index_front = avx_all_bsize2;
+ avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);
+
+ int *output_ptr = row_C_vector_output;
+
+ while (1)
+ {
+ // get minimum index in current front:
+ __m256i avx_index_min1 = avx_index_front;
+ __m256i avx_temp = _mm256_permutevar8x32_epi32(avx_index_min1, _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4));
+ avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first four elements compared against last four elements
+
+ avx_temp = _mm256_shuffle_epi32(avx_index_min1, int(78)); // 0b01001110 = 78, using shuffle instead of permutevar here because of lower latency
+ avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first two elements compared against elements three and four (same for upper half of register)
+
+ avx_temp = _mm256_shuffle_epi32(avx_index_min1, int(177)); // 0b10110001 = 177, using shuffle instead of permutevar here because of lower latency
+ avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // now all entries of avx_index_min1 hold the minimum
+
+ int min_index_in_front = ((int*)&avx_index_min1)[0];
+ // check for end of merge operation:
+ if (min_index_in_front == B_size2)
+ break;
+
+ // write current entry:
+ *output_ptr = min_index_in_front;
+ ++output_ptr;
+
+ // advance index front where equal to minimum index:
+ avx_load_mask = _mm256_cmpeq_epi32(avx_index_front, avx_index_min1);
+ // first part: set index to B_size2 if equal to minimum index:
+ avx_temp = _mm256_and_si256(avx_all_bsize2, avx_load_mask);
+ avx_index_front = _mm256_max_epi32(avx_index_front, avx_temp);
+ // second part: increment row_start registers where minimum found:
+ avx_temp = _mm256_and_si256(avx_all_ones, avx_load_mask); //ones only where the minimum was found
+ avx_row_start = _mm256_add_epi32(avx_row_start, avx_temp);
+ // third part: load new data where more entries are available:
+ avx_load_mask = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
+ avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);
+ }
+
+ return static_cast<unsigned int>(output_ptr - row_C_vector_output);
+}
+#endif
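
The three permute/shuffle plus minimum steps in the AVX2 routine above form a logarithmic reduction of the eight-lane index front. As a worked illustration (lane values chosen arbitrarily): starting from the front (9, 4, 7, 2, 8, 5, 1, 6), swapping the 128-bit halves and taking the element-wise minimum gives (8, 4, 1, 2, 8, 4, 1, 2); swapping the element pairs within each half and taking the minimum gives (1, 2, 1, 2, 1, 2, 1, 2); the final swap of adjacent elements gives (1, 1, 1, 1, 1, 1, 1, 1), so every lane ends up holding the global minimum, 1.
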
+
+/** @brief Merges up to IndexNum rows from B into the result buffer.
+*
+* Because the input buffer also needs to be considered, this routine actually works on an index front of length (IndexNum+1)
+**/
+template<unsigned int IndexNum>
+unsigned int row_C_scan_symbolic_vector_N(unsigned int const *row_indices_B,
+ unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, unsigned int B_size2,
+ unsigned int const *row_C_vector_input, unsigned int const *row_C_vector_input_end,
+ unsigned int *row_C_vector_output)
+{
+ unsigned int index_front[IndexNum+1];
+ unsigned int const *index_front_start[IndexNum+1];
+ unsigned int const *index_front_end[IndexNum+1];
+
+ // Set up pointers for loading the indices:
+ for (unsigned int i=0; i<IndexNum; ++i, ++row_indices_B)
+ {
+ index_front_start[i] = B_col_buffer + B_row_buffer[*row_indices_B];
+ index_front_end[i] = B_col_buffer + B_row_buffer[*row_indices_B + 1];
+ }
+ index_front_start[IndexNum] = row_C_vector_input;
+ index_front_end[IndexNum] = row_C_vector_input_end;
+
+ // load indices:
+ for (unsigned int i=0; i<=IndexNum; ++i)
+ index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;
+
+ unsigned int *output_ptr = row_C_vector_output;
+
+ while (1)
+ {
+ // get minimum index in current front:
+ unsigned int min_index_in_front = B_size2;
+ for (unsigned int i=0; i<=IndexNum; ++i)
+ min_index_in_front = std::min(min_index_in_front, index_front[i]);
+
+ if (min_index_in_front == B_size2) // we're done
+ break;
+
+ // advance index front where equal to minimum index:
+ for (unsigned int i=0; i<=IndexNum; ++i)
+ {
+ if (index_front[i] == min_index_in_front)
+ {
+ index_front_start[i] += 1;
+ index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;
+ }
+ }
+
+ // write current entry:
+ *output_ptr = min_index_in_front;
+ ++output_ptr;
+ }
+
+ return static_cast<unsigned int>(output_ptr - row_C_vector_output);
+}
+
+struct spgemm_output_write_enabled { static void apply(unsigned int *ptr, unsigned int value) { *ptr = value; } };
+struct spgemm_output_write_disabled { static void apply(unsigned int * , unsigned int ) { } };
+
+template<typename OutputWriterT>
+unsigned int row_C_scan_symbolic_vector_1(unsigned int const *input1_begin, unsigned int const *input1_end,
+ unsigned int const *input2_begin, unsigned int const *input2_end,
+ unsigned int termination_index,
+ unsigned int *output_begin)
+{
+ unsigned int *output_ptr = output_begin;
+
+ unsigned int val_1 = (input1_begin < input1_end) ? *input1_begin : termination_index;
+ unsigned int val_2 = (input2_begin < input2_end) ? *input2_begin : termination_index;
+ while (1)
+ {
+ unsigned int min_index = std::min(val_1, val_2);
+
+ if (min_index == termination_index)
+ break;
+
+ if (min_index == val_1)
+ {
+ ++input1_begin;
+ val_1 = (input1_begin < input1_end) ? *input1_begin : termination_index;
+ }
+
+ if (min_index == val_2)
+ {
+ ++input2_begin;
+ val_2 = (input2_begin < input2_end) ? *input2_begin : termination_index;
+ }
+
+ // write current entry:
+ OutputWriterT::apply(output_ptr, min_index); // *output_ptr = min_index; if necessary
+ ++output_ptr;
+ }
+
+ return static_cast<unsigned int>(output_ptr - output_begin);
+}
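
The two writer policies above let the same merge routine either count the length of the merged pattern or materialize it. A small usage sketch (array contents are illustrative; the scratch buffer must be large enough for the union of both ranges):

    inline void merge_two_rows_example()
    {
      unsigned int row1[3] = {0, 2, 5};
      unsigned int row2[4] = {2, 3, 5, 7};
      unsigned int scratch[7];
      unsigned int terminator = 100;   // plays the role of B_size2: larger than any valid column index

      // count only: the union {0, 2, 3, 5, 7} has 5 entries, nothing is stored
      unsigned int len = row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(row1, row1 + 3,
                                                                                    row2, row2 + 4,
                                                                                    terminator, scratch);
      // materialize the merged pattern into scratch[0..len-1]
      row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(row1, row1 + 3,
                                                                row2, row2 + 4,
                                                                terminator, scratch);
      (void)len;
    }
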
+
+inline
+unsigned int row_C_scan_symbolic_vector(unsigned int row_start_A, unsigned int row_end_A, unsigned int const *A_col_buffer,
+ unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, unsigned int B_size2,
+ unsigned int *row_C_vector_1, unsigned int *row_C_vector_2, unsigned int *row_C_vector_3)
+{
+ // Trivial case: row length 0:
+ if (row_start_A == row_end_A)
+ return 0;
+
+ // Trivial case: row length 1:
+ if (row_end_A - row_start_A == 1)
+ {
+ unsigned int A_col = A_col_buffer[row_start_A];
+ return B_row_buffer[A_col + 1] - B_row_buffer[A_col];
+ }
+
+ // Optimizations for row length 2:
+ unsigned int row_C_len = 0;
+ if (row_end_A - row_start_A == 2)
+ {
+ unsigned int A_col_1 = A_col_buffer[row_start_A];
+ unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
+ return row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(B_col_buffer + B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
+ B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
+ B_size2,
+ row_C_vector_1);
+ }
+ else // for more than two rows we can safely merge the first two:
+ {
+#ifdef VIENNACL_WITH_AVX2
+ row_C_len = row_C_scan_symbolic_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A),
+ (const int*)B_row_buffer, (const int*)B_col_buffer, int(B_size2),
+ (int*)row_C_vector_1);
+ row_start_A += 8;
+#else
+ unsigned int A_col_1 = A_col_buffer[row_start_A];
+ unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
+ row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
+ B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
+ B_size2,
+ row_C_vector_1);
+ row_start_A += 2;
+#endif
+ }
+
+ // all other row lengths:
+ while (row_end_A > row_start_A)
+ {
+#ifdef VIENNACL_WITH_AVX2
+ if (row_end_A - row_start_A > 2) // we deal with one or two remaining rows more efficiently below:
+ {
+ unsigned int merged_len = row_C_scan_symbolic_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A),
+ (const int*)B_row_buffer, (const int*)B_col_buffer, int(B_size2),
+ (int*)row_C_vector_3);
+ if (row_start_A + 8 >= row_end_A)
+ row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(row_C_vector_3, row_C_vector_3 + merged_len,
+ row_C_vector_1, row_C_vector_1 + row_C_len,
+ B_size2,
+ row_C_vector_2);
+ else
+ row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(row_C_vector_3, row_C_vector_3 + merged_len,
+ row_C_vector_1, row_C_vector_1 + row_C_len,
+ B_size2,
+ row_C_vector_2);
+ row_start_A += 8;
+ }
+ else
+#endif
+ if (row_start_A == row_end_A - 1) // last merge operation. No need to write output
+ {
+ // process last row
+ unsigned int row_index_B = A_col_buffer[row_start_A];
+ return row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(B_col_buffer + B_row_buffer[row_index_B], B_col_buffer + B_row_buffer[row_index_B + 1],
+ row_C_vector_1, row_C_vector_1 + row_C_len,
+ B_size2,
+ row_C_vector_2);
+ }
+    else if (row_start_A + 1 < row_end_A) // at least two more rows left, so merge them
+    {
+      // merge the next two rows of B:
+ unsigned int A_col_1 = A_col_buffer[row_start_A];
+ unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
+ unsigned int merged_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
+ B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
+ B_size2,
+ row_C_vector_3);
+ if (row_start_A + 2 == row_end_A) // last merge does not need a write:
+ return row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(row_C_vector_3, row_C_vector_3 + merged_len,
+ row_C_vector_1, row_C_vector_1 + row_C_len,
+ B_size2,
+ row_C_vector_2);
+ else
+ row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(row_C_vector_3, row_C_vector_3 + merged_len,
+ row_C_vector_1, row_C_vector_1 + row_C_len,
+ B_size2,
+ row_C_vector_2);
+ row_start_A += 2;
+ }
+    else // only a single row remains
+ {
+ // process single row:
+ unsigned int row_index_B = A_col_buffer[row_start_A];
+ row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + B_row_buffer[row_index_B], B_col_buffer + B_row_buffer[row_index_B + 1],
+ row_C_vector_1, row_C_vector_1 + row_C_len,
+ B_size2,
+ row_C_vector_2);
+ ++row_start_A;
+ }
+
+ std::swap(row_C_vector_1, row_C_vector_2);
+ }
+
+ return row_C_len;
+}
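+
+// Illustration (hypothetical CSR data, not taken from the buffers above): for a row of A
+// with column indices {0, 2}, the symbolic phase merges the sorted column lists of B-rows
+// 0 and 2, e.g. {1, 3} and {3, 4}. The merged index set is {1, 3, 4}, so the routine
+// reports a row length of 3 for C without touching any numerical values.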
+
+//////////////////////////////
+
+/** @brief Merges up to IndexNum rows from B into the result buffer.
+*
+* Because the input buffer also needs to be considered, this routine actually works on an index front of length (IndexNum+1)
+**/
+template<unsigned int IndexNum, typename NumericT>
+unsigned int row_C_scan_numeric_vector_N(unsigned int const *row_indices_B, NumericT const *val_A,
+ unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, NumericT const *B_elements, unsigned int B_size2,
+ unsigned int const *row_C_vector_input, unsigned int const *row_C_vector_input_end, NumericT *row_C_vector_input_values,
+ unsigned int *row_C_vector_output, NumericT *row_C_vector_output_values)
+{
+ unsigned int index_front[IndexNum+1];
+ unsigned int const *index_front_start[IndexNum+1];
+ unsigned int const *index_front_end[IndexNum+1];
+ NumericT const * value_front_start[IndexNum+1];
+ NumericT values_A[IndexNum+1];
+
+ // Set up pointers for loading the indices:
+ for (unsigned int i=0; i<IndexNum; ++i, ++row_indices_B)
+ {
+ unsigned int row_B = *row_indices_B;
+
+ index_front_start[i] = B_col_buffer + B_row_buffer[row_B];
+ index_front_end[i] = B_col_buffer + B_row_buffer[row_B + 1];
+ value_front_start[i] = B_elements + B_row_buffer[row_B];
+ values_A[i] = val_A[i];
+ }
+ index_front_start[IndexNum] = row_C_vector_input;
+ index_front_end[IndexNum] = row_C_vector_input_end;
+ value_front_start[IndexNum] = row_C_vector_input_values;
+ values_A[IndexNum] = NumericT(1);
+
+ // load indices:
+ for (unsigned int i=0; i<=IndexNum; ++i)
+ index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;
+
+ unsigned int *output_ptr = row_C_vector_output;
+
+ while (1)
+ {
+ // get minimum index in current front:
+ unsigned int min_index_in_front = B_size2;
+ for (unsigned int i=0; i<=IndexNum; ++i)
+ min_index_in_front = std::min(min_index_in_front, index_front[i]);
+
+ if (min_index_in_front == B_size2) // we're done
+ break;
+
+ // advance index front where equal to minimum index:
+ NumericT row_C_value = 0;
+ for (unsigned int i=0; i<=IndexNum; ++i)
+ {
+ if (index_front[i] == min_index_in_front)
+ {
+ index_front_start[i] += 1;
+ index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;
+
+ row_C_value += values_A[i] * *value_front_start[i];
+ value_front_start[i] += 1;
+ }
+ }
+
+ // write current entry:
+ *output_ptr = min_index_in_front;
+ ++output_ptr;
+ *row_C_vector_output_values = row_C_value;
+ ++row_C_vector_output_values;
+ }
+
+ return static_cast<unsigned int>(output_ptr - row_C_vector_output);
+}
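+
+// Illustration (hypothetical data): with IndexNum = 2 the index front tracks three sorted
+// streams, e.g. B-row {1, 3} weighted by a_1, B-row {3, 4} weighted by a_2, and the partial
+// result {1, 4} with weight 1. Each iteration emits the smallest index in the front
+// (1, then 3, then 4) and accumulates the matching products into the output value buffer.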
+
+
+
+#ifdef VIENNACL_WITH_AVX2
+inline
+unsigned int row_C_scan_numeric_vector_AVX2(int const *row_indices_B_begin, int const *row_indices_B_end, double const *values_A,
+ int const *B_row_buffer, int const *B_col_buffer, double const *B_elements,
+ int B_size2,
+ int *row_C_vector_output, double *row_C_vector_output_values)
+{
+ __m256i avx_all_ones = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
+ __m256i avx_all_bsize2 = _mm256_set_epi32(B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2);
+
+ __m256i avx_row_indices_offsets = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+ __m256i avx_load_mask = _mm256_sub_epi32(avx_row_indices_offsets, _mm256_set1_epi32(row_indices_B_end - row_indices_B_begin));
+ __m256i avx_load_mask2 = avx_load_mask;
+
+ __m256i avx_row_indices = _mm256_set1_epi32(0);
+ avx_row_indices = _mm256_mask_i32gather_epi32(avx_row_indices, row_indices_B_begin, avx_row_indices_offsets, avx_load_mask, 4);
+
+ // load values from A:
+ avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+ __m256d avx_value_A_low = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0), //src
+ values_A, //base ptr
+ _mm256_extractf128_si256(avx_row_indices_offsets, 0), //indices
+ _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4)), 8); // mask
+ avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+ __m256d avx_value_A_high = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0), //src
+ values_A, //base ptr
+ _mm256_extractf128_si256(avx_row_indices_offsets, 1), //indices
+ _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)), 8); // mask
+
+
+ avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+ __m256i avx_row_start = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer, avx_row_indices, avx_load_mask, 4);
+ avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+ __m256i avx_row_end = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer+1, avx_row_indices, avx_load_mask, 4);
+
+ avx_load_mask = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
+ avx_load_mask2 = avx_load_mask;
+ __m256i avx_index_front = avx_all_bsize2;
+ avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);
+
+ // load front values from B:
+ avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+ __m256d avx_value_front_low = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0), //src
+ B_elements, //base ptr
+ _mm256_extractf128_si256(avx_row_start, 0), //indices
+ _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4)), 8); // mask
+ avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+ __m256d avx_value_front_high = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0), //src
+ B_elements, //base ptr
+ _mm256_extractf128_si256(avx_row_start, 1), //indices
+ _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)), 8); // mask
+
+ int *output_ptr = row_C_vector_output;
+
+ while (1)
+ {
+ // get minimum index in current front:
+ __m256i avx_index_min1 = avx_index_front;
+ __m256i avx_temp = _mm256_permutevar8x32_epi32(avx_index_min1, _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4));
+ avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first four elements compared against last four elements
+
+ avx_temp = _mm256_shuffle_epi32(avx_index_min1, int(78)); // 0b01001110 = 78, using shuffle instead of permutevar here because of lower latency
+ avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first two elements compared against elements three and four (same for upper half of register)
+
+ avx_temp = _mm256_shuffle_epi32(avx_index_min1, int(177)); // 0b10110001 = 177, using shuffle instead of permutevar here because of lower latency
+ avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // now all entries of avx_index_min1 hold the minimum
+
+ int min_index_in_front = ((int*)&avx_index_min1)[0];
+ // check for end of merge operation:
+ if (min_index_in_front == B_size2)
+ break;
+
+ // accumulate value (can certainly be done more elegantly...)
+ double value = 0;
+ value += (min_index_in_front == ((int*)&avx_index_front)[0]) ? ((double*)&avx_value_front_low)[0] * ((double*)&avx_value_A_low)[0] : 0;
+ value += (min_index_in_front == ((int*)&avx_index_front)[1]) ? ((double*)&avx_value_front_low)[1] * ((double*)&avx_value_A_low)[1] : 0;
+ value += (min_index_in_front == ((int*)&avx_index_front)[2]) ? ((double*)&avx_value_front_low)[2] * ((double*)&avx_value_A_low)[2] : 0;
+ value += (min_index_in_front == ((int*)&avx_index_front)[3]) ? ((double*)&avx_value_front_low)[3] * ((double*)&avx_value_A_low)[3] : 0;
+ value += (min_index_in_front == ((int*)&avx_index_front)[4]) ? ((double*)&avx_value_front_high)[0] * ((double*)&avx_value_A_high)[0] : 0;
+ value += (min_index_in_front == ((int*)&avx_index_front)[5]) ? ((double*)&avx_value_front_high)[1] * ((double*)&avx_value_A_high)[1] : 0;
+ value += (min_index_in_front == ((int*)&avx_index_front)[6]) ? ((double*)&avx_value_front_high)[2] * ((double*)&avx_value_A_high)[2] : 0;
+ value += (min_index_in_front == ((int*)&avx_index_front)[7]) ? ((double*)&avx_value_front_high)[3] * ((double*)&avx_value_A_high)[3] : 0;
+ *row_C_vector_output_values = value;
+ ++row_C_vector_output_values;
+
+ // write current entry:
+ *output_ptr = min_index_in_front;
+ ++output_ptr;
+
+ // advance index front where equal to minimum index:
+ avx_load_mask = _mm256_cmpeq_epi32(avx_index_front, avx_index_min1);
+ // first part: set index to B_size2 if equal to minimum index:
+ avx_temp = _mm256_and_si256(avx_all_bsize2, avx_load_mask);
+ avx_index_front = _mm256_max_epi32(avx_index_front, avx_temp);
+ // second part: increment row_start registers where minimum found:
+ avx_temp = _mm256_and_si256(avx_all_ones, avx_load_mask); //ones only where the minimum was found
+ avx_row_start = _mm256_add_epi32(avx_row_start, avx_temp);
+    // third part: load new data where more entries are available:
+ avx_load_mask = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
+ avx_load_mask2 = avx_load_mask;
+ avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);
+
+ // load new values where necessary:
+ avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+ avx_value_front_low = _mm256_mask_i32gather_pd(avx_value_front_low, //src
+ B_elements, //base ptr
+ _mm256_extractf128_si256(avx_row_start, 0), //indices
+ _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4)), 8); // mask
+
+ avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+ avx_value_front_high = _mm256_mask_i32gather_pd(avx_value_front_high, //src
+ B_elements, //base ptr
+ _mm256_extractf128_si256(avx_row_start, 1), //indices
+ _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)), 8); // mask
+
+    // (newly loaded front values are multiplied in the accumulation step of the next iteration)
+
+ }
+
+ return static_cast<unsigned int>(output_ptr - row_C_vector_output);
+}
+#endif
+
+
+template<typename NumericT>
+unsigned int row_C_scan_numeric_vector_1(unsigned int const *input1_index_begin, unsigned int const *input1_index_end, NumericT const *input1_values_begin, NumericT factor1,
+ unsigned int const *input2_index_begin, unsigned int const *input2_index_end, NumericT const *input2_values_begin, NumericT factor2,
+ unsigned int termination_index,
+ unsigned int *output_index_begin, NumericT *output_values_begin)
+{
+ unsigned int *output_ptr = output_index_begin;
+
+ unsigned int index1 = (input1_index_begin < input1_index_end) ? *input1_index_begin : termination_index;
+ unsigned int index2 = (input2_index_begin < input2_index_end) ? *input2_index_begin : termination_index;
+
+ while (1)
+ {
+ unsigned int min_index = std::min(index1, index2);
+ NumericT value = 0;
+
+ if (min_index == termination_index)
+ break;
+
+ if (min_index == index1)
+ {
+ ++input1_index_begin;
+ index1 = (input1_index_begin < input1_index_end) ? *input1_index_begin : termination_index;
+
+ value += factor1 * *input1_values_begin;
+ ++input1_values_begin;
+ }
+
+ if (min_index == index2)
+ {
+ ++input2_index_begin;
+ index2 = (input2_index_begin < input2_index_end) ? *input2_index_begin : termination_index;
+
+ value += factor2 * *input2_values_begin;
+ ++input2_values_begin;
+ }
+
+ // write current entry:
+ *output_ptr = min_index;
+ ++output_ptr;
+ *output_values_begin = value;
+ ++output_values_begin;
+ }
+
+ return static_cast<unsigned int>(output_ptr - output_index_begin);
+}
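+
+// Illustration (hypothetical data): merging the compressed row {1: 2.0, 3: 1.0} with
+// factor 0.5 and {3: 4.0, 4: 1.0} with factor 2.0 yields indices {1, 3, 4} and values
+// {1.0, 8.5, 2.0}, i.e. a sparse axpy-style combination of the two inputs.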
+
+template<typename NumericT>
+void row_C_scan_numeric_vector(unsigned int row_start_A, unsigned int row_end_A, unsigned int const *A_col_buffer, NumericT const *A_elements,
+ unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, NumericT const *B_elements, unsigned int B_size2,
+ unsigned int row_start_C, unsigned int row_end_C, unsigned int *C_col_buffer, NumericT *C_elements,
+ unsigned int *row_C_vector_1, NumericT *row_C_vector_1_values,
+ unsigned int *row_C_vector_2, NumericT *row_C_vector_2_values,
+ unsigned int *row_C_vector_3, NumericT *row_C_vector_3_values)
+{
+ (void)row_end_C;
+
+ // Trivial case: row length 0:
+ if (row_start_A == row_end_A)
+ return;
+
+ // Trivial case: row length 1:
+ if (row_end_A - row_start_A == 1)
+ {
+ unsigned int A_col = A_col_buffer[row_start_A];
+ unsigned int B_end = B_row_buffer[A_col + 1];
+ NumericT A_value = A_elements[row_start_A];
+ C_col_buffer += row_start_C;
+ C_elements += row_start_C;
+ for (unsigned int j = B_row_buffer[A_col]; j < B_end; ++j, ++C_col_buffer, ++C_elements)
+ {
+ *C_col_buffer = B_col_buffer[j];
+ *C_elements = A_value * B_elements[j];
+ }
+ return;
+ }
+
+ unsigned int row_C_len = 0;
+ if (row_end_A - row_start_A == 2) // directly merge to C:
+ {
+ unsigned int A_col_1 = A_col_buffer[row_start_A];
+ unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
+
+ unsigned int B_offset_1 = B_row_buffer[A_col_1];
+ unsigned int B_offset_2 = B_row_buffer[A_col_2];
+
+ row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
+ B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
+ B_size2,
+ C_col_buffer + row_start_C, C_elements + row_start_C);
+ return;
+ }
+#ifdef VIENNACL_WITH_AVX2
+ else if (row_end_A - row_start_A > 10) // safely merge eight rows into temporary buffer:
+ {
+ row_C_len = row_C_scan_numeric_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A), A_elements + row_start_A,
+ (const int*)B_row_buffer, (const int*)B_col_buffer, B_elements, int(B_size2),
+ (int*)row_C_vector_1, row_C_vector_1_values);
+ row_start_A += 8;
+ }
+#endif
+ else // safely merge two rows into temporary buffer:
+ {
+ unsigned int A_col_1 = A_col_buffer[row_start_A];
+ unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
+
+ unsigned int B_offset_1 = B_row_buffer[A_col_1];
+ unsigned int B_offset_2 = B_row_buffer[A_col_2];
+
+ row_C_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
+ B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
+ B_size2,
+ row_C_vector_1, row_C_vector_1_values);
+ row_start_A += 2;
+ }
+
+ // process remaining rows:
+ while (row_end_A > row_start_A)
+ {
+#ifdef VIENNACL_WITH_AVX2
+ if (row_end_A - row_start_A > 9) // code in other if-conditionals ensures that values get written to C
+ {
+ unsigned int merged_len = row_C_scan_numeric_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A), A_elements + row_start_A,
+ (const int*)B_row_buffer, (const int*)B_col_buffer, B_elements, int(B_size2),
+ (int*)row_C_vector_3, row_C_vector_3_values);
+ row_C_len = row_C_scan_numeric_vector_1(row_C_vector_3, row_C_vector_3 + merged_len, row_C_vector_3_values, NumericT(1.0),
+ row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
+ B_size2,
+ row_C_vector_2, row_C_vector_2_values);
+ row_start_A += 8;
+ }
+ else
+#endif
+ if (row_start_A + 1 == row_end_A) // last row to merge, write directly to C:
+ {
+ unsigned int A_col = A_col_buffer[row_start_A];
+ unsigned int B_offset = B_row_buffer[A_col];
+
+ row_C_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset, B_col_buffer + B_row_buffer[A_col+1], B_elements + B_offset, A_elements[row_start_A],
+ row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
+ B_size2,
+ C_col_buffer + row_start_C, C_elements + row_start_C);
+ return;
+ }
+    else if (row_start_A + 2 < row_end_A) // at least three more rows left, so merge two
+    {
+      // merge the next two rows of B:
+ unsigned int A_col_1 = A_col_buffer[row_start_A];
+ unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
+
+ unsigned int B_offset_1 = B_row_buffer[A_col_1];
+ unsigned int B_offset_2 = B_row_buffer[A_col_2];
+
+ unsigned int merged_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
+ B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
+ B_size2,
+ row_C_vector_3, row_C_vector_3_values);
+ row_C_len = row_C_scan_numeric_vector_1(row_C_vector_3, row_C_vector_3 + merged_len, row_C_vector_3_values, NumericT(1.0),
+ row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
+ B_size2,
+ row_C_vector_2, row_C_vector_2_values);
+ row_start_A += 2;
+ }
+ else
+ {
+ unsigned int A_col = A_col_buffer[row_start_A];
+ unsigned int B_offset = B_row_buffer[A_col];
+
+ row_C_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset, B_col_buffer + B_row_buffer[A_col+1], B_elements + B_offset, A_elements[row_start_A],
+ row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
+ B_size2,
+ row_C_vector_2, row_C_vector_2_values);
+ ++row_start_A;
+ }
+
+ std::swap(row_C_vector_1, row_C_vector_2);
+ std::swap(row_C_vector_1_values, row_C_vector_2_values);
+ }
+}
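+
+// Note on buffering: the three scratch arrays are ping-ponged via std::swap, so each pass
+// merges freshly expanded rows of B (in row_C_vector_3) with the accumulated row in
+// row_C_vector_1 and writes to row_C_vector_2; only the final merge writes directly into
+// C_col_buffer and C_elements.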
+
+
+} // namespace host_based
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/vector_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/vector_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/vector_operations.hpp
new file mode 100644
index 0000000..b4944a2
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/vector_operations.hpp
@@ -0,0 +1,1188 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_VECTOR_OPERATIONS_HPP_
+#define VIENNACL_LINALG_HOST_BASED_VECTOR_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/vector_operations.hpp
+ @brief Implementations of vector operations using a plain single-threaded or OpenMP-enabled execution on CPU
+*/
+
+#include <cmath>
+#include <algorithm> //for std::max and std::min
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/linalg/host_based/common.hpp"
+#include "viennacl/linalg/detail/op_applier.hpp"
+#include "viennacl/traits/stride.hpp"
+
+#ifdef VIENNACL_WITH_OPENMP
+#include <omp.h>
+#endif
+
+// Minimum vector size for using OpenMP on vector operations:
+#ifndef VIENNACL_OPENMP_VECTOR_MIN_SIZE
+ #define VIENNACL_OPENMP_VECTOR_MIN_SIZE 5000
+#endif
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+namespace detail
+{
+ template<typename NumericT>
+ NumericT flip_sign(NumericT val) { return -val; }
+ inline unsigned long flip_sign(unsigned long val) { return val; }
+ inline unsigned int flip_sign(unsigned int val) { return val; }
+ inline unsigned short flip_sign(unsigned short val) { return val; }
+ inline unsigned char flip_sign(unsigned char val) { return val; }
+}
+
+//
+// Introductory note: By convention, all dimensions are already checked in the dispatcher frontend, so there is no need to check them again here.
+//
+template<typename DestNumericT, typename SrcNumericT>
+void convert(vector_base<DestNumericT> & dest, vector_base<SrcNumericT> const & src)
+{
+ DestNumericT * data_dest = detail::extract_raw_pointer<DestNumericT>(dest);
+ SrcNumericT const * data_src = detail::extract_raw_pointer<SrcNumericT>(src);
+
+ vcl_size_t start_dest = viennacl::traits::start(dest);
+ vcl_size_t inc_dest = viennacl::traits::stride(dest);
+ vcl_size_t size_dest = viennacl::traits::size(dest);
+
+ vcl_size_t start_src = viennacl::traits::start(src);
+ vcl_size_t inc_src = viennacl::traits::stride(src);
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size_dest > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size_dest); ++i)
+ data_dest[static_cast<vcl_size_t>(i)*inc_dest+start_dest] = static_cast<DestNumericT>(data_src[static_cast<vcl_size_t>(i)*inc_src+start_src]);
+}
+
+template<typename NumericT, typename ScalarT1>
+void av(vector_base<NumericT> & vec1,
+ vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha)
+{
+ typedef NumericT value_type;
+
+ value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+ value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
+
+ value_type data_alpha = alpha;
+ if (flip_sign_alpha)
+ data_alpha = detail::flip_sign(data_alpha);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+
+ vcl_size_t start2 = viennacl::traits::start(vec2);
+ vcl_size_t inc2 = viennacl::traits::stride(vec2);
+
+ if (reciprocal_alpha)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size1); ++i)
+ data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha;
+ }
+ else
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size1); ++i)
+ data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha;
+ }
+}
+
+
+template<typename NumericT, typename ScalarT1, typename ScalarT2>
+void avbv(vector_base<NumericT> & vec1,
+ vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t /* len_alpha */, bool reciprocal_alpha, bool flip_sign_alpha,
+ vector_base<NumericT> const & vec3, ScalarT2 const & beta, vcl_size_t /* len_beta */, bool reciprocal_beta, bool flip_sign_beta)
+{
+ typedef NumericT value_type;
+
+ value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+ value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
+ value_type const * data_vec3 = detail::extract_raw_pointer<value_type>(vec3);
+
+ value_type data_alpha = alpha;
+ if (flip_sign_alpha)
+ data_alpha = detail::flip_sign(data_alpha);
+
+ value_type data_beta = beta;
+ if (flip_sign_beta)
+ data_beta = detail::flip_sign(data_beta);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+
+ vcl_size_t start2 = viennacl::traits::start(vec2);
+ vcl_size_t inc2 = viennacl::traits::stride(vec2);
+
+ vcl_size_t start3 = viennacl::traits::start(vec3);
+ vcl_size_t inc3 = viennacl::traits::stride(vec3);
+
+ if (reciprocal_alpha)
+ {
+ if (reciprocal_beta)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size1); ++i)
+ data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] / data_beta;
+ }
+ else
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size1); ++i)
+ data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] * data_beta;
+ }
+ }
+ else
+ {
+ if (reciprocal_beta)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size1); ++i)
+ data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] / data_beta;
+ }
+ else
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size1); ++i)
+ data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] * data_beta;
+ }
+ }
+}
+
+
+template<typename NumericT, typename ScalarT1, typename ScalarT2>
+void avbv_v(vector_base<NumericT> & vec1,
+ vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha,
+ vector_base<NumericT> const & vec3, ScalarT2 const & beta, vcl_size_t /*len_beta*/, bool reciprocal_beta, bool flip_sign_beta)
+{
+ typedef NumericT value_type;
+
+ value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+ value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
+ value_type const * data_vec3 = detail::extract_raw_pointer<value_type>(vec3);
+
+ value_type data_alpha = alpha;
+ if (flip_sign_alpha)
+ data_alpha = detail::flip_sign(data_alpha);
+
+ value_type data_beta = beta;
+ if (flip_sign_beta)
+ data_beta = detail::flip_sign(data_beta);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+
+ vcl_size_t start2 = viennacl::traits::start(vec2);
+ vcl_size_t inc2 = viennacl::traits::stride(vec2);
+
+ vcl_size_t start3 = viennacl::traits::start(vec3);
+ vcl_size_t inc3 = viennacl::traits::stride(vec3);
+
+ if (reciprocal_alpha)
+ {
+ if (reciprocal_beta)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size1); ++i)
+ data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] += data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] / data_beta;
+ }
+ else
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size1); ++i)
+ data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] += data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] * data_beta;
+ }
+ }
+ else
+ {
+ if (reciprocal_beta)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size1); ++i)
+ data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] += data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] / data_beta;
+ }
+ else
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size1); ++i)
+ data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] += data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] * data_beta;
+ }
+ }
+}
+
+
+
+
+/** @brief Assign a constant value to a vector (-range/-slice)
+*
+* @param vec1 The vector to which the value should be assigned
+* @param alpha The value to be assigned
+* @param up_to_internal_size Specifies whether alpha should also be written to padded memory (mostly used for clearing the whole buffer).
+*/
+template<typename NumericT>
+void vector_assign(vector_base<NumericT> & vec1, const NumericT & alpha, bool up_to_internal_size = false)
+{
+ typedef NumericT value_type;
+
+ value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+ vcl_size_t loop_bound = up_to_internal_size ? vec1.internal_size() : size1; //Note: Do NOT use traits::internal_size() here, because vector proxies don't require padding.
+
+ value_type data_alpha = static_cast<value_type>(alpha);
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (loop_bound > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(loop_bound); ++i)
+ data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_alpha;
+}
+
+
+/** @brief Swaps the contents of two vectors, data is copied
+*
+* @param vec1 The first vector (or -range, or -slice)
+* @param vec2 The second vector (or -range, or -slice)
+*/
+template<typename NumericT>
+void vector_swap(vector_base<NumericT> & vec1, vector_base<NumericT> & vec2)
+{
+ typedef NumericT value_type;
+
+ value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+ value_type * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+
+ vcl_size_t start2 = viennacl::traits::start(vec2);
+ vcl_size_t inc2 = viennacl::traits::stride(vec2);
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size1); ++i)
+ {
+ value_type temp = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2];
+ data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] = data_vec1[static_cast<vcl_size_t>(i)*inc1+start1];
+ data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = temp;
+ }
+}
+
+
+///////////////////////// Elementwise operations /////////////
+
+/** @brief Implementation of the element-wise operation v1 = v2 .* v3 and v1 = v2 ./ v3 (using MATLAB syntax)
+*
+* @param vec1 The result vector (or -range, or -slice)
+* @param proxy The proxy object holding v2, v3 and the operation
+*/
+template<typename NumericT, typename OpT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_binary<OpT> > const & proxy)
+{
+ typedef NumericT value_type;
+ typedef viennacl::linalg::detail::op_applier<op_element_binary<OpT> > OpFunctor;
+
+ value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+ value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(proxy.lhs());
+ value_type const * data_vec3 = detail::extract_raw_pointer<value_type>(proxy.rhs());
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+
+ vcl_size_t start2 = viennacl::traits::start(proxy.lhs());
+ vcl_size_t inc2 = viennacl::traits::stride(proxy.lhs());
+
+ vcl_size_t start3 = viennacl::traits::start(proxy.rhs());
+ vcl_size_t inc3 = viennacl::traits::stride(proxy.rhs());
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size1); ++i)
+ OpFunctor::apply(data_vec1[static_cast<vcl_size_t>(i)*inc1+start1], data_vec2[static_cast<vcl_size_t>(i)*inc2+start2], data_vec3[static_cast<vcl_size_t>(i)*inc3+start3]);
+}
+
+/** @brief Implementation of unary element-wise operations v1 = OP(v2)
+*
+* @param vec1 The result vector (or -range, or -slice)
+* @param proxy The proxy object holding v2 and the unary operation
+*/
+template<typename NumericT, typename OpT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<OpT> > const & proxy)
+{
+ typedef NumericT value_type;
+ typedef viennacl::linalg::detail::op_applier<op_element_unary<OpT> > OpFunctor;
+
+ value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+ value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(proxy.lhs());
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+
+ vcl_size_t start2 = viennacl::traits::start(proxy.lhs());
+ vcl_size_t inc2 = viennacl::traits::stride(proxy.lhs());
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size1); ++i)
+ OpFunctor::apply(data_vec1[static_cast<vcl_size_t>(i)*inc1+start1], data_vec2[static_cast<vcl_size_t>(i)*inc2+start2]);
+}
+
+
+///////////////////////// Norms and inner product ///////////////////
+
+
+//implementation of inner product:
+
+namespace detail
+{
+
+// the following circumvents problems when trying to use a variable of template parameter type for a reduction.
+// Such a behavior is not covered by the OpenMP standard, hence we manually apply some preprocessor magic to resolve the problem.
+// See https://github.com/viennacl/viennacl-dev/issues/112 for a detailed explanation and discussion.
+
+#define VIENNACL_INNER_PROD_IMPL_1(RESULTSCALART, TEMPSCALART) \
+ inline RESULTSCALART inner_prod_impl(RESULTSCALART const * data_vec1, vcl_size_t start1, vcl_size_t inc1, vcl_size_t size1, \
+ RESULTSCALART const * data_vec2, vcl_size_t start2, vcl_size_t inc2) { \
+ TEMPSCALART temp = 0;
+
+#define VIENNACL_INNER_PROD_IMPL_2(RESULTSCALART) \
+ for (long i = 0; i < static_cast<long>(size1); ++i) \
+ temp += data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] * data_vec2[static_cast<vcl_size_t>(i)*inc2+start2]; \
+ return static_cast<RESULTSCALART>(temp); \
+ }
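+
+// For reference, the two macros bracket the OpenMP pragma, so the expanded overload
+// (sketched here for float) reads roughly as:
+//
+//   inline float inner_prod_impl(float const * data_vec1, vcl_size_t start1, vcl_size_t inc1, vcl_size_t size1,
+//                                float const * data_vec2, vcl_size_t start2, vcl_size_t inc2) {
+//     float temp = 0;
+//     #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+//     for (long i = 0; i < static_cast<long>(size1); ++i)
+//       temp += data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] * data_vec2[static_cast<vcl_size_t>(i)*inc2+start2];
+//     return static_cast<float>(temp);
+//   }
+//
+// which gives the reduction variable 'temp' a concrete, non-template type.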
+
+// char
+VIENNACL_INNER_PROD_IMPL_1(char, int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(char)
+
+VIENNACL_INNER_PROD_IMPL_1(unsigned char, int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(unsigned char)
+
+
+// short
+VIENNACL_INNER_PROD_IMPL_1(short, int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(short)
+
+VIENNACL_INNER_PROD_IMPL_1(unsigned short, int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(unsigned short)
+
+
+// int
+VIENNACL_INNER_PROD_IMPL_1(int, int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(int)
+
+VIENNACL_INNER_PROD_IMPL_1(unsigned int, unsigned int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(unsigned int)
+
+
+// long
+VIENNACL_INNER_PROD_IMPL_1(long, long)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(long)
+
+VIENNACL_INNER_PROD_IMPL_1(unsigned long, unsigned long)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(unsigned long)
+
+
+// float
+VIENNACL_INNER_PROD_IMPL_1(float, float)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(float)
+
+// double
+VIENNACL_INNER_PROD_IMPL_1(double, double)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(double)
+
+#undef VIENNACL_INNER_PROD_IMPL_1
+#undef VIENNACL_INNER_PROD_IMPL_2
+}
+
+/** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
+*
+* @param vec1 The first vector
+* @param vec2 The second vector
+* @param result The result scalar (on the gpu)
+*/
+template<typename NumericT, typename ScalarT>
+void inner_prod_impl(vector_base<NumericT> const & vec1,
+ vector_base<NumericT> const & vec2,
+ ScalarT & result)
+{
+ typedef NumericT value_type;
+
+ value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+ value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+
+ vcl_size_t start2 = viennacl::traits::start(vec2);
+ vcl_size_t inc2 = viennacl::traits::stride(vec2);
+
+ result = detail::inner_prod_impl(data_vec1, start1, inc1, size1,
+ data_vec2, start2, inc2); //Note: Assignment to result might be expensive, thus a temporary is introduced here
+}
+
+template<typename NumericT>
+void inner_prod_impl(vector_base<NumericT> const & x,
+ vector_tuple<NumericT> const & vec_tuple,
+ vector_base<NumericT> & result)
+{
+ typedef NumericT value_type;
+
+ value_type const * data_x = detail::extract_raw_pointer<value_type>(x);
+
+ vcl_size_t start_x = viennacl::traits::start(x);
+ vcl_size_t inc_x = viennacl::traits::stride(x);
+ vcl_size_t size_x = viennacl::traits::size(x);
+
+ std::vector<value_type> temp(vec_tuple.const_size());
+ std::vector<value_type const *> data_y(vec_tuple.const_size());
+ std::vector<vcl_size_t> start_y(vec_tuple.const_size());
+ std::vector<vcl_size_t> stride_y(vec_tuple.const_size());
+
+ for (vcl_size_t j=0; j<vec_tuple.const_size(); ++j)
+ {
+ data_y[j] = detail::extract_raw_pointer<value_type>(vec_tuple.const_at(j));
+ start_y[j] = viennacl::traits::start(vec_tuple.const_at(j));
+ stride_y[j] = viennacl::traits::stride(vec_tuple.const_at(j));
+ }
+
+  // Note: No OpenMP here, because a reduction over the array 'temp' is not directly supported. The savings in memory bandwidth are expected to justify this approach nonetheless...
+ for (vcl_size_t i = 0; i < size_x; ++i)
+ {
+ value_type entry_x = data_x[i*inc_x+start_x];
+ for (vcl_size_t j=0; j < vec_tuple.const_size(); ++j)
+ temp[j] += entry_x * data_y[j][i*stride_y[j]+start_y[j]];
+ }
+
+ for (vcl_size_t j=0; j < vec_tuple.const_size(); ++j)
+ result[j] = temp[j]; //Note: Assignment to result might be expensive, thus 'temp' is used for accumulation
+}
+
+
+namespace detail
+{
+
+#define VIENNACL_NORM_1_IMPL_1(RESULTSCALART, TEMPSCALART) \
+ inline RESULTSCALART norm_1_impl(RESULTSCALART const * data_vec1, vcl_size_t start1, vcl_size_t inc1, vcl_size_t size1) { \
+ TEMPSCALART temp = 0;
+
+#define VIENNACL_NORM_1_IMPL_2(RESULTSCALART, TEMPSCALART) \
+ for (long i = 0; i < static_cast<long>(size1); ++i) \
+ temp += static_cast<TEMPSCALART>(std::fabs(static_cast<double>(data_vec1[static_cast<vcl_size_t>(i)*inc1+start1]))); \
+ return static_cast<RESULTSCALART>(temp); \
+ }
+
+// char
+VIENNACL_NORM_1_IMPL_1(char, int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(char, int)
+
+VIENNACL_NORM_1_IMPL_1(unsigned char, int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(unsigned char, int)
+
+// short
+VIENNACL_NORM_1_IMPL_1(short, int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(short, int)
+
+VIENNACL_NORM_1_IMPL_1(unsigned short, int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(unsigned short, int)
+
+
+// int
+VIENNACL_NORM_1_IMPL_1(int, int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(int, int)
+
+VIENNACL_NORM_1_IMPL_1(unsigned int, unsigned int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(unsigned int, unsigned int)
+
+
+// long
+VIENNACL_NORM_1_IMPL_1(long, long)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(long, long)
+
+VIENNACL_NORM_1_IMPL_1(unsigned long, unsigned long)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(unsigned long, unsigned long)
+
+
+// float
+VIENNACL_NORM_1_IMPL_1(float, float)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(float, float)
+
+// double
+VIENNACL_NORM_1_IMPL_1(double, double)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(double, double)
+
+#undef VIENNACL_NORM_1_IMPL_1
+#undef VIENNACL_NORM_1_IMPL_2
+
+}
+
+/** @brief Computes the l^1-norm of a vector
+*
+* @param vec1 The vector
+* @param result The result scalar
+*/
+template<typename NumericT, typename ScalarT>
+void norm_1_impl(vector_base<NumericT> const & vec1,
+ ScalarT & result)
+{
+ typedef NumericT value_type;
+
+ value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+
+ result = detail::norm_1_impl(data_vec1, start1, inc1, size1); //Note: Assignment to result might be expensive, thus using a temporary for accumulation
+}
+
+
+
+namespace detail
+{
+
+#define VIENNACL_NORM_2_IMPL_1(RESULTSCALART, TEMPSCALART) \
+ inline RESULTSCALART norm_2_impl(RESULTSCALART const * data_vec1, vcl_size_t start1, vcl_size_t inc1, vcl_size_t size1) { \
+ TEMPSCALART temp = 0;
+
+#define VIENNACL_NORM_2_IMPL_2(RESULTSCALART, TEMPSCALART) \
+ for (long i = 0; i < static_cast<long>(size1); ++i) { \
+ RESULTSCALART data = data_vec1[static_cast<vcl_size_t>(i)*inc1+start1]; \
+ temp += static_cast<TEMPSCALART>(data * data); \
+ } \
+ return static_cast<RESULTSCALART>(temp); \
+ }
+
+// char
+VIENNACL_NORM_2_IMPL_1(char, int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(char, int)
+
+VIENNACL_NORM_2_IMPL_1(unsigned char, int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(unsigned char, int)
+
+
+// short
+VIENNACL_NORM_2_IMPL_1(short, int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(short, int)
+
+VIENNACL_NORM_2_IMPL_1(unsigned short, int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(unsigned short, int)
+
+
+// int
+VIENNACL_NORM_2_IMPL_1(int, int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(int, int)
+
+VIENNACL_NORM_2_IMPL_1(unsigned int, unsigned int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(unsigned int, unsigned int)
+
+
+// long
+VIENNACL_NORM_2_IMPL_1(long, long)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(long, long)
+
+VIENNACL_NORM_2_IMPL_1(unsigned long, unsigned long)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(unsigned long, unsigned long)
+
+
+// float
+VIENNACL_NORM_2_IMPL_1(float, float)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(float, float)
+
+// double
+VIENNACL_NORM_2_IMPL_1(double, double)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(double, double)
+
+#undef VIENNACL_NORM_2_IMPL_1
+#undef VIENNACL_NORM_2_IMPL_2
+
+}
+
+
+/** @brief Computes the l^2-norm of a vector - implementation
+*
+* @param vec1 The vector
+* @param result The result scalar
+*/
+template<typename NumericT, typename ScalarT>
+void norm_2_impl(vector_base<NumericT> const & vec1,
+ ScalarT & result)
+{
+ typedef NumericT value_type;
+
+ value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+
+ result = std::sqrt(detail::norm_2_impl(data_vec1, start1, inc1, size1)); //Note: Assignment to result might be expensive, thus 'temp' is used for accumulation
+}
+
+/** @brief Computes the supremum-norm of a vector
+*
+* @param vec1 The vector
+* @param result The result scalar
+*/
+template<typename NumericT, typename ScalarT>
+void norm_inf_impl(vector_base<NumericT> const & vec1,
+ ScalarT & result)
+{
+ typedef NumericT value_type;
+
+ value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+
+ vcl_size_t thread_count=1;
+
+ #ifdef VIENNACL_WITH_OPENMP
+ if(size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+ thread_count = omp_get_max_threads();
+ #endif
+
+ std::vector<value_type> temp(thread_count);
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ {
+ vcl_size_t id = 0;
+#ifdef VIENNACL_WITH_OPENMP
+ id = omp_get_thread_num();
+#endif
+
+ vcl_size_t begin = (size1 * id) / thread_count;
+ vcl_size_t end = (size1 * (id + 1)) / thread_count;
+ temp[id] = 0;
+
+ for (vcl_size_t i = begin; i < end; ++i)
+ temp[id] = std::max<value_type>(temp[id], static_cast<value_type>(std::fabs(static_cast<double>(data_vec1[i*inc1+start1])))); //casting to double in order to avoid problems if T is an integer type
+ }
+ for (vcl_size_t i = 1; i < thread_count; ++i)
+ temp[0] = std::max<value_type>( temp[0], temp[i]);
+ result = temp[0];
+}
+
+//This function should return a CPU scalar, otherwise statements like
+// vcl_rhs[index_norm_inf(vcl_rhs)]
+// are ambiguous
+/** @brief Computes the index of the first entry that is equal to the supremum-norm in modulus.
+*
+* @param vec1 The vector
+* @return The result. Note that the result must be a CPU scalar (vcl_size_t), since gpu scalars are floating point types.
+*/
+template<typename NumericT>
+vcl_size_t index_norm_inf(vector_base<NumericT> const & vec1)
+{
+ typedef NumericT value_type;
+
+ value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+ vcl_size_t thread_count=1;
+
+#ifdef VIENNACL_WITH_OPENMP
+ if(size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+ thread_count = omp_get_max_threads();
+#endif
+
+ std::vector<value_type> temp(thread_count);
+ std::vector<vcl_size_t> index(thread_count);
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ {
+ vcl_size_t id = 0;
+#ifdef VIENNACL_WITH_OPENMP
+ id = omp_get_thread_num();
+#endif
+ vcl_size_t begin = (size1 * id) / thread_count;
+ vcl_size_t end = (size1 * (id + 1)) / thread_count;
+ index[id] = start1;
+ temp[id] = 0;
+ value_type data;
+
+ for (vcl_size_t i = begin; i < end; ++i)
+ {
+ data = static_cast<value_type>(std::fabs(static_cast<double>(data_vec1[i*inc1+start1]))); //casting to double in order to avoid problems if T is an integer type
+ if (data > temp[id])
+ {
+ index[id] = i;
+ temp[id] = data;
+ }
+ }
+ }
+ for (vcl_size_t i = 1; i < thread_count; ++i)
+ {
+ if (temp[i] > temp[0])
+ {
+ index[0] = index[i];
+ temp[0] = temp[i];
+ }
+ }
+ return index[0];
+}
+
+/** @brief Computes the maximum of a vector
+*
+* @param vec1 The vector
+* @param result The result scalar
+*/
+template<typename NumericT, typename ScalarT>
+void max_impl(vector_base<NumericT> const & vec1,
+ ScalarT & result)
+{
+ typedef NumericT value_type;
+
+ value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+
+ vcl_size_t thread_count=1;
+
+#ifdef VIENNACL_WITH_OPENMP
+ if(size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+ thread_count = omp_get_max_threads();
+#endif
+
+ std::vector<value_type> temp(thread_count);
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ {
+ vcl_size_t id = 0;
+#ifdef VIENNACL_WITH_OPENMP
+ id = omp_get_thread_num();
+#endif
+ vcl_size_t begin = (size1 * id) / thread_count;
+ vcl_size_t end = (size1 * (id + 1)) / thread_count;
+ temp[id] = data_vec1[start1];
+
+ for (vcl_size_t i = begin; i < end; ++i)
+ {
+      value_type v = data_vec1[i*inc1+start1]; // Note: evaluating the vector entry directly inside std::max might be expensive, thus 'v' is used
+ temp[id] = std::max<value_type>(temp[id],v);
+ }
+ }
+ for (vcl_size_t i = 1; i < thread_count; ++i)
+ temp[0] = std::max<value_type>( temp[0], temp[i]);
+ result = temp[0];//Note: Assignment to result might be expensive, thus 'temp' is used for accumulation
+}
+
+/** @brief Computes the minimum of a vector
+*
+* @param vec1 The vector
+* @param result The result scalar
+*/
+template<typename NumericT, typename ScalarT>
+void min_impl(vector_base<NumericT> const & vec1,
+ ScalarT & result)
+{
+ typedef NumericT value_type;
+
+ value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+
+ vcl_size_t thread_count=1;
+
+#ifdef VIENNACL_WITH_OPENMP
+ if(size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+ thread_count = omp_get_max_threads();
+#endif
+
+ std::vector<value_type> temp(thread_count);
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ {
+ vcl_size_t id = 0;
+#ifdef VIENNACL_WITH_OPENMP
+ id = omp_get_thread_num();
+#endif
+ vcl_size_t begin = (size1 * id) / thread_count;
+ vcl_size_t end = (size1 * (id + 1)) / thread_count;
+ temp[id] = data_vec1[start1];
+
+ for (vcl_size_t i = begin; i < end; ++i)
+ {
+      value_type v = data_vec1[i*inc1+start1]; // Note: evaluating the vector entry directly inside std::min might be expensive, thus 'v' is used
+ temp[id] = std::min<value_type>(temp[id],v);
+ }
+ }
+ for (vcl_size_t i = 1; i < thread_count; ++i)
+ temp[0] = std::min<value_type>( temp[0], temp[i]);
+ result = temp[0];//Note: Assignment to result might be expensive, thus 'temp' is used for accumulation
+}
+
+/** @brief Computes the sum of all elements from the vector
+*
+* @param vec1 The vector
+* @param result The result scalar
+*/
+template<typename NumericT, typename ScalarT>
+void sum_impl(vector_base<NumericT> const & vec1,
+ ScalarT & result)
+{
+ typedef NumericT value_type;
+
+ value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+
+ value_type temp = 0;
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+:temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size1); ++i)
+ temp += data_vec1[static_cast<vcl_size_t>(i)*inc1+start1];
+
+ result = temp; //Note: Assignment to result might be expensive, thus 'temp' is used for accumulation
+}
+
+/** @brief Computes a plane rotation of two vectors.
+*
+* Computes (x,y) <- (alpha * x + beta * y, -beta * x + alpha * y)
+*
+* @param vec1 The first vector
+* @param vec2 The second vector
+* @param alpha The first transformation coefficient
+* @param beta The second transformation coefficient
+*/
+template<typename NumericT>
+void plane_rotation(vector_base<NumericT> & vec1,
+ vector_base<NumericT> & vec2,
+ NumericT alpha, NumericT beta)
+{
+ typedef NumericT value_type;
+
+ value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+ value_type * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+
+ vcl_size_t start2 = viennacl::traits::start(vec2);
+ vcl_size_t inc2 = viennacl::traits::stride(vec2);
+
+ value_type data_alpha = alpha;
+ value_type data_beta = beta;
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size1); ++i)
+ {
+ value_type temp1 = data_vec1[static_cast<vcl_size_t>(i)*inc1+start1];
+ value_type temp2 = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2];
+
+ data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_alpha * temp1 + data_beta * temp2;
+ data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] = data_alpha * temp2 - data_beta * temp1;
+ }
+}
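+
+// Illustration (hypothetical values): with alpha = cos(phi) and beta = sin(phi) this is a Givens
+// rotation. For alpha = 0.6, beta = 0.8 and entries x_i = 1, y_i = 2, the loop above yields
+// x_i <- 0.6*1 + 0.8*2 = 2.2 and y_i <- 0.6*2 - 0.8*1 = 0.4.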
+
+namespace detail
+{
+ /** @brief Implementation of inclusive_scan and exclusive_scan for the host (OpenMP) backend. */
+ template<typename NumericT>
+ void vector_scan_impl(vector_base<NumericT> const & vec1,
+ vector_base<NumericT> & vec2,
+ bool is_inclusive)
+ {
+ NumericT const * data_vec1 = detail::extract_raw_pointer<NumericT>(vec1);
+ NumericT * data_vec2 = detail::extract_raw_pointer<NumericT>(vec2);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+ if (size1 < 1)
+ return;
+
+ vcl_size_t start2 = viennacl::traits::start(vec2);
+ vcl_size_t inc2 = viennacl::traits::stride(vec2);
+
+#ifdef VIENNACL_WITH_OPENMP
+ if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+ {
+ std::vector<NumericT> thread_results(omp_get_max_threads());
+
+ // inclusive scan each thread segment:
+ #pragma omp parallel
+ {
+ vcl_size_t work_per_thread = (size1 - 1) / thread_results.size() + 1;
+ vcl_size_t thread_start = work_per_thread * omp_get_thread_num();
+ vcl_size_t thread_stop = std::min<vcl_size_t>(thread_start + work_per_thread, size1);
+
+ NumericT thread_sum = 0;
+ for(vcl_size_t i = thread_start; i < thread_stop; i++)
+ thread_sum += data_vec1[i * inc1 + start1];
+
+ thread_results[omp_get_thread_num()] = thread_sum;
+ }
+
+ // exclusive-scan of thread results:
+ NumericT current_offset = 0;
+ for (vcl_size_t i=0; i<thread_results.size(); ++i)
+ {
+ NumericT tmp = thread_results[i];
+ thread_results[i] = current_offset;
+ current_offset += tmp;
+ }
+
+ // exclusive/inclusive scan of each segment with correct offset:
+ #pragma omp parallel
+ {
+ vcl_size_t work_per_thread = (size1 - 1) / thread_results.size() + 1;
+ vcl_size_t thread_start = work_per_thread * omp_get_thread_num();
+ vcl_size_t thread_stop = std::min<vcl_size_t>(thread_start + work_per_thread, size1);
+
+ NumericT thread_sum = thread_results[omp_get_thread_num()];
+ if (is_inclusive)
+ {
+ for(vcl_size_t i = thread_start; i < thread_stop; i++)
+ {
+ thread_sum += data_vec1[i * inc1 + start1];
+ data_vec2[i * inc2 + start2] = thread_sum;
+ }
+ }
+ else
+ {
+ for(vcl_size_t i = thread_start; i < thread_stop; i++)
+ {
+ NumericT tmp = data_vec1[i * inc1 + start1];
+ data_vec2[i * inc2 + start2] = thread_sum;
+ thread_sum += tmp;
+ }
+ }
+ }
+ } else
+#endif
+ {
+ NumericT sum = 0;
+ if (is_inclusive)
+ {
+ for(vcl_size_t i = 0; i < size1; i++)
+ {
+ sum += data_vec1[i * inc1 + start1];
+ data_vec2[i * inc2 + start2] = sum;
+ }
+ }
+ else
+ {
+ for(vcl_size_t i = 0; i < size1; i++)
+ {
+ NumericT tmp = data_vec1[i * inc1 + start1];
+ data_vec2[i * inc2 + start2] = sum;
+ sum += tmp;
+ }
+ }
+ }
+
+ }
+}
+
+/** @brief This function implements an inclusive scan on the host using OpenMP.
+*
+* Given an element vector (x_0, x_1, ..., x_{n-1}),
+* this routine computes (x_0, x_0 + x_1, ..., x_0 + x_1 + ... + x_{n-1})
+*
+* @param vec1 The input vector (only overwritten if vec2 refers to the same vector).
+* @param vec2 The output vector. Either identical to vec1 or non-overlapping.
+*/
+template<typename NumericT>
+void inclusive_scan(vector_base<NumericT> const & vec1,
+ vector_base<NumericT> & vec2)
+{
+ detail::vector_scan_impl(vec1, vec2, true);
+}
+
+/** @brief This function implements an exclusive scan on the host using OpenMP.
+*
+* Given an element vector (x_0, x_1, ..., x_{n-1}),
+* this routine computes (0, x_0, x_0 + x_1, ..., x_0 + x_1 + ... + x_{n-2})
+*
+* @param vec1 The input vector (only overwritten if vec2 refers to the same vector).
+* @param vec2 The output vector. Either identical to vec1 or non-overlapping.
+*/
+template<typename NumericT>
+void exclusive_scan(vector_base<NumericT> const & vec1,
+ vector_base<NumericT> & vec2)
+{
+ detail::vector_scan_impl(vec1, vec2, false);
+}
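
The OpenMP branch of vector_scan_impl() above uses a three-phase scheme: (1) each thread sums its contiguous segment, (2) an exclusive scan over those per-thread sums yields a starting offset per segment, (3) each thread re-scans its segment starting from that offset. A sequential sketch of the same idea on a plain array, with the expected results for a small input (illustration only):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<double> in = {1, 2, 3, 4, 5, 6};
      std::vector<double> out(in.size());
      const std::size_t segments = 3;                       // stands in for the number of threads
      std::size_t seg_len = (in.size() - 1) / segments + 1;

      // phase 1: per-segment sums
      std::vector<double> seg_sum(segments, 0);
      for (std::size_t s = 0; s < segments; ++s)
        for (std::size_t i = s * seg_len; i < std::min((s + 1) * seg_len, in.size()); ++i)
          seg_sum[s] += in[i];

      // phase 2: exclusive scan of the segment sums -> starting offset per segment
      double offset = 0;
      for (std::size_t s = 0; s < segments; ++s) {
        double t = seg_sum[s];
        seg_sum[s] = offset;
        offset += t;
      }

      // phase 3: inclusive scan of each segment, shifted by its offset
      for (std::size_t s = 0; s < segments; ++s) {
        double run = seg_sum[s];
        for (std::size_t i = s * seg_len; i < std::min((s + 1) * seg_len, in.size()); ++i) {
          run += in[i];
          out[i] = run;
        }
      }

      for (double v : out) std::printf("%g ", v);  // inclusive scan: 1 3 6 10 15 21
      std::printf("\n");                           // (an exclusive scan would give: 0 1 3 6 10 15)
    }
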
+
+
+} //namespace host_based
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/ichol.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/ichol.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/ichol.hpp
new file mode 100644
index 0000000..1038b2b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/ichol.hpp
@@ -0,0 +1,228 @@
+#ifndef VIENNACL_LINALG_ICHOL_HPP_
+#define VIENNACL_LINALG_ICHOL_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/ichol.hpp
+ @brief Implementations of incomplete Cholesky factorization preconditioners with static nonzero pattern.
+*/
+
+#include <vector>
+#include <cmath>
+#include <iostream>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/compressed_matrix.hpp"
+
+#include "viennacl/linalg/host_based/common.hpp"
+
+#include <map>
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief A tag for incomplete Cholesky factorization with static pattern (ICHOL0)
+*/
+class ichol0_tag {};
+
+
+/** @brief Implementation of an incomplete Cholesky (ICHOL0) preconditioner with static pattern. Optimized version for CSR matrices.
+ *
+ * Refer to Chih-Jen Lin and Jorge J. Moré, Incomplete Cholesky Factorizations with Limited Memory, SIAM J. Sci. Comput., 21(1), 24–45,
+ * for one of many descriptions of incomplete Cholesky factorizations.
+ *
+ * @param A The input matrix in CSR format
+ * // param tag An ichol0_tag in order to dispatch among several other preconditioners.
+ */
+template<typename NumericT>
+void precondition(viennacl::compressed_matrix<NumericT> & A, ichol0_tag const & /* tag */)
+{
+ assert( (viennacl::traits::context(A).memory_type() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ICHOL0") );
+
+ NumericT * elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
+ unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+ unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+
+ //std::cout << A.size1() << std::endl;
+ for (vcl_size_t i=0; i<A.size1(); ++i)
+ {
+ unsigned int row_i_begin = row_buffer[i];
+ unsigned int row_i_end = row_buffer[i+1];
+
+ // get a_ii:
+ NumericT a_ii = 0;
+ for (unsigned int buf_index_aii = row_i_begin; buf_index_aii < row_i_end; ++buf_index_aii)
+ {
+ if (col_buffer[buf_index_aii] == i)
+ {
+ a_ii = std::sqrt(elements[buf_index_aii]);
+ elements[buf_index_aii] = a_ii;
+ break;
+ }
+ }
+
+ // Now scale column/row i, i.e. A(k, i) /= A(i, i)
+ for (unsigned int buf_index_aii = row_i_begin; buf_index_aii < row_i_end; ++buf_index_aii)
+ {
+ if (col_buffer[buf_index_aii] > i)
+ elements[buf_index_aii] /= a_ii;
+ }
+
+ // Now compute A(k, j) -= A(k, i) * A(j, i) for all nonzero k, j in column i:
+ for (unsigned int buf_index_j = row_i_begin; buf_index_j < row_i_end; ++buf_index_j)
+ {
+ unsigned int j = col_buffer[buf_index_j];
+ if (j <= i)
+ continue;
+
+ NumericT a_ji = elements[buf_index_j];
+
+ for (unsigned int buf_index_k = row_i_begin; buf_index_k < row_i_end; ++buf_index_k)
+ {
+ unsigned int k = col_buffer[buf_index_k];
+ if (k < j)
+ continue;
+
+ NumericT a_ki = elements[buf_index_k];
+
+ //Now check whether A(k, j) is in nonzero pattern:
+ unsigned int row_j_begin = row_buffer[j];
+ unsigned int row_j_end = row_buffer[j+1];
+ for (unsigned int buf_index_kj = row_j_begin; buf_index_kj < row_j_end; ++buf_index_kj)
+ {
+ if (col_buffer[buf_index_kj] == k)
+ {
+ elements[buf_index_kj] -= a_ki * a_ji;
+ break;
+ }
+ }
+ }
+ }
+
+ }
+
+}
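
The loop above performs, per row/column i of the CSR matrix: take the square root of the diagonal entry, scale the remaining entries of that row/column by it, and apply a rank-one update to the trailing entries that lie in the existing nonzero pattern (fill-in is dropped). A dense reference sketch of the same three steps on a small SPD matrix (hypothetical values; with a dense matrix nothing is dropped, so the result is an exact Cholesky factor):

    #include <cmath>
    #include <cstdio>

    int main() {
      const int n = 3;
      double A[3][3] = {{4, 2, 2},
                        {2, 5, 3},
                        {2, 3, 6}};
      for (int i = 0; i < n; ++i) {
        A[i][i] = std::sqrt(A[i][i]);              // a_ii <- sqrt(a_ii)
        for (int k = i + 1; k < n; ++k)
          A[k][i] /= A[i][i];                      // scale column i: A(k,i) /= A(i,i)
        for (int j = i + 1; j < n; ++j)            // rank-one update of the trailing block:
          for (int k = j; k < n; ++k)              //   A(k,j) -= A(k,i) * A(j,i)
            A[k][j] -= A[k][i] * A[j][i];
      }
      for (int i = 0; i < n; ++i) {                // lower triangle now holds L with A = L * L^T
        for (int j = 0; j < n; ++j)
          std::printf("%6.3f ", j <= i ? A[i][j] : 0.0);
        std::printf("\n");
      }
    }
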
+
+
+/** @brief Incomplete Cholesky preconditioner class with static pattern (ICHOL0), can be supplied to solve()-routines
+*/
+template<typename MatrixT>
+class ichol0_precond
+{
+ typedef typename MatrixT::value_type NumericType;
+
+public:
+ ichol0_precond(MatrixT const & mat, ichol0_tag const & tag) : tag_(tag), LLT(mat.size1(), mat.size2(), viennacl::context(viennacl::MAIN_MEMORY))
+ {
+ //initialize preconditioner:
+ //std::cout << "Start CPU precond" << std::endl;
+ init(mat);
+ //std::cout << "End CPU precond" << std::endl;
+ }
+
+ template<typename VectorT>
+ void apply(VectorT & vec) const
+ {
+ unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LLT.handle1());
+ unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LLT.handle2());
+ NumericType const * elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericType>(LLT.handle());
+
+ // Note: L is stored in a column-oriented fashion, i.e. transposed w.r.t. the row-oriented layout. Thus, the factorization A = L L^T holds L in the upper triangular part of A.
+ viennacl::linalg::host_based::detail::csr_trans_inplace_solve<NumericType>(row_buffer, col_buffer, elements, vec, LLT.size2(), lower_tag());
+ viennacl::linalg::host_based::detail::csr_inplace_solve<NumericType>(row_buffer, col_buffer, elements, vec, LLT.size2(), upper_tag());
+ }
+
+private:
+ void init(MatrixT const & mat)
+ {
+ viennacl::context host_ctx(viennacl::MAIN_MEMORY);
+ viennacl::switch_memory_context(LLT, host_ctx);
+
+ viennacl::copy(mat, LLT);
+ viennacl::linalg::precondition(LLT, tag_);
+ }
+
+ ichol0_tag const & tag_;
+ viennacl::compressed_matrix<NumericType> LLT;
+};
+
+
+/** @brief Incomplete Cholesky (ICHOL0) preconditioner class, can be supplied to solve()-routines.
+*
+* Specialization for compressed_matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+class ichol0_precond< compressed_matrix<NumericT, AlignmentV> >
+{
+ typedef compressed_matrix<NumericT, AlignmentV> MatrixType;
+
+public:
+ ichol0_precond(MatrixType const & mat, ichol0_tag const & tag) : tag_(tag), LLT(mat.size1(), mat.size2(), viennacl::traits::context(mat))
+ {
+ //initialize preconditioner:
+ //std::cout << "Start GPU precond" << std::endl;
+ init(mat);
+ //std::cout << "End GPU precond" << std::endl;
+ }
+
+ void apply(vector<NumericT> & vec) const
+ {
+ if (viennacl::traits::context(vec).memory_type() != viennacl::MAIN_MEMORY)
+ {
+ viennacl::context host_ctx(viennacl::MAIN_MEMORY);
+ viennacl::context old_ctx = viennacl::traits::context(vec);
+
+ viennacl::switch_memory_context(vec, host_ctx);
+ viennacl::linalg::inplace_solve(trans(LLT), vec, lower_tag());
+ viennacl::linalg::inplace_solve( LLT , vec, upper_tag());
+ viennacl::switch_memory_context(vec, old_ctx);
+ }
+ else //apply ICHOL0 directly:
+ {
+ // Note: L is stored in a column-oriented fashion, i.e. transposed w.r.t. the row-oriented layout. Thus, the factorization A = L L^T holds L in the upper triangular part of A.
+ viennacl::linalg::inplace_solve(trans(LLT), vec, lower_tag());
+ viennacl::linalg::inplace_solve( LLT , vec, upper_tag());
+ }
+ }
+
+private:
+ void init(MatrixType const & mat)
+ {
+ viennacl::context host_ctx(viennacl::MAIN_MEMORY);
+ viennacl::switch_memory_context(LLT, host_ctx);
+ LLT = mat;
+
+ viennacl::linalg::precondition(LLT, tag_);
+ }
+
+ ichol0_tag const & tag_;
+ viennacl::compressed_matrix<NumericT> LLT;
+};
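
For context, these classes follow the usual ViennaCL preconditioner interface: build the preconditioner from the system matrix and hand it to an iterative solver. A minimal usage sketch (matrix assembly omitted; the choice of CG and the exact includes are assumptions based on the standard ViennaCL API, not taken from this patch):

    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/cg.hpp"
    #include "viennacl/linalg/ichol.hpp"

    viennacl::vector<double> solve_with_ichol0(viennacl::compressed_matrix<double> const & A,
                                               viennacl::vector<double> const & rhs)
    {
      viennacl::linalg::ichol0_tag ichol_config;   // static-pattern incomplete Cholesky
      viennacl::linalg::ichol0_precond< viennacl::compressed_matrix<double> > precond(A, ichol_config);
      // preconditioned conjugate gradient solve (assumed solver choice):
      return viennacl::linalg::solve(A, rhs, viennacl::linalg::cg_tag(), precond);
    }
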
+
+}
+}
+
+
+
+
+#endif
+
+
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/svd.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/svd.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/svd.hpp
new file mode 100644
index 0000000..0b93bb8
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/svd.hpp
@@ -0,0 +1,703 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_SVD_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_SVD_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/svd.hpp
+ * @brief OpenCL kernel file for singular value decomposition */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+template <typename StringType>
+void generate_svd_bidiag_pack(StringType & source, std::string const & numeric_string, bool is_row_major)
+{
+ source.append("__kernel void bidiag_pack(__global "); source.append(numeric_string); source.append("* A, \n");
+ source.append(" __global "); source.append(numeric_string); source.append("* D, \n");
+ source.append(" __global "); source.append(numeric_string); source.append("* S, \n");
+ source.append(" uint size1, \n");
+ source.append(" uint size2, \n");
+ source.append(" uint stride \n");
+ source.append(") { \n");
+ source.append(" uint size = min(size1, size2); \n");
+
+ source.append(" if(get_global_id(0) == 0) \n");
+ source.append(" S[0] = 0; \n");
+ if(is_row_major)
+ {
+ source.append(" for(uint i = get_global_id(0); i < size ; i += get_global_size(0)) { \n");
+ source.append(" D[i] = A[i*stride + i]; \n");
+ source.append(" S[i + 1] = (i + 1 < size2) ? A[i*stride + (i + 1)] : 0; \n");
+ }
+ else
+ {
+ source.append(" for(uint i = get_global_id(0); i < size ; i += get_global_size(0)) { \n");
+ source.append(" D[i] = A[i*stride + i]; \n");
+ source.append(" S[i + 1] = (i + 1 < size2) ? A[i + (i + 1) * stride] : 0; \n");
+ }
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+template<typename StringT>
+void generate_svd_col_reduce_lcl_array(StringT & source, std::string const & numeric_string)
+{
+ // calculates a sum of local array elements
+ source.append("void col_reduce_lcl_array(__local "); source.append(numeric_string); source.append("* sums, uint lcl_id, uint lcl_sz) { \n");
+ source.append(" uint step = lcl_sz >> 1; \n");
+
+ source.append(" while (step > 0) { \n");
+ source.append(" if (lcl_id < step) { \n");
+ source.append(" sums[lcl_id] += sums[lcl_id + step]; \n");
+ source.append(" } \n");
+ source.append(" step >>= 1; \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
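
col_reduce_lcl_array() is the classic halving tree reduction: at every step the lower half of the local array absorbs the upper half, so after log2(lcl_sz) steps the total sum ends up in sums[0]. A sequential host sketch of the same loop (illustration only; the barriers are only needed on the device):

    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<double> sums = {1, 2, 3, 4, 5, 6, 7, 8};   // stands in for lcl_sz = 8 work items
      for (std::size_t step = sums.size() / 2; step > 0; step >>= 1)
        for (std::size_t lcl_id = 0; lcl_id < step; ++lcl_id)
          sums[lcl_id] += sums[lcl_id + step];               // lower half absorbs upper half
      std::printf("%g\n", sums[0]);                          // prints: 36
    }
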
+
+template <typename StringType>
+void generate_svd_copy_col(StringType & source, std::string const & numeric_string, bool is_row_major)
+{
+ // probably, this is an ugly way to do this
+ source.append("__kernel void copy_col(__global "); source.append(numeric_string); source.append("* A, \n");
+ source.append(" __global "); source.append(numeric_string); source.append("* V, \n");
+ source.append(" uint row_start, \n");
+ source.append(" uint col_start, \n");
+ source.append(" uint size, \n");
+ source.append(" uint stride \n");
+ source.append(" ) { \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint glb_sz = get_global_size(0); \n");
+ if(is_row_major)
+ {
+ source.append(" for(uint i = row_start + glb_id; i < size; i += glb_sz) { \n");
+ source.append(" V[i - row_start] = A[i * stride + col_start]; \n");
+ source.append(" } \n");
+ }
+ else
+ {
+ source.append(" for(uint i = row_start + glb_id; i < size; i += glb_sz) { \n");
+ source.append(" V[i - row_start] = A[i + col_start * stride]; \n");
+ source.append(" } \n");
+ }
+
+ source.append("} \n");
+}
+
+template <typename StringType>
+void generate_svd_copy_row(StringType & source, std::string const & numeric_string, bool is_row_major)
+{
+ // probably, this is an ugly way too
+ source.append("__kernel void copy_row(__global "); source.append(numeric_string); source.append("* A, \n");
+ source.append(" __global "); source.append(numeric_string); source.append("* V, \n");
+ source.append(" uint row_start, \n");
+ source.append(" uint col_start, \n");
+ source.append(" uint size, \n");
+ source.append(" uint stride \n");
+ source.append(" ) { \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint glb_sz = get_global_size(0); \n");
+ if(is_row_major)
+ {
+ source.append(" for(uint i = col_start + glb_id; i < size; i += glb_sz) { \n");
+ source.append(" V[i - col_start] = A[row_start * stride + i]; \n");
+ source.append(" } \n");
+ }
+ else
+ {
+ source.append(" for(uint i = col_start + glb_id; i < size; i += glb_sz) { \n");
+ source.append(" V[i - col_start] = A[row_start + i * stride]; \n");
+ source.append(" } \n");
+ }
+
+ source.append("} \n");
+}
+
+template<typename StringT>
+void generate_svd_final_iter_update(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void final_iter_update(__global "); source.append(numeric_string); source.append("* A, \n");
+ source.append(" uint stride, \n");
+ source.append(" uint n, \n");
+ source.append(" uint last_n, \n");
+ source.append(" "); source.append(numeric_string); source.append(" q, \n");
+ source.append(" "); source.append(numeric_string); source.append(" p \n");
+ source.append(" ) \n");
+ source.append("{ \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint glb_sz = get_global_size(0); \n");
+
+ source.append(" for (uint px = glb_id; px < last_n; px += glb_sz) \n");
+ source.append(" { \n");
+ source.append(" "); source.append(numeric_string); source.append(" v_in = A[n * stride + px]; \n");
+ source.append(" "); source.append(numeric_string); source.append(" z = A[(n - 1) * stride + px]; \n");
+ source.append(" A[(n - 1) * stride + px] = q * z + p * v_in; \n");
+ source.append(" A[n * stride + px] = q * v_in - p * z; \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+template <typename StringType>
+void generate_svd_givens_next(StringType & source, std::string const & numeric_string, bool is_row_major)
+{
+ source.append("__kernel void givens_next(__global "); source.append(numeric_string); source.append("* matr, \n");
+ source.append(" __global "); source.append(numeric_string); source.append("* cs, \n");
+ source.append(" __global "); source.append(numeric_string); source.append("* ss, \n");
+ source.append(" uint size, \n");
+ source.append(" uint stride, \n");
+ source.append(" uint start_i, \n");
+ source.append(" uint end_i \n");
+ source.append(" ) \n");
+ source.append("{ \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint glb_sz = get_global_size(0); \n");
+
+ source.append(" uint lcl_id = get_local_id(0); \n");
+ source.append(" uint lcl_sz = get_local_size(0); \n");
+
+ source.append(" uint j = glb_id; \n");
+
+ source.append(" __local "); source.append(numeric_string); source.append(" cs_lcl[256]; \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" ss_lcl[256]; \n");
+ if(is_row_major)
+ {
+
+ source.append(" "); source.append(numeric_string); source.append(" x = (j < size) ? matr[(end_i + 1) + j * stride] : 0; \n");
+
+ source.append(" uint elems_num = end_i - start_i + 1; \n");
+ source.append(" uint block_num = (elems_num + lcl_sz - 1) / lcl_sz; \n");
+
+ source.append(" for(uint block_id = 0; block_id < block_num; block_id++) \n");
+ source.append(" { \n");
+ source.append(" uint to = min(elems_num - block_id * lcl_sz, lcl_sz); \n");
+
+ source.append(" if(lcl_id < to) \n");
+ source.append(" { \n");
+ source.append(" cs_lcl[lcl_id] = cs[end_i - (lcl_id + block_id * lcl_sz)]; \n");
+ source.append(" ss_lcl[lcl_id] = ss[end_i - (lcl_id + block_id * lcl_sz)]; \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ source.append(" if(j < size) \n");
+ source.append(" { \n");
+ source.append(" for(uint ind = 0; ind < to; ind++) \n");
+ source.append(" { \n");
+ source.append(" uint i = end_i - (ind + block_id * lcl_sz); \n");
+
+ source.append(" "); source.append(numeric_string); source.append(" z = matr[i + j * stride]; \n");
+
+ source.append(" "); source.append(numeric_string); source.append(" cs_val = cs_lcl[ind]; \n");
+ source.append(" "); source.append(numeric_string); source.append(" ss_val = ss_lcl[ind]; \n");
+
+ source.append(" matr[(i + 1) + j * stride] = x * cs_val + z * ss_val; \n");
+ source.append(" x = -x * ss_val + z * cs_val; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" } \n");
+ source.append(" if(j < size) \n");
+ source.append(" matr[(start_i) + j * stride] = x; \n");
+ }
+ else
+ {
+
+ source.append(" "); source.append(numeric_string); source.append(" x = (j < size) ? matr[(end_i + 1) * stride + j] : 0; \n");
+
+ source.append(" uint elems_num = end_i - start_i + 1; \n");
+ source.append(" uint block_num = (elems_num + lcl_sz - 1) / lcl_sz; \n");
+
+ source.append(" for(uint block_id = 0; block_id < block_num; block_id++) \n");
+ source.append(" { \n");
+ source.append(" uint to = min(elems_num - block_id * lcl_sz, lcl_sz); \n");
+
+ source.append(" if(lcl_id < to) \n");
+ source.append(" { \n");
+ source.append(" cs_lcl[lcl_id] = cs[end_i - (lcl_id + block_id * lcl_sz)]; \n");
+ source.append(" ss_lcl[lcl_id] = ss[end_i - (lcl_id + block_id * lcl_sz)]; \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ source.append(" if(j < size) \n");
+ source.append(" { \n");
+ source.append(" for(uint ind = 0; ind < to; ind++) \n");
+ source.append(" { \n");
+ source.append(" uint i = end_i - (ind + block_id * lcl_sz); \n");
+
+ source.append(" "); source.append(numeric_string); source.append(" z = matr[i * stride + j]; \n");
+
+ source.append(" "); source.append(numeric_string); source.append(" cs_val = cs_lcl[ind]; \n");
+ source.append(" "); source.append(numeric_string); source.append(" ss_val = ss_lcl[ind]; \n");
+
+ source.append(" matr[(i + 1) * stride + j] = x * cs_val + z * ss_val; \n");
+ source.append(" x = -x * ss_val + z * cs_val; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" } \n");
+ source.append(" if(j < size) \n");
+ source.append(" matr[(start_i) * stride + j] = x; \n");
+ }
+ source.append("} \n");
+}
+
+template<typename StringT>
+void generate_svd_givens_prev(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void givens_prev(__global "); source.append(numeric_string); source.append("* matr, \n");
+ source.append(" __global "); source.append(numeric_string); source.append("* cs, \n");
+ source.append(" __global "); source.append(numeric_string); source.append("* ss, \n");
+ source.append(" uint size, \n");
+ source.append(" uint stride, \n");
+ source.append(" uint start_i, \n");
+ source.append(" uint end_i \n");
+ source.append(" ) \n");
+ source.append("{ \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint glb_sz = get_global_size(0); \n");
+
+ source.append(" uint lcl_id = get_local_id(0); \n");
+ source.append(" uint lcl_sz = get_local_size(0); \n");
+
+ source.append(" uint j = glb_id; \n");
+
+ source.append(" __local "); source.append(numeric_string); source.append(" cs_lcl[256]; \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" ss_lcl[256]; \n");
+
+ source.append(" "); source.append(numeric_string); source.append(" x = (j < size) ? matr[(start_i - 1) * stride + j] : 0; \n");
+
+ source.append(" uint elems_num = end_i - start_i; \n");
+ source.append(" uint block_num = (elems_num + lcl_sz - 1) / lcl_sz; \n");
+
+ source.append(" for (uint block_id = 0; block_id < block_num; block_id++) \n");
+ source.append(" { \n");
+ source.append(" uint to = min(elems_num - block_id * lcl_sz, lcl_sz); \n");
+
+ source.append(" if (lcl_id < to) \n");
+ source.append(" { \n");
+ source.append(" cs_lcl[lcl_id] = cs[lcl_id + start_i + block_id * lcl_sz]; \n");
+ source.append(" ss_lcl[lcl_id] = ss[lcl_id + start_i + block_id * lcl_sz]; \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ source.append(" if (j < size) \n");
+ source.append(" { \n");
+ source.append(" for (uint ind = 0; ind < to; ind++) \n");
+ source.append(" { \n");
+ source.append(" uint i = ind + start_i + block_id * lcl_sz; \n");
+
+ source.append(" "); source.append(numeric_string); source.append(" z = matr[i * stride + j]; \n");
+
+ source.append(" "); source.append(numeric_string); source.append(" cs_val = cs_lcl[ind];//cs[i]; \n");
+ source.append(" "); source.append(numeric_string); source.append(" ss_val = ss_lcl[ind];//ss[i]; \n");
+
+ source.append(" matr[(i - 1) * stride + j] = x * cs_val + z * ss_val; \n");
+ source.append(" x = -x * ss_val + z * cs_val; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" } \n");
+ source.append(" if (j < size) \n");
+ source.append(" matr[(end_i - 1) * stride + j] = x; \n");
+ source.append("} \n");
+}
+
+template <typename StringType>
+void generate_svd_house_update_A_left(StringType & source, std::string const & numeric_string, bool is_row_major)
+{
+ source.append("__kernel void house_update_A_left( \n");
+ source.append(" __global "); source.append(numeric_string); source.append("* A, \n");
+ source.append(" __constant "); source.append(numeric_string); source.append("* V, \n"); //householder vector
+ source.append(" uint row_start, \n");
+ source.append(" uint col_start, \n");
+ source.append(" uint size1, \n");
+ source.append(" uint size2, \n");
+ source.append(" uint stride, \n");
+ source.append(" __local "); source.append(numeric_string); source.append("* sums \n");
+ source.append(" ) { \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint glb_sz = get_global_size(0); \n");
+
+ source.append(" uint grp_id = get_group_id(0); \n");
+ source.append(" uint grp_nm = get_num_groups(0); \n");
+
+ source.append(" uint lcl_id = get_local_id(0); \n");
+ source.append(" uint lcl_sz = get_local_size(0); \n");
+
+ source.append(" "); source.append(numeric_string); source.append(" ss = 0; \n");
+
+ // doing it in a slightly different way to avoid cache misses
+ if(is_row_major)
+ {
+ source.append(" for(uint i = glb_id + col_start; i < size2; i += glb_sz) { \n");
+ source.append(" ss = 0; \n");
+ source.append(" for(uint j = row_start; j < size1; j++) ss = ss + (V[j] * A[j * stride + i]); \n");
+
+ source.append(" for(uint j = row_start; j < size1; j++) \n");
+ source.append(" A[j * stride + i] = A[j * stride + i] - (2 * V[j] * ss); \n");
+ source.append(" } \n");
+ }
+ else
+ {
+ source.append(" for(uint i = glb_id + col_start; i < size2; i += glb_sz) { \n");
+ source.append(" ss = 0; \n");
+ source.append(" for(uint j = row_start; j < size1; j++) ss = ss + (V[j] * A[j + i * stride]); \n");
+
+ source.append(" for(uint j = row_start; j < size1; j++) \n");
+ source.append(" A[j + i * stride] = A[j + i * stride] - (2 * V[j] * ss); \n");
+ source.append(" } \n");
+ }
+ source.append("} \n");
+}
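
Per column a of A (restricted to rows >= row_start), both branches above compute ss = v . a and then update a <- a - 2*ss*v, i.e. they apply the Householder reflector I - 2 v v^T from the left. A tiny host check with a made-up unit-length v (illustration only; v is assumed normalized, as the reflector requires):

    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<double> v = {0.6, 0.8};        // unit-length Householder vector (assumed)
      std::vector<double> a = {1.0, 2.0};        // one column of A
      double ss = 0;
      for (std::size_t j = 0; j < v.size(); ++j) // ss = v . a
        ss += v[j] * a[j];
      for (std::size_t j = 0; j < v.size(); ++j) // a <- a - 2 * ss * v
        a[j] -= 2.0 * v[j] * ss;
      std::printf("a = (%g, %g)\n", a[0], a[1]); // a = (-1.64, -1.52); the norm of a is unchanged
    }
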
+
+template <typename StringType>
+void generate_svd_house_update_A_right(StringType & source, std::string const & numeric_string, bool is_row_major)
+{
+
+ source.append("__kernel void house_update_A_right( \n");
+ source.append(" __global "); source.append(numeric_string); source.append("* A, \n");
+ source.append(" __global "); source.append(numeric_string); source.append("* V, \n"); // householder vector
+ source.append(" uint row_start, \n");
+ source.append(" uint col_start, \n");
+ source.append(" uint size1, \n");
+ source.append(" uint size2, \n");
+ source.append(" uint stride, \n");
+ source.append(" __local "); source.append(numeric_string); source.append("* sums \n");
+ source.append(" ) { \n");
+
+ source.append(" uint glb_id = get_global_id(0); \n");
+
+ source.append(" uint grp_id = get_group_id(0); \n");
+ source.append(" uint grp_nm = get_num_groups(0); \n");
+
+ source.append(" uint lcl_id = get_local_id(0); \n");
+ source.append(" uint lcl_sz = get_local_size(0); \n");
+
+ source.append(" "); source.append(numeric_string); source.append(" ss = 0; \n");
+
+ // update of A matrix
+ if(is_row_major)
+ {
+ source.append(" for(uint i = grp_id + row_start; i < size1; i += grp_nm) { \n");
+ source.append(" ss = 0; \n");
+
+ source.append(" for(uint j = lcl_id; j < size2; j += lcl_sz) ss = ss + (V[j] * A[i * stride + j]); \n");
+ source.append(" sums[lcl_id] = ss; \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" col_reduce_lcl_array(sums, lcl_id, lcl_sz); \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ source.append(" "); source.append(numeric_string); source.append(" sum_Av = sums[0]; \n");
+
+ source.append(" for(uint j = lcl_id; j < size2; j += lcl_sz) \n");
+ source.append(" A[i * stride + j] = A[i * stride + j] - (2 * V[j] * sum_Av); \n");
+ source.append(" } \n");
+ }
+ else
+ {
+ source.append(" for(uint i = grp_id + row_start; i < size1; i += grp_nm) { \n");
+ source.append(" ss = 0; \n");
+
+ source.append(" for(uint j = lcl_id; j < size2; j += lcl_sz) ss = ss + (V[j] * A[i + j * stride]); \n");
+ source.append(" sums[lcl_id] = ss; \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" col_reduce_lcl_array(sums, lcl_id, lcl_sz); \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ source.append(" "); source.append(numeric_string); source.append(" sum_Av = sums[0]; \n");
+
+ source.append(" for(uint j = lcl_id; j < size2; j += lcl_sz) \n");
+ source.append(" A[i + j * stride] = A[i + j * stride] - (2 * V[j] * sum_Av); \n");
+ source.append(" } \n");
+ }
+
+ source.append("} \n");
+
+}
+
+template <typename StringType>
+void generate_svd_house_update_QL(StringType & source, std::string const & numeric_string, bool is_row_major)
+{
+ source.append("__kernel void house_update_QL(\n");
+ source.append(" __global "); source.append(numeric_string); source.append("* QL, \n");
+ source.append(" __constant "); source.append(numeric_string); source.append("* V, \n"); //householder vector
+ source.append(" uint size1, \n");
+ source.append(" uint strideQ, \n");
+ source.append(" __local "); source.append(numeric_string); source.append("* sums \n");
+ source.append(" ) { \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint glb_sz = get_global_size(0); \n");
+
+ source.append(" uint grp_id = get_group_id(0); \n");
+ source.append(" uint grp_nm = get_num_groups(0); \n");
+
+ source.append(" uint lcl_id = get_local_id(0); \n");
+ source.append(" uint lcl_sz = get_local_size(0); \n");
+
+ source.append(" "); source.append(numeric_string); source.append(" ss = 0; \n");
+
+ if(is_row_major)
+ {
+ source.append(" for(uint i = grp_id; i < size1; i += grp_nm) { \n");
+ source.append(" ss = 0; \n");
+ source.append(" for(uint j = lcl_id; j < size1; j += lcl_sz) ss = ss + (V[j] * QL[i * strideQ + j]); \n");
+ source.append(" sums[lcl_id] = ss; \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" col_reduce_lcl_array(sums, lcl_id, lcl_sz); \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ source.append(" "); source.append(numeric_string); source.append(" sum_Qv = sums[0]; \n");
+
+ source.append(" for(uint j = lcl_id; j < size1; j += lcl_sz) \n");
+ source.append(" QL[i * strideQ + j] = QL[i * strideQ + j] - (2 * V[j] * sum_Qv); \n");
+ source.append(" } \n");
+ }
+ else
+ {
+ source.append(" for(uint i = grp_id; i < size1; i += grp_nm) { \n");
+ source.append(" ss = 0; \n");
+ source.append(" for(uint j = lcl_id; j < size1; j += lcl_sz) ss = ss + (V[j] * QL[i + j * strideQ]); \n");
+ source.append(" sums[lcl_id] = ss; \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" col_reduce_lcl_array(sums, lcl_id, lcl_sz); \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ source.append(" "); source.append(numeric_string); source.append(" sum_Qv = sums[0]; \n");
+
+ source.append(" for(uint j = lcl_id; j < size1; j += lcl_sz) \n");
+ source.append(" QL[i + j * strideQ] = QL[i + j * strideQ] - (2 * V[j] * sum_Qv); \n");
+ source.append(" } \n");
+ }
+ source.append("} \n");
+
+}
+
+template<typename StringT>
+void generate_svd_house_update_QR(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void house_update_QR( \n");
+ source.append(" __global "); source.append(numeric_string); source.append("* QR, \n");
+ source.append(" __global "); source.append(numeric_string); source.append("* V, \n"); // householder vector
+ source.append(" uint size1, \n");
+ source.append(" uint size2, \n");
+ source.append(" uint strideQ, \n");
+ source.append(" __local "); source.append(numeric_string); source.append("* sums \n");
+ source.append(" ) { \n");
+
+ source.append(" uint glb_id = get_global_id(0); \n");
+
+ source.append(" uint grp_id = get_group_id(0); \n");
+ source.append(" uint grp_nm = get_num_groups(0); \n");
+
+ source.append(" uint lcl_id = get_local_id(0); \n");
+ source.append(" uint lcl_sz = get_local_size(0); \n");
+
+ source.append(" "); source.append(numeric_string); source.append(" ss = 0; \n");
+
+ // update of QR matrix
+ // Actually, we are calculating a transpose of the right matrix. This allows us to avoid cache
+ // misses.
+ source.append(" for (uint i = grp_id; i < size2; i += grp_nm) { \n");
+ source.append(" ss = 0; \n");
+ source.append(" for (uint j = lcl_id; j < size2; j += lcl_sz) ss = ss + (V[j] * QR[i * strideQ + j]); \n");
+ source.append(" sums[lcl_id] = ss; \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" col_reduce_lcl_array(sums, lcl_id, lcl_sz); \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ source.append(" "); source.append(numeric_string); source.append(" sum_Qv = sums[0]; \n");
+ source.append(" for (uint j = lcl_id; j < size2; j += lcl_sz) \n");
+ source.append(" QR[i * strideQ + j] = QR[i * strideQ + j] - (2 * V[j] * sum_Qv); \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+template<typename StringT>
+void generate_svd_inverse_signs(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void inverse_signs(__global "); source.append(numeric_string); source.append("* v, \n");
+ source.append(" __global "); source.append(numeric_string); source.append("* signs, \n");
+ source.append(" uint size, \n");
+ source.append(" uint stride \n");
+ source.append(" ) \n");
+ source.append("{ \n");
+ source.append(" uint glb_id_x = get_global_id(0); \n");
+ source.append(" uint glb_id_y = get_global_id(1); \n");
+
+ source.append(" if ((glb_id_x < size) && (glb_id_y < size)) \n");
+ source.append(" v[glb_id_x * stride + glb_id_y] *= signs[glb_id_x]; \n");
+ source.append("} \n");
+
+}
+
+template<typename StringT>
+void generate_svd_transpose_inplace(StringT & source, std::string const & numeric_string)
+{
+
+ source.append("__kernel void transpose_inplace(__global "); source.append(numeric_string); source.append("* input, \n");
+ source.append(" unsigned int row_num, \n");
+ source.append(" unsigned int col_num) { \n");
+ source.append(" unsigned int size = row_num * col_num; \n");
+ source.append(" for (unsigned int i = get_global_id(0); i < size; i+= get_global_size(0)) { \n");
+ source.append(" unsigned int row = i / col_num; \n");
+ source.append(" unsigned int col = i - row*col_num; \n");
+
+ source.append(" unsigned int new_pos = col * row_num + row; \n");
+
+ //new_pos = (col < row) ? 0 : 1;
+ //input[i] = new_pos;
+
+ source.append(" if (i < new_pos) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" val = input[i]; \n");
+ source.append(" input[i] = input[new_pos]; \n");
+ source.append(" input[new_pos] = val; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+}
+
+template<typename StringT>
+void generate_svd_update_qr_column(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void update_qr_column(__global "); source.append(numeric_string); source.append("* A, \n");
+ source.append(" uint stride, \n");
+ source.append(" __global "); source.append(numeric_string); source.append("* buf, \n");
+ source.append(" int m, \n");
+ source.append(" int n, \n");
+ source.append(" int last_n) \n");
+ source.append("{ \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint glb_sz = get_global_size(0); \n");
+
+ source.append(" for (int i = glb_id; i < last_n; i += glb_sz) \n");
+ source.append(" { \n");
+ source.append(" "); source.append(numeric_string); source.append(" a_ik = A[m * stride + i], a_ik_1, a_ik_2; \n");
+
+ source.append(" a_ik_1 = A[(m + 1) * stride + i]; \n");
+
+ source.append(" for (int k = m; k < n; k++) \n");
+ source.append(" { \n");
+ source.append(" bool notlast = (k != n - 1); \n");
+
+ source.append(" "); source.append(numeric_string); source.append(" p = buf[5 * k] * a_ik + buf[5 * k + 1] * a_ik_1; \n");
+
+ source.append(" if (notlast) \n");
+ source.append(" { \n");
+ source.append(" a_ik_2 = A[(k + 2) * stride + i]; \n");
+ source.append(" p = p + buf[5 * k + 2] * a_ik_2; \n");
+ source.append(" a_ik_2 = a_ik_2 - p * buf[5 * k + 4]; \n");
+ source.append(" } \n");
+
+ source.append(" A[k * stride + i] = a_ik - p; \n");
+ source.append(" a_ik_1 = a_ik_1 - p * buf[5 * k + 3]; \n");
+
+ source.append(" a_ik = a_ik_1; \n");
+ source.append(" a_ik_1 = a_ik_2; \n");
+ source.append(" } \n");
+
+ source.append(" A[n * stride + i] = a_ik; \n");
+ source.append(" } \n");
+
+ source.append("} \n");
+}
+
+
+
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for singular value decomposition of dense matrices. */
+template<typename NumericT, typename MatrixLayout = row_major>
+struct svd
+{
+ static std::string program_name()
+ {
+ bool is_row = viennacl::is_row_major<MatrixLayout>::value;
+ return (viennacl::ocl::type_to_string<NumericT>::apply() + "_svd_") + (is_row ? "row" : "col");
+ }
+
+ static void init(viennacl::ocl::context & ctx)
+ {
+ static std::map<cl_context, bool> init_done;
+ if (!init_done[ctx.handle().get()])
+ {
+ viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+ std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+ bool is_row_major = viennacl::is_row_major<MatrixLayout>::value;
+
+ std::string source;
+ source.reserve(1024);
+
+ viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+ // only generate for floating points (forces error for integers)
+ if (numeric_string == "float" || numeric_string == "double")
+ {
+ //helper function used by multiple kernels:
+ generate_svd_col_reduce_lcl_array(source, numeric_string);
+
+ //kernels:
+ generate_svd_bidiag_pack(source, numeric_string, is_row_major);
+ generate_svd_copy_col(source, numeric_string, is_row_major);
+ generate_svd_copy_row(source, numeric_string, is_row_major);
+ generate_svd_final_iter_update(source, numeric_string);
+ generate_svd_givens_next(source, numeric_string, is_row_major);
+ generate_svd_givens_prev(source, numeric_string);
+ generate_svd_house_update_A_left(source, numeric_string, is_row_major);
+ generate_svd_house_update_A_right(source, numeric_string, is_row_major);
+ generate_svd_house_update_QL(source, numeric_string, is_row_major);
+ generate_svd_house_update_QR(source, numeric_string);
+ generate_svd_inverse_signs(source, numeric_string);
+ generate_svd_transpose_inplace(source, numeric_string);
+ generate_svd_update_qr_column(source, numeric_string);
+ }
+
+ std::string prog_name = program_name();
+ #ifdef VIENNACL_BUILD_INFO
+ std::cout << "Creating program " << prog_name << std::endl;
+ #endif
+ ctx.add_program(source, prog_name);
+ init_done[ctx.handle().get()] = true;
+ } //if
+ } //init
+};
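
Elsewhere in ViennaCL such kernel classes are used by calling init() once per context and then fetching individual kernels by program and kernel name; the name constants come from qr-method-common.hpp. A sketch of that pattern (requires an OpenCL-enabled build; the exact call site is an assumption, not part of this file):

    #ifdef VIENNACL_WITH_OPENCL
    #include "viennacl/ocl/backend.hpp"
    #include "viennacl/linalg/qr-method-common.hpp"
    #include "viennacl/linalg/opencl/kernels/svd.hpp"

    void prepare_svd_kernels()
    {
      viennacl::ocl::context & ctx = viennacl::ocl::current_context();
      typedef viennacl::linalg::opencl::kernels::svd<float, viennacl::row_major> KernelClass;
      KernelClass::init(ctx);   // compiles the program on first use; cached per context afterwards
      viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(),
                                                 viennacl::linalg::SVD_HOUSEHOLDER_UPDATE_A_LEFT_KERNEL);
      (void)k;                  // the kernel would then be enqueued with its arguments
    }
    #endif
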
+
+} // namespace kernels
+} // namespace opencl
+} // namespace linalg
+} // namespace viennacl
+#endif
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/vector.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/vector.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/vector.hpp
new file mode 100644
index 0000000..b6a2b7d
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/vector.hpp
@@ -0,0 +1,867 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_VECTOR_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_VECTOR_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+
+#include "viennacl/scheduler/forwards.h"
+#include "viennacl/scheduler/io.hpp"
+#include "viennacl/scheduler/preset.hpp"
+
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+
+
+/** @file viennacl/linalg/opencl/kernels/vector.hpp
+ * @brief OpenCL kernel file for vector operations */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+//////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+/** @brief Enumeration for the scalar type in avbv-like operations */
+enum avbv_scalar_type
+{
+ VIENNACL_AVBV_NONE = 0, // vector does not exist/contribute
+ VIENNACL_AVBV_CPU,
+ VIENNACL_AVBV_GPU
+};
+
+/** @brief Configuration struct for generating OpenCL kernels for linear combinations of vectors */
+struct avbv_config
+{
+ avbv_config() : with_stride_and_range(true), a(VIENNACL_AVBV_CPU), b(VIENNACL_AVBV_NONE) {}
+
+ bool with_stride_and_range;
+ std::string assign_op;
+ avbv_scalar_type a;
+ avbv_scalar_type b;
+};
+
+// just returns the for-loop
+template <typename StringType>
+void generate_avbv_impl2(StringType & source, std::string const & /*numeric_string*/, avbv_config const & cfg, bool mult_alpha, bool mult_beta)
+{
+ source.append(" for (unsigned int i = get_global_id(0); i < size1.z; i += get_global_size(0)) \n");
+ if (cfg.with_stride_and_range)
+ {
+ source.append(" vec1[i*size1.y+size1.x] "); source.append(cfg.assign_op); source.append(" vec2[i*size2.y+size2.x] ");
+ if (mult_alpha)
+ source.append("* alpha ");
+ else
+ source.append("/ alpha ");
+ if (cfg.b != VIENNACL_AVBV_NONE)
+ {
+ source.append("+ vec3[i*size3.y+size3.x] ");
+ if (mult_beta)
+ source.append("* beta");
+ else
+ source.append("/ beta");
+ }
+ }
+ else
+ {
+ source.append(" vec1[i] "); source.append(cfg.assign_op); source.append(" vec2[i] ");
+ if (mult_alpha)
+ source.append("* alpha ");
+ else
+ source.append("/ alpha ");
+ if (cfg.b != VIENNACL_AVBV_NONE)
+ {
+ source.append("+ vec3[i] ");
+ if (mult_beta)
+ source.append("* beta");
+ else
+ source.append("/ beta");
+ }
+ }
+ source.append("; \n");
+}
+
+template <typename StringType>
+void generate_avbv_impl(StringType & source, std::string const & numeric_string, avbv_config const & cfg)
+{
+ source.append("__kernel void av");
+ if (cfg.b != VIENNACL_AVBV_NONE)
+ source.append("bv");
+ if (cfg.assign_op != "=")
+ source.append("_v");
+
+ if (cfg.a == VIENNACL_AVBV_CPU)
+ source.append("_cpu");
+ else if (cfg.a == VIENNACL_AVBV_GPU)
+ source.append("_gpu");
+
+ if (cfg.b == VIENNACL_AVBV_CPU)
+ source.append("_cpu");
+ else if (cfg.b == VIENNACL_AVBV_GPU)
+ source.append("_gpu");
+ source.append("( \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * vec1, \n");
+ source.append(" uint4 size1, \n");
+ source.append(" \n");
+ if (cfg.a == VIENNACL_AVBV_CPU)
+ {
+ source.append(" "); source.append(numeric_string); source.append(" fac2, \n");
+ }
+ else if (cfg.a == VIENNACL_AVBV_GPU)
+ {
+ source.append(" __global "); source.append(numeric_string); source.append(" * fac2, \n");
+ }
+ source.append(" unsigned int options2, \n"); // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
+ source.append(" __global const "); source.append(numeric_string); source.append(" * vec2, \n");
+ source.append(" uint4 size2");
+
+ if (cfg.b != VIENNACL_AVBV_NONE)
+ {
+ source.append(", \n\n");
+ if (cfg.b == VIENNACL_AVBV_CPU)
+ {
+ source.append(" "); source.append(numeric_string); source.append(" fac3, \n");
+ }
+ else if (cfg.b == VIENNACL_AVBV_GPU)
+ {
+ source.append(" __global "); source.append(numeric_string); source.append(" * fac3, \n");
+ }
+ source.append(" unsigned int options3, \n"); // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
+ source.append(" __global const "); source.append(numeric_string); source.append(" * vec3, \n");
+ source.append(" uint4 size3 \n");
+ }
+ source.append(") { \n");
+
+ if (cfg.a == VIENNACL_AVBV_CPU)
+ {
+ source.append(" "); source.append(numeric_string); source.append(" alpha = fac2; \n");
+ }
+ else if (cfg.a == VIENNACL_AVBV_GPU)
+ {
+ source.append(" "); source.append(numeric_string); source.append(" alpha = fac2[0]; \n");
+ }
+ source.append(" if (options2 & (1 << 0)) \n");
+ source.append(" alpha = -alpha; \n");
+ source.append(" \n");
+
+ if (cfg.b == VIENNACL_AVBV_CPU)
+ {
+ source.append(" "); source.append(numeric_string); source.append(" beta = fac3; \n");
+ }
+ else if (cfg.b == VIENNACL_AVBV_GPU)
+ {
+ source.append(" "); source.append(numeric_string); source.append(" beta = fac3[0]; \n");
+ }
+ if (cfg.b != VIENNACL_AVBV_NONE)
+ {
+ source.append(" if (options3 & (1 << 0)) \n");
+ source.append(" beta = -beta; \n");
+ source.append(" \n");
+ }
+ source.append(" if (options2 & (1 << 1)) { \n");
+ if (cfg.b != VIENNACL_AVBV_NONE)
+ {
+ source.append(" if (options3 & (1 << 1)) {\n");
+ generate_avbv_impl2(source, numeric_string, cfg, false, false);
+ source.append(" } else {\n");
+ generate_avbv_impl2(source, numeric_string, cfg, false, true);
+ source.append(" } \n");
+ }
+ else
+ generate_avbv_impl2(source, numeric_string, cfg, false, true);
+ source.append(" } else { \n");
+ if (cfg.b != VIENNACL_AVBV_NONE)
+ {
+ source.append(" if (options3 & (1 << 1)) {\n");
+ generate_avbv_impl2(source, numeric_string, cfg, true, false);
+ source.append(" } else {\n");
+ generate_avbv_impl2(source, numeric_string, cfg, true, true);
+ source.append(" } \n");
+ }
+ else
+ generate_avbv_impl2(source, numeric_string, cfg, true, true);
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+template <typename StringType>
+void generate_avbv(StringType & source, std::string const & numeric_string)
+{
+ avbv_config cfg;
+ cfg.assign_op = "=";
+ cfg.with_stride_and_range = true;
+
+ // av
+ cfg.b = VIENNACL_AVBV_NONE; cfg.a = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg);
+ cfg.b = VIENNACL_AVBV_NONE; cfg.a = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg);
+
+ // avbv
+ cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg);
+ cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg);
+ cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg);
+ cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg);
+
+ // avbv
+ cfg.assign_op = "+=";
+
+ cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg);
+ cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg);
+ cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg);
+ cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg);
+}
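
generate_avbv() above reuses the configuration struct so that one call emits a whole family of kernels whose names encode the operation: "bv" marks the presence of a second scaled vector, "_v" marks the "+=" variant, and "_cpu"/"_gpu" say whether each scaling factor is passed by value or through a device buffer. Spelled out, the kernel names generated for one numeric type are:

    // Kernel names emitted by generate_avbv() for a given numeric type:
    static const char * avbv_kernel_names[] = {
      "av_cpu",         "av_gpu",          // vec1  = alpha * vec2
      "avbv_cpu_cpu",   "avbv_cpu_gpu",    // vec1  = alpha * vec2 + beta * vec3
      "avbv_gpu_cpu",   "avbv_gpu_gpu",
      "avbv_v_cpu_cpu", "avbv_v_cpu_gpu",  // vec1 += alpha * vec2 + beta * vec3
      "avbv_v_gpu_cpu", "avbv_v_gpu_gpu"
    };
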
+
+template <typename StringType>
+void generate_plane_rotation(StringType & source, std::string const & numeric_string)
+{
+ source.append("__kernel void plane_rotation( \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * vec1, \n");
+ source.append(" unsigned int start1, \n");
+ source.append(" unsigned int inc1, \n");
+ source.append(" unsigned int size1, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * vec2, \n");
+ source.append(" unsigned int start2, \n");
+ source.append(" unsigned int inc2, \n");
+ source.append(" unsigned int size2, \n");
+ source.append(" "); source.append(numeric_string); source.append(" alpha, \n");
+ source.append(" "); source.append(numeric_string); source.append(" beta) \n");
+ source.append("{ \n");
+ source.append(" "); source.append(numeric_string); source.append(" tmp1 = 0; \n");
+ source.append(" "); source.append(numeric_string); source.append(" tmp2 = 0; \n");
+ source.append(" \n");
+ source.append(" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n");
+ source.append(" { \n");
+ source.append(" tmp1 = vec1[i*inc1+start1]; \n");
+ source.append(" tmp2 = vec2[i*inc2+start2]; \n");
+ source.append(" \n");
+ source.append(" vec1[i*inc1+start1] = alpha * tmp1 + beta * tmp2; \n");
+ source.append(" vec2[i*inc2+start2] = alpha * tmp2 - beta * tmp1; \n");
+ source.append(" } \n");
+ source.append(" \n");
+ source.append("} \n");
+}
+
+template <typename StringType>
+void generate_vector_swap(StringType & source, std::string const & numeric_string)
+{
+ source.append("__kernel void swap( \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * vec1, \n");
+ source.append(" unsigned int start1, \n");
+ source.append(" unsigned int inc1, \n");
+ source.append(" unsigned int size1, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * vec2, \n");
+ source.append(" unsigned int start2, \n");
+ source.append(" unsigned int inc2, \n");
+ source.append(" unsigned int size2 \n");
+ source.append(" ) \n");
+ source.append("{ \n");
+ source.append(" "); source.append(numeric_string); source.append(" tmp; \n");
+ source.append(" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n");
+ source.append(" { \n");
+ source.append(" tmp = vec2[i*inc2+start2]; \n");
+ source.append(" vec2[i*inc2+start2] = vec1[i*inc1+start1]; \n");
+ source.append(" vec1[i*inc1+start1] = tmp; \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+template <typename StringType>
+void generate_assign_cpu(StringType & source, std::string const & numeric_string)
+{
+ source.append("__kernel void assign_cpu( \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * vec1, \n");
+ source.append(" unsigned int start1, \n");
+ source.append(" unsigned int inc1, \n");
+ source.append(" unsigned int size1, \n");
+ source.append(" unsigned int internal_size1, \n");
+ source.append(" "); source.append(numeric_string); source.append(" alpha) \n");
+ source.append("{ \n");
+ source.append(" for (unsigned int i = get_global_id(0); i < internal_size1; i += get_global_size(0)) \n");
+ source.append(" vec1[i*inc1+start1] = (i < size1) ? alpha : 0; \n");
+ source.append("} \n");
+
+}
+
+template <typename StringType>
+void generate_inner_prod(StringType & source, std::string const & numeric_string, vcl_size_t vector_num)
+{
+ std::stringstream ss;
+ ss << vector_num;
+ std::string vector_num_string = ss.str();
+
+ source.append("__kernel void inner_prod"); source.append(vector_num_string); source.append("( \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * x, \n");
+ source.append(" uint4 params_x, \n");
+ for (vcl_size_t i=0; i<vector_num; ++i)
+ {
+ ss.str("");
+ ss << i;
+ source.append(" __global const "); source.append(numeric_string); source.append(" * y"); source.append(ss.str()); source.append(", \n");
+ source.append(" uint4 params_y"); source.append(ss.str()); source.append(", \n");
+ }
+ source.append(" __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * group_buffer) \n");
+ source.append("{ \n");
+ source.append(" unsigned int entries_per_thread = (params_x.z - 1) / get_global_size(0) + 1; \n");
+ source.append(" unsigned int vec_start_index = get_group_id(0) * get_local_size(0) * entries_per_thread; \n");
+ source.append(" unsigned int vec_stop_index = min((unsigned int)((get_group_id(0) + 1) * get_local_size(0) * entries_per_thread), params_x.z); \n");
+
+ // compute partial results within group:
+ for (vcl_size_t i=0; i<vector_num; ++i)
+ {
+ ss.str("");
+ ss << i;
+ source.append(" "); source.append(numeric_string); source.append(" tmp"); source.append(ss.str()); source.append(" = 0; \n");
+ }
+ source.append(" for (unsigned int i = vec_start_index + get_local_id(0); i < vec_stop_index; i += get_local_size(0)) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" val_x = x[i*params_x.y + params_x.x]; \n");
+ for (vcl_size_t i=0; i<vector_num; ++i)
+ {
+ ss.str("");
+ ss << i;
+ source.append(" tmp"); source.append(ss.str()); source.append(" += val_x * y"); source.append(ss.str()); source.append("[i * params_y"); source.append(ss.str()); source.append(".y + params_y"); source.append(ss.str()); source.append(".x]; \n");
+ }
+ source.append(" } \n");
+ for (vcl_size_t i=0; i<vector_num; ++i)
+ {
+ ss.str("");
+ ss << i;
+ source.append(" tmp_buffer[get_local_id(0) + "); source.append(ss.str()); source.append(" * get_local_size(0)] = tmp"); source.append(ss.str()); source.append("; \n");
+ }
+
+ // now run reduction:
+ source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
+ source.append(" { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) { \n");
+ for (vcl_size_t i=0; i<vector_num; ++i)
+ {
+ ss.str("");
+ ss << i;
+ source.append(" tmp_buffer[get_local_id(0) + "); source.append(ss.str()); source.append(" * get_local_size(0)] += tmp_buffer[get_local_id(0) + "); source.append(ss.str()); source.append(" * get_local_size(0) + stride]; \n");
+ }
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ source.append(" if (get_local_id(0) == 0) { \n");
+ for (vcl_size_t i=0; i<vector_num; ++i)
+ {
+ ss.str("");
+ ss << i;
+ source.append(" group_buffer[get_group_id(0) + "); source.append(ss.str()); source.append(" * get_num_groups(0)] = tmp_buffer["); source.append(ss.str()); source.append(" * get_local_size(0)]; \n");
+ }
+ source.append(" } \n");
+ source.append("} \n");
+
+}
+
+template <typename StringType>
+void generate_norm(StringType & source, std::string const & numeric_string)
+{
+ bool is_float_or_double = (numeric_string == "float" || numeric_string == "double");
+
+ source.append(numeric_string); source.append(" impl_norm( \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * vec, \n");
+ source.append(" unsigned int start1, \n");
+ source.append(" unsigned int inc1, \n");
+ source.append(" unsigned int size1, \n");
+ source.append(" unsigned int norm_selector, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * tmp_buffer) \n");
+ source.append("{ \n");
+ source.append(" "); source.append(numeric_string); source.append(" tmp = 0; \n");
+ source.append(" if (norm_selector == 1) \n"); //norm_1
+ source.append(" { \n");
+ source.append(" for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0)) \n");
+ if (is_float_or_double)
+ source.append(" tmp += fabs(vec[i*inc1 + start1]); \n");
+ else if (numeric_string[0] == 'u') // abs may not be defined for unsigned types
+ source.append(" tmp += vec[i*inc1 + start1]; \n");
+ else
+ source.append(" tmp += abs(vec[i*inc1 + start1]); \n");
+ source.append(" } \n");
+ source.append(" else if (norm_selector == 2) \n"); //norm_2
+ source.append(" { \n");
+ source.append(" "); source.append(numeric_string); source.append(" vec_entry = 0; \n");
+ source.append(" for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0)) \n");
+ source.append(" { \n");
+ source.append(" vec_entry = vec[i*inc1 + start1]; \n");
+ source.append(" tmp += vec_entry * vec_entry; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append(" else if (norm_selector == 0) \n"); //norm_inf
+ source.append(" { \n");
+ source.append(" for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0)) \n");
+ if (is_float_or_double)
+ source.append(" tmp = fmax(fabs(vec[i*inc1 + start1]), tmp); \n");
+ else if (numeric_string[0] == 'u') // abs may not be defined for unsigned types
+ source.append(" tmp = max(vec[i*inc1 + start1], tmp); \n");
+ else
+ {
+ source.append(" tmp = max(("); source.append(numeric_string); source.append(")abs(vec[i*inc1 + start1]), tmp); \n");
+ }
+ source.append(" } \n");
+
+ source.append(" tmp_buffer[get_local_id(0)] = tmp; \n");
+
+ source.append(" if (norm_selector > 0) \n"); //norm_1 or norm_2:
+ source.append(" { \n");
+ source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
+ source.append(" { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) \n");
+ source.append(" tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride]; \n");
+ source.append(" } \n");
+ source.append(" return tmp_buffer[0]; \n");
+ source.append(" } \n");
+
+ //norm_inf:
+ source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
+ source.append(" { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) \n");
+ if (is_float_or_double)
+ source.append(" tmp_buffer[get_local_id(0)] = fmax(tmp_buffer[get_local_id(0)], tmp_buffer[get_local_id(0)+stride]); \n");
+ else
+ source.append(" tmp_buffer[get_local_id(0)] = max(tmp_buffer[get_local_id(0)], tmp_buffer[get_local_id(0)+stride]); \n");
+ source.append(" } \n");
+
+ source.append(" return tmp_buffer[0]; \n");
+ source.append("}; \n");
+
+ source.append("__kernel void norm( \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * vec, \n");
+ source.append(" unsigned int start1, \n");
+ source.append(" unsigned int inc1, \n");
+ source.append(" unsigned int size1, \n");
+ source.append(" unsigned int norm_selector, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * group_buffer) \n");
+ source.append("{ \n");
+ source.append(" "); source.append(numeric_string); source.append(" tmp = impl_norm(vec, \n");
+ source.append(" ( get_group_id(0) * size1) / get_num_groups(0) * inc1 + start1, \n");
+ source.append(" inc1, \n");
+ source.append(" ( (1 + get_group_id(0)) * size1) / get_num_groups(0) \n");
+ source.append(" - ( get_group_id(0) * size1) / get_num_groups(0), \n");
+ source.append(" norm_selector, \n");
+ source.append(" tmp_buffer); \n");
+
+ source.append(" if (get_local_id(0) == 0) \n");
+ source.append(" group_buffer[get_group_id(0)] = tmp; \n");
+ source.append("} \n");
+
+}
+
+template <typename StringType>
+void generate_inner_prod_sum(StringType & source, std::string const & numeric_string)
+{
+ // sums consecutive chunks of 'vec1' (holding partial inner-product results) and writes to 'result': each work group reduces a chunk of 'size_per_workgroup' entries and stores its sum at its group index in 'result'.
+ source.append("__kernel void sum_inner_prod( \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * vec1, \n");
+ source.append(" unsigned int size_per_workgroup, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
+ source.append(" unsigned int start_result, \n");
+ source.append(" unsigned int inc_result) \n");
+ source.append("{ \n");
+ source.append(" "); source.append(numeric_string); source.append(" thread_sum = 0; \n");
+ source.append(" for (unsigned int i = get_local_id(0); i<size_per_workgroup; i += get_local_size(0)) \n");
+ source.append(" thread_sum += vec1[size_per_workgroup * get_group_id(0) + i]; \n");
+
+ source.append(" tmp_buffer[get_local_id(0)] = thread_sum; \n");
+
+ source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
+ source.append(" { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) \n");
+ source.append(" tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0) + stride]; \n");
+ source.append(" } \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ source.append(" if (get_local_id(0) == 0) \n");
+ source.append(" result[start_result + inc_result * get_group_id(0)] = tmp_buffer[0]; \n");
+ source.append("} \n");
+
+}
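Taken together, the inner_prod kernels above and sum_inner_prod implement a two-stage reduction: each work group first accumulates a partial dot product into group_buffer, and a second kernel launch collapses those per-group partials into the final scalar. A minimal host-side C++ sketch of the same scheme, assuming num_groups >= 1 (plain CPU code that only illustrates the algorithm; the function name and chunking are hypothetical):

#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

// Stage 1 mirrors the per-work-group partial sums written to 'group_buffer';
// stage 2 mirrors the final reduction performed by 'sum_inner_prod'.
double two_stage_inner_prod(std::vector<double> const & x, std::vector<double> const & y, std::size_t num_groups)
{
  std::vector<double> group_buffer(num_groups, 0.0);
  std::size_t chunk = (x.size() + num_groups - 1) / num_groups;
  for (std::size_t g = 0; g < num_groups; ++g)
    for (std::size_t i = g * chunk; i < std::min(x.size(), (g + 1) * chunk); ++i)
      group_buffer[g] += x[i] * y[i];                                      // stage 1
  return std::accumulate(group_buffer.begin(), group_buffer.end(), 0.0);   // stage 2
}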
+
+template <typename StringType>
+void generate_sum(StringType & source, std::string const & numeric_string)
+{
+ // sums the array 'vec1' and writes to result. Makes use of a single work-group only.
+ source.append("__kernel void sum( \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * vec1, \n");
+ source.append(" unsigned int start1, \n");
+ source.append(" unsigned int inc1, \n");
+ source.append(" unsigned int size1, \n");
+ source.append(" unsigned int option, \n"); //0: use fmax, 1: just sum, 2: sum and return sqrt of sum
+ source.append(" __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * result) \n");
+ source.append("{ \n");
+ source.append(" "); source.append(numeric_string); source.append(" thread_sum = 0; \n");
+ source.append(" "); source.append(numeric_string); source.append(" tmp = 0; \n");
+ source.append(" for (unsigned int i = get_local_id(0); i<size1; i += get_local_size(0)) \n");
+ source.append(" { \n");
+ source.append(" if (option > 0) \n");
+ source.append(" thread_sum += vec1[i*inc1+start1]; \n");
+ source.append(" else \n");
+ source.append(" { \n");
+ source.append(" tmp = vec1[i*inc1+start1]; \n");
+ source.append(" tmp = (tmp < 0) ? -tmp : tmp; \n");
+ source.append(" thread_sum = (thread_sum > tmp) ? thread_sum : tmp; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ source.append(" tmp_buffer[get_local_id(0)] = thread_sum; \n");
+
+ source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
+ source.append(" { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) \n");
+ source.append(" { \n");
+ source.append(" if (option > 0) \n");
+ source.append(" tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0) + stride]; \n");
+ source.append(" else \n");
+ source.append(" tmp_buffer[get_local_id(0)] = (tmp_buffer[get_local_id(0)] > tmp_buffer[get_local_id(0) + stride]) ? tmp_buffer[get_local_id(0)] : tmp_buffer[get_local_id(0) + stride]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ source.append(" if (get_global_id(0) == 0) \n");
+ source.append(" { \n");
+ if (numeric_string == "float" || numeric_string == "double")
+ {
+ source.append(" if (option == 2) \n");
+ source.append(" *result = sqrt(tmp_buffer[0]); \n");
+ source.append(" else \n");
+ }
+ source.append(" *result = tmp_buffer[0]; \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+}
+
+template <typename StringType>
+void generate_index_norm_inf(StringType & source, std::string const & numeric_string)
+{
+ //index_norm_inf:
+ source.append("unsigned int index_norm_inf_impl( \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * vec, \n");
+ source.append(" unsigned int start1, \n");
+ source.append(" unsigned int inc1, \n");
+ source.append(" unsigned int size1, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * entry_buffer, \n");
+ source.append(" __local unsigned int * index_buffer) \n");
+ source.append("{ \n");
+ //step 1: fill buffer:
+ source.append(" "); source.append(numeric_string); source.append(" cur_max = 0; \n");
+ source.append(" "); source.append(numeric_string); source.append(" tmp; \n");
+ source.append(" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n");
+ source.append(" { \n");
+ if (numeric_string == "float" || numeric_string == "double")
+ source.append(" tmp = fabs(vec[i*inc1+start1]); \n");
+ else if (numeric_string[0] == 'u') // abs may not be defined for unsigned types
+ source.append(" tmp = vec[i*inc1+start1]; \n");
+ else
+ source.append(" tmp = abs(vec[i*inc1+start1]); \n");
+ source.append(" if (cur_max < tmp) \n");
+ source.append(" { \n");
+ source.append(" entry_buffer[get_global_id(0)] = tmp; \n");
+ source.append(" index_buffer[get_global_id(0)] = i; \n");
+ source.append(" cur_max = tmp; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ //step 2: parallel reduction:
+ source.append(" for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2) \n");
+ source.append(" { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_global_id(0) < stride) \n");
+ source.append(" { \n");
+ //find the first occurring index
+ source.append(" if (entry_buffer[get_global_id(0)] < entry_buffer[get_global_id(0)+stride]) \n");
+ source.append(" { \n");
+ source.append(" index_buffer[get_global_id(0)] = index_buffer[get_global_id(0)+stride]; \n");
+ source.append(" entry_buffer[get_global_id(0)] = entry_buffer[get_global_id(0)+stride]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append(" \n");
+ source.append(" return index_buffer[0]; \n");
+ source.append("} \n");
+
+ source.append("__kernel void index_norm_inf( \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * vec, \n");
+ source.append(" unsigned int start1, \n");
+ source.append(" unsigned int inc1, \n");
+ source.append(" unsigned int size1, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * entry_buffer, \n");
+ source.append(" __local unsigned int * index_buffer, \n");
+ source.append(" __global unsigned int * result) \n");
+ source.append("{ \n");
+ source.append(" entry_buffer[get_global_id(0)] = 0; \n");
+ source.append(" index_buffer[get_global_id(0)] = 0; \n");
+ source.append(" unsigned int tmp = index_norm_inf_impl(vec, start1, inc1, size1, entry_buffer, index_buffer); \n");
+ source.append(" if (get_global_id(0) == 0) *result = tmp; \n");
+ source.append("} \n");
+
+}
+
+template <typename StringType>
+void generate_maxmin(StringType & source, std::string const & numeric_string, bool is_max)
+{
+ // computes the maximum (is_max == true) or the minimum of the array 'vec1'; each work group writes its result to result[group_id].
+ if (is_max)
+ source.append("__kernel void max_kernel( \n");
+ else
+ source.append("__kernel void min_kernel( \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * vec1, \n");
+ source.append(" unsigned int start1, \n");
+ source.append(" unsigned int inc1, \n");
+ source.append(" unsigned int size1, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * result) \n");
+ source.append("{ \n");
+ source.append(" "); source.append(numeric_string); source.append(" thread_result = vec1[start1]; \n");
+ source.append(" for (unsigned int i = get_global_id(0); i<size1; i += get_global_size(0)) \n");
+ source.append(" { \n");
+ source.append(" "); source.append(numeric_string); source.append(" tmp = vec1[i*inc1+start1]; \n");
+ if (is_max)
+ source.append(" thread_result = thread_result > tmp ? thread_result : tmp; \n");
+ else
+ source.append(" thread_result = thread_result < tmp ? thread_result : tmp; \n");
+ source.append(" } \n");
+
+ source.append(" tmp_buffer[get_local_id(0)] = thread_result; \n");
+
+ source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
+ source.append(" { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) \n");
+ source.append(" { \n");
+ if (is_max)
+ source.append(" tmp_buffer[get_local_id(0)] = tmp_buffer[get_local_id(0)] > tmp_buffer[get_local_id(0) + stride] ? tmp_buffer[get_local_id(0)] : tmp_buffer[get_local_id(0) + stride]; \n");
+ else
+ source.append(" tmp_buffer[get_local_id(0)] = tmp_buffer[get_local_id(0)] < tmp_buffer[get_local_id(0) + stride] ? tmp_buffer[get_local_id(0)] : tmp_buffer[get_local_id(0) + stride]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ source.append(" if (get_local_id(0) == 0) \n");
+ source.append(" result[get_group_id(0)] = tmp_buffer[0]; \n");
+ source.append("} \n");
+}
+
+//////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for operations on/with viennacl::vector<> without involving matrices, multiple inner products, or element-wise operations other than addition or subtraction. */
+template<typename NumericT>
+struct vector
+{
+ static std::string program_name()
+ {
+ return viennacl::ocl::type_to_string<NumericT>::apply() + "_vector";
+ }
+
+ static void init(viennacl::ocl::context & ctx)
+ {
+ viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+ std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+ static std::map<cl_context, bool> init_done;
+ if (!init_done[ctx.handle().get()])
+ {
+ std::string source;
+ source.reserve(8192);
+
+ viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+ // fully parametrized kernels:
+ generate_avbv(source, numeric_string);
+
+ // kernels with mostly predetermined skeleton:
+ generate_plane_rotation(source, numeric_string);
+ generate_vector_swap(source, numeric_string);
+ generate_assign_cpu(source, numeric_string);
+
+ generate_inner_prod(source, numeric_string, 1);
+ generate_norm(source, numeric_string);
+ generate_sum(source, numeric_string);
+ generate_index_norm_inf(source, numeric_string);
+ generate_maxmin(source, numeric_string, true);
+ generate_maxmin(source, numeric_string, false);
+
+ std::string prog_name = program_name();
+ #ifdef VIENNACL_BUILD_INFO
+ std::cout << "Creating program " << prog_name << std::endl;
+ #endif
+ ctx.add_program(source, prog_name);
+ init_done[ctx.handle().get()] = true;
+ } //if
+ } //init
+};
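A hedged usage sketch for the kernel class above, assuming this header is viennacl/linalg/opencl/kernels/vector.hpp and the usual viennacl::ocl::current_context() / context::get_kernel() API (neither is part of this diff); it only shows how the program gets compiled and a kernel handle retrieved, argument setup and enqueueing are omitted:

#include "viennacl/ocl/backend.hpp"
#include "viennacl/linalg/opencl/kernels/vector.hpp"

void setup_float_vector_kernels()
{
  viennacl::ocl::context & ctx = viennacl::ocl::current_context();
  viennacl::linalg::opencl::kernels::vector<float>::init(ctx);   // builds the "float_vector" program once per context
  viennacl::ocl::kernel & k = ctx.get_kernel(
      viennacl::linalg::opencl::kernels::vector<float>::program_name(), "norm");
  (void)k;   // kernel arguments and enqueue are not shown here
}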
+
+// class with kernels for multiple inner products.
+/** @brief Main kernel class for generating OpenCL kernels for multiple inner products on/with viennacl::vector<>. */
+template<typename NumericT>
+struct vector_multi_inner_prod
+{
+ static std::string program_name()
+ {
+ return viennacl::ocl::type_to_string<NumericT>::apply() + "_vector_multi";
+ }
+
+ static void init(viennacl::ocl::context & ctx)
+ {
+ viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+ std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+ static std::map<cl_context, bool> init_done;
+ if (!init_done[ctx.handle().get()])
+ {
+ std::string source;
+ source.reserve(8192);
+
+ viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+ generate_inner_prod(source, numeric_string, 2);
+ generate_inner_prod(source, numeric_string, 3);
+ generate_inner_prod(source, numeric_string, 4);
+ generate_inner_prod(source, numeric_string, 8);
+
+ generate_inner_prod_sum(source, numeric_string);
+
+ std::string prog_name = program_name();
+ #ifdef VIENNACL_BUILD_INFO
+ std::cout << "Creating program " << prog_name << std::endl;
+ #endif
+ ctx.add_program(source, prog_name);
+ init_done[ctx.handle().get()] = true;
+ } //if
+ } //init
+};
+
+
+template<typename StringT>
+void generate_vector_convert(StringT & source, std::string const & dest_type, std::string const & src_type)
+{
+ source.append(" __kernel void convert_" + dest_type + "_" + src_type + "( \n");
+ source.append(" __global " + dest_type + " * dest, \n");
+ source.append(" unsigned int start_dest, unsigned int inc_dest, unsigned int size_dest, \n");
+ source.append(" __global const " + src_type + " * src, \n");
+ source.append(" unsigned int start_src, unsigned int inc_src) \n");
+ source.append(" { \n");
+ source.append(" for (unsigned int i = get_global_id(0); i < size_dest; i += get_global_size(0)) \n");
+ source.append(" dest[start_dest + i * inc_dest] = src[start_src + i * inc_src]; \n");
+ source.append(" } \n");
+}
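For reference, instantiating the generator above with dest_type = "float" and src_type = "int" emits (up to whitespace) the following OpenCL kernel:

__kernel void convert_float_int(
  __global float * dest,
  unsigned int start_dest, unsigned int inc_dest, unsigned int size_dest,
  __global const int * src,
  unsigned int start_src, unsigned int inc_src)
{
  for (unsigned int i = get_global_id(0); i < size_dest; i += get_global_size(0))
    dest[start_dest + i * inc_dest] = src[start_src + i * inc_src];
}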
+
+/** @brief Main kernel class for vector conversion routines (e.g. convert vector<int> to vector<float>). */
+struct vector_convert
+{
+
+public:
+ static std::string program_name()
+ {
+ return "vector_convert";
+ }
+
+ static void init(viennacl::ocl::context & ctx)
+ {
+ static std::map<cl_context, bool> init_done;
+ if (!init_done[ctx.handle().get()])
+ {
+ std::string source;
+ source.reserve(4096);
+
+ // int
+ generate_vector_convert(source, viennacl::ocl::type_to_string<int>::apply(), viennacl::ocl::type_to_string<int>::apply());
+ generate_vector_convert(source, viennacl::ocl::type_to_string<int>::apply(), viennacl::ocl::type_to_string<unsigned int>::apply());
+ generate_vector_convert(source, viennacl::ocl::type_to_string<int>::apply(), viennacl::ocl::type_to_string<long>::apply());
+ generate_vector_convert(source, viennacl::ocl::type_to_string<int>::apply(), viennacl::ocl::type_to_string<unsigned long>::apply());
+ generate_vector_convert(source, viennacl::ocl::type_to_string<int>::apply(), viennacl::ocl::type_to_string<float>::apply());
+
+ // unsigned int
+ generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned int>::apply(), viennacl::ocl::type_to_string<int>::apply());
+ generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned int>::apply(), viennacl::ocl::type_to_string<unsigned int>::apply());
+ generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned int>::apply(), viennacl::ocl::type_to_string<long>::apply());
+ generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned int>::apply(), viennacl::ocl::type_to_string<unsigned long>::apply());
+ generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned int>::apply(), viennacl::ocl::type_to_string<float>::apply());
+
+ // long
+ generate_vector_convert(source, viennacl::ocl::type_to_string<long>::apply(), viennacl::ocl::type_to_string<int>::apply());
+ generate_vector_convert(source, viennacl::ocl::type_to_string<long>::apply(), viennacl::ocl::type_to_string<unsigned int>::apply());
+ generate_vector_convert(source, viennacl::ocl::type_to_string<long>::apply(), viennacl::ocl::type_to_string<long>::apply());
+ generate_vector_convert(source, viennacl::ocl::type_to_string<long>::apply(), viennacl::ocl::type_to_string<unsigned long>::apply());
+ generate_vector_convert(source, viennacl::ocl::type_to_string<long>::apply(), viennacl::ocl::type_to_string<float>::apply());
+
+ // unsigned long
+ generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned long>::apply(), viennacl::ocl::type_to_string<int>::apply());
+ generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned long>::apply(), viennacl::ocl::type_to_string<unsigned int>::apply());
+ generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned long>::apply(), viennacl::ocl::type_to_string<long>::apply());
+ generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned long>::apply(), viennacl::ocl::type_to_string<unsigned long>::apply());
+ generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned long>::apply(), viennacl::ocl::type_to_string<float>::apply());
+
+ // float
+ generate_vector_convert(source, viennacl::ocl::type_to_string<float>::apply(), viennacl::ocl::type_to_string<int>::apply());
+ generate_vector_convert(source, viennacl::ocl::type_to_string<float>::apply(), viennacl::ocl::type_to_string<unsigned int>::apply());
+ generate_vector_convert(source, viennacl::ocl::type_to_string<float>::apply(), viennacl::ocl::type_to_string<long>::apply());
+ generate_vector_convert(source, viennacl::ocl::type_to_string<float>::apply(), viennacl::ocl::type_to_string<unsigned long>::apply());
+ generate_vector_convert(source, viennacl::ocl::type_to_string<float>::apply(), viennacl::ocl::type_to_string<float>::apply());
+
+ if (ctx.current_device().double_support())
+ {
+ viennacl::ocl::append_double_precision_pragma<double>(ctx, source);
+
+ generate_vector_convert(source, viennacl::ocl::type_to_string<int>::apply(), viennacl::ocl::type_to_string<double>::apply());
+ generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned int>::apply(), viennacl::ocl::type_to_string<double>::apply());
+ generate_vector_convert(source, viennacl::ocl::type_to_string<long>::apply(), viennacl::ocl::type_to_string<double>::apply());
+ generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned long>::apply(), viennacl::ocl::type_to_string<double>::apply());
+ generate_vector_convert(source, viennacl::ocl::type_to_string<float>::apply(), viennacl::ocl::type_to_string<double>::apply());
+
+ generate_vector_convert(source, viennacl::ocl::type_to_string<double>::apply(), viennacl::ocl::type_to_string<int>::apply());
+ generate_vector_convert(source, viennacl::ocl::type_to_string<double>::apply(), viennacl::ocl::type_to_string<unsigned int>::apply());
+ generate_vector_convert(source, viennacl::ocl::type_to_string<double>::apply(), viennacl::ocl::type_to_string<long>::apply());
+ generate_vector_convert(source, viennacl::ocl::type_to_string<double>::apply(), viennacl::ocl::type_to_string<unsigned long>::apply());
+ generate_vector_convert(source, viennacl::ocl::type_to_string<double>::apply(), viennacl::ocl::type_to_string<float>::apply());
+ generate_vector_convert(source, viennacl::ocl::type_to_string<double>::apply(), viennacl::ocl::type_to_string<double>::apply());
+ }
+
+ std::string prog_name = program_name();
+ #ifdef VIENNACL_BUILD_INFO
+ std::cout << "Creating program " << prog_name << std::endl;
+ #endif
+ ctx.add_program(source, prog_name);
+ init_done[ctx.handle().get()] = true;
+ } //if
+ } //init
+
+};
+
+
+} // namespace kernels
+} // namespace opencl
+} // namespace linalg
+} // namespace viennacl
+#endif
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/vector_element.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/vector_element.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/vector_element.hpp
new file mode 100644
index 0000000..8445302
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/vector_element.hpp
@@ -0,0 +1,163 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_VECTOR_ELEMENT_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_VECTOR_ELEMENT_HPP
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/vector_element.hpp
+ * @brief OpenCL kernel file for element-wise vector operations */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+//////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+
+// generates code for the unary element-wise vector operation vec1 OP func(vec2), e.g. vec1 = sin(vec2)
+template <typename StringT>
+void generate_vector_unary_element_ops(StringT & source, std::string const & numeric_string,
+ std::string const & funcname, std::string const & op, std::string const & op_name)
+{
+ source.append("__kernel void "); source.append(funcname); source.append("_"); source.append(op_name); source.append("(\n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * vec1, \n");
+ source.append(" uint4 size1, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * vec2, \n");
+ source.append(" uint4 size2) { \n");
+ source.append(" for (unsigned int i = get_global_id(0); i < size1.z; i += get_global_size(0)) \n");
+ if (numeric_string[0] == 'u' && funcname == "abs") // abs() on unsigned does not work on MacOS X 10.6.8, so we use the identity:
+ {
+ source.append(" vec1[i*size1.y+size1.x] "); source.append(op); source.append(" vec2[i*size2.y+size2.x]; \n");
+ }
+ else
+ {
+ source.append(" vec1[i*size1.y+size1.x] "); source.append(op); source.append(" "); source.append(funcname); source.append("(vec2[i*size2.y+size2.x]); \n");
+ }
+ source.append("} \n");
+}
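As a concrete instance, calling this generator with numeric_string = "float", funcname = "sqrt", op = "=" and op_name = "assign" produces (up to whitespace):

__kernel void sqrt_assign(
  __global float * vec1,
  uint4 size1,
  __global float * vec2,
  uint4 size2) {
  for (unsigned int i = get_global_id(0); i < size1.z; i += get_global_size(0))
    vec1[i*size1.y+size1.x] = sqrt(vec2[i*size2.y+size2.x]);
}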
+
+template <typename StringT>
+void generate_vector_unary_element_ops(StringT & source, std::string const & numeric_string, std::string const & funcname)
+{
+ generate_vector_unary_element_ops(source, numeric_string, funcname, "=", "assign");
+ //generate_vector_unary_element_ops(source, numeric_string, funcname, "+=", "plus");
+ //generate_vector_unary_element_ops(source, numeric_string, funcname, "-=", "minus");
+}
+
+template <typename StringT>
+void generate_vector_binary_element_ops(StringT & source, std::string const & numeric_string, int op_type) //op_type: {0: product, 1: division, 2: power}
+{
+ std::string kernel_name_suffix;
+ if (op_type == 0)
+ kernel_name_suffix = "prod";
+ else if (op_type == 1)
+ kernel_name_suffix = "div";
+ else
+ kernel_name_suffix = "pow";
+
+ // generic kernel for the element-wise binary operation vec1 = vec2 OP vec3, where OP is multiplication, division, or pow()
+ source.append("__kernel void element_" + kernel_name_suffix + "(\n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * vec1, \n");
+ source.append(" unsigned int start1, \n");
+ source.append(" unsigned int inc1, \n");
+ source.append(" unsigned int size1, \n");
+
+ source.append(" __global const "); source.append(numeric_string); source.append(" * vec2, \n");
+ source.append(" unsigned int start2, \n");
+ source.append(" unsigned int inc2, \n");
+
+ source.append(" __global const "); source.append(numeric_string); source.append(" * vec3, \n");
+ source.append(" unsigned int start3, \n");
+ source.append(" unsigned int inc3, \n");
+
+ source.append(" unsigned int op_type) \n");
+ source.append("{ \n");
+ source.append(" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n");
+ if (op_type == 0)
+ source.append(" vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3]; \n");
+ else if (op_type == 1)
+ source.append(" vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3]; \n");
+ else if (op_type == 2)
+ source.append(" vec1[i*inc1+start1] = pow(vec2[i*inc2+start2], vec3[i*inc3+start3]); \n");
+
+ source.append("} \n");
+}
+
+//////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for elementwise operations other than addition and subtraction on/with viennacl::vector<>. */
+template<typename NumericT>
+struct vector_element
+{
+ static std::string program_name()
+ {
+ return viennacl::ocl::type_to_string<NumericT>::apply() + "_vector_element";
+ }
+
+ static void init(viennacl::ocl::context & ctx)
+ {
+ viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+ std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+ static std::map<cl_context, bool> init_done;
+ if (!init_done[ctx.handle().get()])
+ {
+ std::string source;
+ source.reserve(8192);
+
+ viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+ // unary operations
+ if (numeric_string == "float" || numeric_string == "double")
+ {
+ generate_vector_unary_element_ops(source, numeric_string, "acos");
+ generate_vector_unary_element_ops(source, numeric_string, "asin");
+ generate_vector_unary_element_ops(source, numeric_string, "atan");
+ generate_vector_unary_element_ops(source, numeric_string, "ceil");
+ generate_vector_unary_element_ops(source, numeric_string, "cos");
+ generate_vector_unary_element_ops(source, numeric_string, "cosh");
+ generate_vector_unary_element_ops(source, numeric_string, "exp");
+ generate_vector_unary_element_ops(source, numeric_string, "fabs");
+ generate_vector_unary_element_ops(source, numeric_string, "floor");
+ generate_vector_unary_element_ops(source, numeric_string, "log");
+ generate_vector_unary_element_ops(source, numeric_string, "log10");
+ generate_vector_unary_element_ops(source, numeric_string, "sin");
+ generate_vector_unary_element_ops(source, numeric_string, "sinh");
+ generate_vector_unary_element_ops(source, numeric_string, "sqrt");
+ generate_vector_unary_element_ops(source, numeric_string, "tan");
+ generate_vector_unary_element_ops(source, numeric_string, "tanh");
+ }
+ else
+ {
+ generate_vector_unary_element_ops(source, numeric_string, "abs");
+ }
+
+ // binary operations
+ generate_vector_binary_element_ops(source, numeric_string, 0);
+ generate_vector_binary_element_ops(source, numeric_string, 1);
+ if (numeric_string == "float" || numeric_string == "double")
+ generate_vector_binary_element_ops(source, numeric_string, 2);
+
+ std::string prog_name = program_name();
+ #ifdef VIENNACL_BUILD_INFO
+ std::cout << "Creating program " << prog_name << std::endl;
+ #endif
+ ctx.add_program(source, prog_name);
+ init_done[ctx.handle().get()] = true;
+ } //if
+ } //init
+};
+
+} // namespace kernels
+} // namespace opencl
+} // namespace linalg
+} // namespace viennacl
+#endif
+
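The unary and binary generators above back the element-wise free functions on viennacl::vector<>. A hedged host-side sketch, assuming the element_prod/element_div convenience functions commonly provided by ViennaCL (they are not part of this diff, so treat the exact names as an assumption):

#include "viennacl/vector.hpp"

void elementwise_example(viennacl::vector<float> const & v2,
                         viennacl::vector<float> const & v3)
{
  viennacl::vector<float> v1(v2.size());
  v1 = viennacl::linalg::element_prod(v2, v3);   // dispatches to the element_prod kernel generated above
  v1 = viennacl::linalg::element_div(v2, v3);    // dispatches to the element_div kernel
}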
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/coordinate_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/coordinate_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/coordinate_matrix.hpp
new file mode 100644
index 0000000..2a24a4e
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/coordinate_matrix.hpp
@@ -0,0 +1,506 @@
+#ifndef VIENNACL_COORDINATE_MATRIX_HPP_
+#define VIENNACL_COORDINATE_MATRIX_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/coordinate_matrix.hpp
+ @brief Implementation of the coordinate_matrix class
+*/
+
+#include <map>
+#include <vector>
+#include <list>
+
+#include "viennacl/forwards.h"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+
+namespace viennacl
+{
+
+
+//provide copy-operation:
+/** @brief Copies a sparse matrix from the host to the OpenCL device (either GPU or multi-core CPU)
+ *
+ * For the requirements on the CPUMatrixT type, see the documentation of the function copy(CPUMatrixT, compressed_matrix<>)
+ *
+ * @param cpu_matrix A sparse matrix on the host.
+ * @param gpu_matrix A coordinate_matrix from ViennaCL
+ */
+template<typename CPUMatrixT, typename NumericT, unsigned int AlignmentV>
+void copy(const CPUMatrixT & cpu_matrix,
+ coordinate_matrix<NumericT, AlignmentV> & gpu_matrix )
+{
+ assert( (gpu_matrix.size1() == 0 || viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+ assert( (gpu_matrix.size2() == 0 || viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+ vcl_size_t group_num = 64;
+
+ // Step 1: Determine nonzeros:
+ if ( cpu_matrix.size1() > 0 && cpu_matrix.size2() > 0 )
+ {
+ vcl_size_t num_entries = 0;
+ for (typename CPUMatrixT::const_iterator1 row_it = cpu_matrix.begin1(); row_it != cpu_matrix.end1(); ++row_it)
+ for (typename CPUMatrixT::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
+ ++num_entries;
+
+ // Step 2: Set up matrix data:
+ gpu_matrix.nonzeros_ = num_entries;
+ gpu_matrix.rows_ = cpu_matrix.size1();
+ gpu_matrix.cols_ = cpu_matrix.size2();
+
+ viennacl::backend::typesafe_host_array<unsigned int> group_boundaries(gpu_matrix.handle3(), group_num + 1);
+ viennacl::backend::typesafe_host_array<unsigned int> coord_buffer(gpu_matrix.handle12(), 2*gpu_matrix.internal_nnz());
+ std::vector<NumericT> elements(gpu_matrix.internal_nnz());
+
+ vcl_size_t data_index = 0;
+ vcl_size_t current_fraction = 0;
+
+ group_boundaries.set(0, 0);
+ for (typename CPUMatrixT::const_iterator1 row_it = cpu_matrix.begin1(); row_it != cpu_matrix.end1(); ++row_it)
+ {
+ for (typename CPUMatrixT::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
+ {
+ coord_buffer.set(2*data_index, col_it.index1());
+ coord_buffer.set(2*data_index + 1, col_it.index2());
+ elements[data_index] = *col_it;
+ ++data_index;
+ }
+
+ while (data_index > vcl_size_t(static_cast<double>(current_fraction + 1) / static_cast<double>(group_num)) * num_entries) //split data equally over 64 groups
+ group_boundaries.set(++current_fraction, data_index);
+ }
+
+ //write end of last group:
+ group_boundaries.set(group_num, data_index);
+ //group_boundaries[1] = data_index; //for one compute unit
+
+ //std::cout << "Group boundaries: " << std::endl;
+ //for (vcl_size_t i=0; i<group_boundaries.size(); ++i)
+ // std::cout << group_boundaries[i] << std::endl;
+
+ viennacl::backend::memory_create(gpu_matrix.group_boundaries_, group_boundaries.raw_size(), traits::context(gpu_matrix.group_boundaries_), group_boundaries.get());
+ viennacl::backend::memory_create(gpu_matrix.coord_buffer_, coord_buffer.raw_size(), traits::context(gpu_matrix.coord_buffer_), coord_buffer.get());
+ viennacl::backend::memory_create(gpu_matrix.elements_, sizeof(NumericT)*elements.size(), traits::context(gpu_matrix.elements_), &(elements[0]));
+ }
+}
+
+/** @brief Copies a sparse matrix in the std::vector< std::map < > > format to an OpenCL device.
+ *
+ * @param cpu_matrix A sparse square matrix on the host.
+ * @param gpu_matrix A coordinate_matrix from ViennaCL
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void copy(const std::vector< std::map<unsigned int, NumericT> > & cpu_matrix,
+ coordinate_matrix<NumericT, AlignmentV> & gpu_matrix )
+{
+ vcl_size_t max_col = 0;
+ for (vcl_size_t i=0; i<cpu_matrix.size(); ++i)
+ {
+ if (cpu_matrix[i].size() > 0)
+ max_col = std::max<vcl_size_t>(max_col, (cpu_matrix[i].rbegin())->first);
+ }
+
+ viennacl::copy(tools::const_sparse_matrix_adapter<NumericT>(cpu_matrix, cpu_matrix.size(), max_col + 1), gpu_matrix);
+}
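A small, hedged usage sketch for the host-to-device copy overloads above (the sizes and values are arbitrary placeholders):

#include <map>
#include <vector>
#include "viennacl/coordinate_matrix.hpp"

void transfer_example()
{
  std::vector< std::map<unsigned int, float> > host_A(3);   // 3 rows, column indices as map keys
  host_A[0][0] = 1.0f;  host_A[0][2] = 2.0f;
  host_A[2][1] = -3.0f;

  viennacl::coordinate_matrix<float> device_A;
  viennacl::copy(host_A, device_A);   // dispatches to the std::vector< std::map<...> > overload above
}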
+
+//gpu to cpu:
+/** @brief Copies a sparse matrix from the OpenCL device (either GPU or multi-core CPU) to the host.
+ *
+ * There are two type requirements on the CPUMatrixT type (fulfilled by e.g. boost::numeric::ublas):
+ * - resize(rows, cols) A resize function to bring the matrix into the correct size
+ * - operator(i,j) Write new entries via the parenthesis operator
+ *
+ * @param gpu_matrix A coordinate_matrix from ViennaCL
+ * @param cpu_matrix A sparse matrix on the host.
+ */
+template<typename CPUMatrixT, typename NumericT, unsigned int AlignmentV>
+void copy(const coordinate_matrix<NumericT, AlignmentV> & gpu_matrix,
+ CPUMatrixT & cpu_matrix )
+{
+ assert( (viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+ assert( (viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+ if ( gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0 )
+ {
+ //get raw data from memory:
+ viennacl::backend::typesafe_host_array<unsigned int> coord_buffer(gpu_matrix.handle12(), 2*gpu_matrix.nnz());
+ std::vector<NumericT> elements(gpu_matrix.nnz());
+
+ //std::cout << "GPU nonzeros: " << gpu_matrix.nnz() << std::endl;
+
+ viennacl::backend::memory_read(gpu_matrix.handle12(), 0, coord_buffer.raw_size(), coord_buffer.get());
+ viennacl::backend::memory_read(gpu_matrix.handle(), 0, sizeof(NumericT) * elements.size(), &(elements[0]));
+
+ //fill the cpu_matrix:
+ for (vcl_size_t index = 0; index < gpu_matrix.nnz(); ++index)
+ cpu_matrix(coord_buffer[2*index], coord_buffer[2*index+1]) = elements[index];
+
+ }
+}
+
+/** @brief Copies a sparse matrix from an OpenCL device to the host. The host type is the std::vector< std::map < > > format .
+ *
+ * @param gpu_matrix A coordinate_matrix from ViennaCL
+ * @param cpu_matrix A sparse matrix on the host.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void copy(const coordinate_matrix<NumericT, AlignmentV> & gpu_matrix,
+ std::vector< std::map<unsigned int, NumericT> > & cpu_matrix)
+{
+ if (cpu_matrix.size() == 0)
+ cpu_matrix.resize(gpu_matrix.size1());
+
+ assert(cpu_matrix.size() == gpu_matrix.size1() && bool("Matrix dimension mismatch!"));
+
+ tools::sparse_matrix_adapter<NumericT> temp(cpu_matrix, gpu_matrix.size1(), gpu_matrix.size2());
+ copy(gpu_matrix, temp);
+}
+
+
+//////////////////////// coordinate_matrix //////////////////////////
+/** @brief A sparse square matrix, where entries are stored as triplets (i,j, val), where i and j are the row and column indices and val denotes the entry.
+ *
+ * The present implementation of coordinate_matrix suffers from poor runtime efficiency. Users are advised to use compressed_matrix in the meantime.
+ *
+ * @tparam NumericT The floating point type (either float or double, checked at compile time)
+ * @tparam AlignmentV The internal memory size for the arrays, given by (size()/AlignmentV + 1) * AlignmentV. AlignmentV must be a power of two.
+ */
+template<class NumericT, unsigned int AlignmentV /* see forwards.h */ >
+class coordinate_matrix
+{
+public:
+ typedef viennacl::backend::mem_handle handle_type;
+ typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<NumericT>::ResultType> value_type;
+ typedef vcl_size_t size_type;
+
+ /** @brief Default construction of a coordinate matrix. No memory is allocated */
+ coordinate_matrix() : rows_(0), cols_(0), nonzeros_(0), group_num_(64) {}
+
+ explicit coordinate_matrix(viennacl::context ctx) : rows_(0), cols_(0), nonzeros_(0), group_num_(64)
+ {
+ group_boundaries_.switch_active_handle_id(ctx.memory_type());
+ coord_buffer_.switch_active_handle_id(ctx.memory_type());
+ elements_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+ if (ctx.memory_type() == OPENCL_MEMORY)
+ {
+ group_boundaries_.opencl_handle().context(ctx.opencl_context());
+ coord_buffer_.opencl_handle().context(ctx.opencl_context());
+ elements_.opencl_handle().context(ctx.opencl_context());
+ }
+#endif
+ }
+
+ /** @brief Construction of a coordinate matrix with the supplied number of rows and columns. If the number of nonzeros is positive, memory is allocated
+ *
+ * @param rows Number of rows
+ * @param cols Number of columns
+ * @param nonzeros Optional number of nonzeros for memory preallocation
+ * @param ctx Optional context in which the matrix is created (one out of multiple OpenCL contexts, CUDA, host)
+ */
+ coordinate_matrix(vcl_size_t rows, vcl_size_t cols, vcl_size_t nonzeros = 0, viennacl::context ctx = viennacl::context()) :
+ rows_(rows), cols_(cols), nonzeros_(nonzeros)
+ {
+ if (nonzeros > 0)
+ {
+ viennacl::backend::memory_create(group_boundaries_, viennacl::backend::typesafe_host_array<unsigned int>().element_size() * (group_num_ + 1), ctx);
+ viennacl::backend::memory_create(coord_buffer_, viennacl::backend::typesafe_host_array<unsigned int>().element_size() * 2 * internal_nnz(), ctx);
+ viennacl::backend::memory_create(elements_, sizeof(NumericT) * internal_nnz(), ctx);
+ }
+ else
+ {
+ group_boundaries_.switch_active_handle_id(ctx.memory_type());
+ coord_buffer_.switch_active_handle_id(ctx.memory_type());
+ elements_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+ if (ctx.memory_type() == OPENCL_MEMORY)
+ {
+ group_boundaries_.opencl_handle().context(ctx.opencl_context());
+ coord_buffer_.opencl_handle().context(ctx.opencl_context());
+ elements_.opencl_handle().context(ctx.opencl_context());
+ }
+#endif
+ }
+ }
+
+ /** @brief Construction of a coordinate matrix with the supplied number of rows and columns in the supplied context. Does not yet allocate memory.
+ *
+ * @param rows Number of rows
+ * @param cols Number of columns
+ * @param ctx Context in which to create the matrix
+ */
+ explicit coordinate_matrix(vcl_size_t rows, vcl_size_t cols, viennacl::context ctx)
+ : rows_(rows), cols_(cols), nonzeros_(0)
+ {
+ group_boundaries_.switch_active_handle_id(ctx.memory_type());
+ coord_buffer_.switch_active_handle_id(ctx.memory_type());
+ elements_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+ if (ctx.memory_type() == OPENCL_MEMORY)
+ {
+ group_boundaries_.opencl_handle().context(ctx.opencl_context());
+ coord_buffer_.opencl_handle().context(ctx.opencl_context());
+ elements_.opencl_handle().context(ctx.opencl_context());
+ }
+#endif
+ }
+
+
+ /** @brief Allocate memory for the supplied number of nonzeros in the matrix. Old values are preserved. */
+ void reserve(vcl_size_t new_nonzeros)
+ {
+ if (new_nonzeros > nonzeros_) //TODO: Do we need to initialize new memory with zero?
+ {
+ handle_type coord_buffer_old;
+ handle_type elements_old;
+ viennacl::backend::memory_shallow_copy(coord_buffer_, coord_buffer_old);
+ viennacl::backend::memory_shallow_copy(elements_, elements_old);
+
+ vcl_size_t internal_new_nnz = viennacl::tools::align_to_multiple<vcl_size_t>(new_nonzeros, AlignmentV);
+ viennacl::backend::typesafe_host_array<unsigned int> size_deducer(coord_buffer_);
+ viennacl::backend::memory_create(coord_buffer_, size_deducer.element_size() * 2 * internal_new_nnz, viennacl::traits::context(coord_buffer_));
+ viennacl::backend::memory_create(elements_, sizeof(NumericT) * internal_new_nnz, viennacl::traits::context(elements_));
+
+ viennacl::backend::memory_copy(coord_buffer_old, coord_buffer_, 0, 0, size_deducer.element_size() * 2 * nonzeros_);
+ viennacl::backend::memory_copy(elements_old, elements_, 0, 0, sizeof(NumericT) * nonzeros_);
+
+ nonzeros_ = new_nonzeros;
+ }
+ }
+
+ /** @brief Resize the matrix.
+ *
+ * @param new_size1 New number of rows
+ * @param new_size2 New number of columns
+ * @param preserve If true, the old values are preserved; entries falling outside the new dimensions are discarded.
+ */
+ void resize(vcl_size_t new_size1, vcl_size_t new_size2, bool preserve = true)
+ {
+ assert (new_size1 > 0 && new_size2 > 0);
+
+ if (new_size1 < rows_ || new_size2 < cols_) // at least one dimension shrinks: crop entries outside the new bounds
+ {
+ std::vector<std::map<unsigned int, NumericT> > stl_sparse_matrix;
+ if (rows_ > 0)
+ stl_sparse_matrix.resize(rows_);
+
+ if (preserve && rows_ > 0)
+ viennacl::copy(*this, stl_sparse_matrix);
+
+ stl_sparse_matrix.resize(new_size1);
+
+ //std::cout << "Cropping STL matrix of size " << stl_sparse_matrix.size() << std::endl;
+ if (new_size2 < cols_ && rows_ > 0)
+ {
+ for (vcl_size_t i=0; i<stl_sparse_matrix.size(); ++i)
+ {
+ std::list<unsigned int> to_delete;
+ for (typename std::map<unsigned int, NumericT>::iterator it = stl_sparse_matrix[i].begin();
+ it != stl_sparse_matrix[i].end();
+ ++it)
+ {
+ if (it->first >= new_size2)
+ to_delete.push_back(it->first);
+ }
+
+ for (std::list<unsigned int>::iterator it = to_delete.begin(); it != to_delete.end(); ++it)
+ stl_sparse_matrix[i].erase(*it);
+ }
+ //std::cout << "Cropping done..." << std::endl;
+ }
+
+ rows_ = new_size1;
+ cols_ = new_size2;
+ viennacl::copy(stl_sparse_matrix, *this);
+ }
+
+ rows_ = new_size1;
+ cols_ = new_size2;
+ }
+
+ /** @brief Resets all entries in the matrix back to zero without changing the matrix size. Resets the sparsity pattern. */
+ void clear()
+ {
+ viennacl::backend::typesafe_host_array<unsigned int> host_group_buffer(group_boundaries_, 65);
+ viennacl::backend::typesafe_host_array<unsigned int> host_coord_buffer(coord_buffer_, 2);
+ std::vector<NumericT> host_elements(1);
+
+ viennacl::backend::memory_create(group_boundaries_, host_group_buffer.element_size() * 65, viennacl::traits::context(group_boundaries_), host_group_buffer.get());
+ viennacl::backend::memory_create(coord_buffer_, host_coord_buffer.element_size() * 2, viennacl::traits::context(coord_buffer_), host_coord_buffer.get());
+ viennacl::backend::memory_create(elements_, sizeof(NumericT) * 1, viennacl::traits::context(elements_), &(host_elements[0]));
+
+ nonzeros_ = 0;
+ group_num_ = 64;
+ }
+
+ /** @brief Returns the number of rows */
+ vcl_size_t size1() const { return rows_; }
+ /** @brief Returns the number of columns */
+ vcl_size_t size2() const { return cols_; }
+ /** @brief Returns the number of nonzero entries */
+ vcl_size_t nnz() const { return nonzeros_; }
+ /** @brief Returns the number of internal nonzero entries */
+ vcl_size_t internal_nnz() const { return viennacl::tools::align_to_multiple<vcl_size_t>(nonzeros_, AlignmentV); }
+
+ /** @brief Returns the OpenCL handle to the (row, column) index array */
+ const handle_type & handle12() const { return coord_buffer_; }
+ /** @brief Returns the OpenCL handle to the matrix entry array */
+ const handle_type & handle() const { return elements_; }
+ /** @brief Returns the OpenCL handle to the group start index array */
+ const handle_type & handle3() const { return group_boundaries_; }
+
+ vcl_size_t groups() const { return group_num_; }
+
+#if defined(_MSC_VER) && _MSC_VER < 1500 //Visual Studio 2005 needs special treatment
+ template<typename CPUMatrixT>
+ friend void copy(const CPUMatrixT & cpu_matrix, coordinate_matrix & gpu_matrix );
+#else
+ template<typename CPUMatrixT, typename NumericT2, unsigned int AlignmentV2>
+ friend void copy(const CPUMatrixT & cpu_matrix, coordinate_matrix<NumericT2, AlignmentV2> & gpu_matrix );
+#endif
+
+private:
+ /** @brief Copy constructor is not available for now. */
+ coordinate_matrix(coordinate_matrix const &);
+
+ /** @brief Assignment is not available for now. */
+ coordinate_matrix & operator=(coordinate_matrix const &);
+
+
+ vcl_size_t rows_;
+ vcl_size_t cols_;
+ vcl_size_t nonzeros_;
+ vcl_size_t group_num_;
+ handle_type coord_buffer_;
+ handle_type elements_;
+ handle_type group_boundaries_;
+};
+
+
+//
+// Specify available operations:
+//
+
+/** \cond */
+
+namespace linalg
+{
+namespace detail
+{
+ // x = A * y
+ template<typename T, unsigned int A>
+ struct op_executor<vector_base<T>, op_assign, vector_expression<const coordinate_matrix<T, A>, const vector_base<T>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const coordinate_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+ {
+ // check for the special case x = A * x
+ if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+ {
+ viennacl::vector<T> temp(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+ lhs = temp;
+ }
+ else
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), lhs, T(0));
+ }
+ };
+
+ template<typename T, unsigned int A>
+ struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const coordinate_matrix<T, A>, const vector_base<T>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const coordinate_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+ {
+ // check for the special case x += A * x
+ if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+ {
+ viennacl::vector<T> temp(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+ lhs += temp;
+ }
+ else
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), lhs, T(1));
+ }
+ };
+
+ template<typename T, unsigned int A>
+ struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const coordinate_matrix<T, A>, const vector_base<T>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const coordinate_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+ {
+ // check for the special case x -= A * x
+ if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+ {
+ viennacl::vector<T> temp(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+ lhs -= temp;
+ }
+ else
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(-1), lhs, T(1));
+ }
+ };
+
+
+ // x = A * vec_op
+ template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+ struct op_executor<vector_base<T>, op_assign, vector_expression<const coordinate_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const coordinate_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+ {
+ viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+ viennacl::linalg::prod_impl(rhs.lhs(), temp, lhs);
+ }
+ };
+
+ // x += A * vec_op
+ template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+ struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const coordinate_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const coordinate_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+ {
+ viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+ viennacl::vector<T> temp_result(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+ lhs += temp_result;
+ }
+ };
+
+ // x -= A * vec_op
+ template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+ struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const coordinate_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const coordinate_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+ {
+ viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+ viennacl::vector<T> temp_result(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+ lhs -= temp_result;
+ }
+ };
+
+} // namespace detail
+} // namespace linalg
+
+/** \endcond */
+}
+
+#endif
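The op_executor specializations above are what make the usual expression syntax work for coordinate_matrix. A hedged sketch, assuming viennacl::linalg::prod() from viennacl/linalg/prod.hpp (not part of this diff); sizes and arguments are placeholders:

#include "viennacl/coordinate_matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/prod.hpp"

void spmv_example(viennacl::coordinate_matrix<float> const & A,
                  viennacl::vector<float> const & x)
{
  viennacl::vector<float> y(A.size1());
  y  = viennacl::linalg::prod(A, x);   // handled by the op_assign executor above
  y += viennacl::linalg::prod(A, x);   // handled by the op_inplace_add executor above
}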
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/detail/matrix_def.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/detail/matrix_def.hpp b/native-viennaCL/src/main/cpp/viennacl/detail/matrix_def.hpp
new file mode 100644
index 0000000..c13ef01
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/detail/matrix_def.hpp
@@ -0,0 +1,270 @@
+#ifndef VIENNACL_DETAIL_MATRIX_DEF_HPP_
+#define VIENNACL_DETAIL_MATRIX_DEF_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/detail/matrix_def.hpp
+ @brief Forward declaration of dense matrix classes
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/entry_proxy.hpp"
+
+namespace viennacl
+{
+/** @brief Base class for representing matrices where the individual entries are not all stored explicitly, e.g. identity_matrix<>
+ *
+ * Examples are identity_matrix, scalar_matrix, and zero_matrix.
+ */
+template<typename NumericT>
+class implicit_matrix_base
+{
+protected:
+ typedef vcl_size_t size_type;
+ implicit_matrix_base(size_type size1, size_type size2, NumericT value, bool diag, viennacl::context ctx) : size1_(size1), size2_(size2), value_(value), diag_(diag), off_diag_(0), ctx_(ctx){ }
+public:
+ typedef NumericT const & const_reference;
+ typedef NumericT cpu_value_type;
+
+ size_type size1() const { return size1_; }
+ size_type size2() const { return size2_; }
+ viennacl::context context() const { return ctx_; }
+ NumericT value() const { return value_; }
+ bool diag() const { return diag_; }
+
+ const_reference operator()(size_type i, size_type j) const
+ {
+ if (diag_) return (i == j) ? value_ : off_diag_;
+ return value_;
+ }
+protected:
+ size_type size1_;
+ size_type size2_;
+ NumericT value_;
+ bool diag_;
+ NumericT off_diag_;
+ viennacl::context ctx_;
+};
+
+//
+// Initializer types
+//
+/** @brief Represents a square matrix with ones on the diagonal and zeros elsewhere. To be used as an initializer for viennacl::matrix, matrix_range, or matrix_slice only. */
+template<typename NumericT>
+class identity_matrix : public implicit_matrix_base<NumericT>
+{
+public:
+ typedef vcl_size_t size_type;
+ typedef NumericT const & const_reference;
+
+ identity_matrix(size_type s, viennacl::context ctx = viennacl::context()) : implicit_matrix_base<NumericT>(s, s, 1, true, ctx){}
+};
+
+
+/** @brief Represents a matrix consisting of zeros only. To be used as an initializer for viennacl::matrix, matrix_range, or matrix_slice only. */
+template<typename NumericT>
+class zero_matrix : public implicit_matrix_base<NumericT>
+{
+public:
+ typedef vcl_size_t size_type;
+ typedef NumericT const & const_reference;
+
+ zero_matrix(size_type s1, size_type s2, viennacl::context ctx = viennacl::context()) : implicit_matrix_base<NumericT>(s1, s2, 0, false, ctx){}
+};
+
+
+/** @brief Represents a matrix in which every entry equals the scalar 's', i.e. A(i,j) = s for all i, j. To be used as an initializer for viennacl::matrix, matrix_range, or matrix_slice only. */
+template<typename NumericT>
+class scalar_matrix : public implicit_matrix_base<NumericT>
+{
+public:
+ typedef vcl_size_t size_type;
+ typedef NumericT const & const_reference;
+
+ scalar_matrix(size_type s1, size_type s2, const_reference val, viennacl::context ctx = viennacl::context()) : implicit_matrix_base<NumericT>(s1, s2, val, false, ctx) {}
+};
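The three initializer types above are meant to be assigned to a dense viennacl::matrix<> (see the corresponding operator= declarations in matrix_base below); a brief hedged sketch:

#include "viennacl/matrix.hpp"

void initializer_example()
{
  viennacl::matrix<float> M(4, 4);
  M = viennacl::identity_matrix<float>(4);         // ones on the diagonal, zeros elsewhere
  M = viennacl::scalar_matrix<float>(4, 4, 2.5f);  // every entry equals 2.5
  M = viennacl::zero_matrix<float>(4, 4);          // all entries zero
}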
+
+template<class NumericT, typename SizeT, typename DistanceT>
+class matrix_base
+{
+ typedef matrix_base<NumericT, SizeT, DistanceT> self_type;
+public:
+
+ typedef matrix_iterator<row_iteration, self_type > iterator1;
+ typedef matrix_iterator<col_iteration, self_type > iterator2;
+ typedef scalar<NumericT> value_type;
+ typedef NumericT cpu_value_type;
+ typedef SizeT size_type;
+ typedef DistanceT difference_type;
+ typedef viennacl::backend::mem_handle handle_type;
+
+ /** @brief The default constructor. Does not allocate any memory. */
+ explicit matrix_base(): size1_(0), size2_(0), start1_(0), start2_(0), stride1_(1), stride2_(1), internal_size1_(0), internal_size2_(0), row_major_fixed_(false), row_major_(true) {}
+
+ /** @brief The layout constructor. Does not allocate any memory. */
+ explicit matrix_base(bool is_row_major) : size1_(0), size2_(0), start1_(0), start2_(0), stride1_(1), stride2_(1), internal_size1_(0), internal_size2_(0), row_major_fixed_(true), row_major_(is_row_major) {}
+
+ /** @brief Creates the matrix with the given dimensions
+ *
+ * @param rows Number of rows
+ * @param columns Number of columns
+ * @param is_row_major Boolean flag stating whether this matrix is stored row-major
+ * @param ctx Optional context in which the matrix is created (one out of multiple OpenCL contexts, CUDA, host)
+ */
+ explicit matrix_base(size_type rows, size_type columns, bool is_row_major, viennacl::context ctx = viennacl::context());
+
+ /** @brief Constructor for creating a matrix_range or matrix_slice from some other matrix/matrix_range/matrix_slice */
+ explicit matrix_base(viennacl::backend::mem_handle & h,
+ size_type mat_size1, size_type mat_start1, size_type mat_stride1, size_type mat_internal_size1,
+ size_type mat_size2, size_type mat_start2, size_type mat_stride2, size_type mat_internal_size2,
+ bool is_row_major): size1_(mat_size1), size2_(mat_size2),
+ start1_(mat_start1), start2_(mat_start2),
+ stride1_(mat_stride1), stride2_(mat_stride2),
+ internal_size1_(mat_internal_size1), internal_size2_(mat_internal_size2),
+ row_major_fixed_(true), row_major_(is_row_major),
+ elements_(h) {}
+
+
+ template<typename LHS, typename RHS, typename OP>
+ explicit matrix_base(matrix_expression<const LHS, const RHS, OP> const & proxy);
+
+ // CUDA or host memory:
+ explicit matrix_base(NumericT * ptr_to_mem, viennacl::memory_types mem_type,
+ size_type mat_size1, size_type mat_start1, size_type mat_stride1, size_type mat_internal_size1,
+ size_type mat_size2, size_type mat_start2, size_type mat_stride2, size_type mat_internal_size2,
+ bool is_row_major);
+
+#ifdef VIENNACL_WITH_OPENCL
+ explicit matrix_base(cl_mem mem, size_type rows, size_type columns, bool is_row_major, viennacl::context ctx = viennacl::context());
+ explicit matrix_base(cl_mem mem, viennacl::context ctx,
+ size_type mat_size1, size_type mat_start1, size_type mat_stride1, size_type mat_internal_size1,
+ size_type mat_size2, size_type mat_start2, size_type mat_stride2, size_type mat_internal_size2,
+ bool is_row_major);
+#endif
+
+ /* Copy CTOR */
+ matrix_base(const self_type & other);
+
+ /* Conversion CTOR */
+ template<typename OtherNumericT>
+ matrix_base(const matrix_base<OtherNumericT, SizeT, DistanceT> & other);
+
+ self_type & operator=(const self_type & other);
+ template<typename OtherNumericT>
+ self_type & operator=(const matrix_base<OtherNumericT, SizeT, DistanceT> & other);
+
+ /** @brief Implementation of the operation m1 = m2 @ alpha, where @ denotes either multiplication or division, and alpha is either a CPU or a GPU scalar
+ * @param proxy An expression template proxy class. */
+ template<typename LHS, typename RHS, typename OP>
+ self_type & operator=(const matrix_expression<const LHS, const RHS, OP> & proxy);
+ // A = trans(B). Currently achieved in CPU memory
+ self_type & operator=(const matrix_expression< const self_type, const self_type, op_trans> & proxy);
+ template<typename LHS, typename RHS, typename OP>
+ self_type & operator+=(const matrix_expression<const LHS, const RHS, OP> & proxy);
+ template<typename LHS, typename RHS, typename OP>
+ self_type & operator-=(const matrix_expression<const LHS, const RHS, OP> & proxy);
+ /** @brief Assigns the supplied identity matrix to the matrix. */
+ self_type & operator = (identity_matrix<NumericT> const & m);
+ /** @brief Assigns the supplied zero matrix to the matrix. */
+ self_type & operator = (zero_matrix<NumericT> const & m);
+ /** @brief Assigns the supplied scalar matrix to the matrix. */
+ self_type & operator = (scalar_matrix<NumericT> const & m);
+ //read-write access to an element of the matrix/matrix_range/matrix_slice
+ /** @brief Read-write access to a single element of the matrix/matrix_range/matrix_slice */
+ entry_proxy<NumericT> operator()(size_type row_index, size_type col_index);
+ /** @brief Read access to a single element of the matrix/matrix_range/matrix_slice */
+ const_entry_proxy<NumericT> operator()(size_type row_index, size_type col_index) const;
+ self_type & operator += (const self_type & other);
+ self_type & operator -= (const self_type & other);
+
+ /** @brief Scales the matrix by a char (8-bit integer) */
+ self_type & operator *= (char val);
+ /** @brief Scales the matrix by a short integer */
+ self_type & operator *= (short val);
+ /** @brief Scales the matrix by an integer */
+ self_type & operator *= (int val);
+ /** @brief Scales the matrix by a long integer */
+ self_type & operator *= (long val);
+ /** @brief Scales the matrix by a single precision floating point value */
+ self_type & operator *= (float val);
+ /** @brief Scales the matrix by a double precision floating point value */
+ self_type & operator *= (double val);
+
+ /** @brief Scales the matrix by a char (8-bit integer) */
+ self_type & operator /= (char val);
+ /** @brief Scales the matrix by a short integer */
+ self_type & operator /= (short val);
+ /** @brief Scales the matrix by an integer */
+ self_type & operator /= (int val);
+ /** @brief Scales the matrix by a long integer */
+ self_type & operator /= (long val);
+ /** @brief Scales the matrix by a single precision floating point value */
+ self_type & operator /= (float val);
+ /** @brief Scales the matrix by a double precision floating point value */
+ self_type & operator /= (double val);
+
+ /** @brief Sign flip for the matrix. Emulated to be equivalent to -1.0 * matrix */
+ matrix_expression<const self_type, const NumericT, op_mult> operator-() const;
+ /** @brief Returns the number of rows */
+ size_type size1() const { return size1_;}
+ /** @brief Returns the number of columns */
+ size_type size2() const { return size2_; }
+ /** @brief Returns the row offset of the matrix/matrix_range/matrix_slice within the underlying buffer */
+ size_type start1() const { return start1_;}
+ /** @brief Returns the column offset of the matrix/matrix_range/matrix_slice within the underlying buffer */
+ size_type start2() const { return start2_; }
+ /** @brief Returns the stride between rows within the underlying buffer */
+ size_type stride1() const { return stride1_;}
+ /** @brief Returns the stride between columns within the underlying buffer */
+ size_type stride2() const { return stride2_; }
+ /** @brief Resets all entries to zero */
+ void clear();
+ /** @brief Returns the internal number of rows. Usually required for launching OpenCL kernels only */
+ size_type internal_size1() const { return internal_size1_; }
+ /** @brief Returns the internal number of columns. Usually required for launching OpenCL kernels only */
+ size_type internal_size2() const { return internal_size2_; }
+ /** @brief Returns the total amount of allocated memory in multiples of sizeof(NumericT) */
+ size_type internal_size() const { return internal_size1() * internal_size2(); }
+ /** @brief Returns the memory handle (OpenCL, CUDA, or host), non-const version */
+ handle_type & handle() { return elements_; }
+ /** @brief Returns the memory handle (OpenCL, CUDA, or host), const version */
+ const handle_type & handle() const { return elements_; }
+ viennacl::memory_types memory_domain() const { return elements_.get_active_handle_id(); }
+ bool row_major() const { return row_major_; }
+ void switch_memory_context(viennacl::context new_ctx) { viennacl::backend::switch_memory_context<NumericT>(elements_, new_ctx); }
+
+protected:
+ void set_handle(viennacl::backend::mem_handle const & h);
+ void resize(size_type rows, size_type columns, bool preserve = true);
+private:
+ size_type size1_;
+ size_type size2_;
+ size_type start1_;
+ size_type start2_;
+ size_type stride1_;
+ size_type stride2_;
+ size_type internal_size1_;
+ size_type internal_size2_;
+ bool row_major_fixed_; //helper flag to make layout of matrix<T, row_major> A; persistent
+ bool row_major_;
+ handle_type elements_;
+}; //matrix
+
+}
+
+#endif
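The implicit matrix types declared above hold no device buffer; they are descriptors meant only as right-hand sides of assignments. A minimal usage sketch (not part of this patch) of the initializer pattern, using exactly the types and operators declared in the header above:

    #include "viennacl/matrix.hpp"

    int main()
    {
      viennacl::matrix<float> A(4, 4);                 // dense 4x4 matrix on the device

      A = viennacl::identity_matrix<float>(4);         // ones on the diagonal, zeros elsewhere
      A = viennacl::zero_matrix<float>(4, 4);          // all entries reset to zero
      A = viennacl::scalar_matrix<float>(4, 4, 3.14f); // every entry set to 3.14

      A(0, 1) = 42.0f;                                 // element access via entry_proxy
      return 0;
    }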
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/detail/vector_def.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/detail/vector_def.hpp b/native-viennaCL/src/main/cpp/viennacl/detail/vector_def.hpp
new file mode 100644
index 0000000..4624b76
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/detail/vector_def.hpp
@@ -0,0 +1,349 @@
+#ifndef VIENNACL_DETAIL_VECTOR_DEF_HPP_
+#define VIENNACL_DETAIL_VECTOR_DEF_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/detail/vector_def.hpp
+ @brief Forward declarations of the implicit_vector_base, vector_base class.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/entry_proxy.hpp"
+
+namespace viennacl
+{
+
+/** @brief Common base class for representing vectors where the entries are not all stored explicitly.
+ *
+ * Typical examples are zero_vector or scalar_vector.
+ */
+template<typename NumericT>
+class implicit_vector_base
+{
+protected:
+ implicit_vector_base(vcl_size_t s, vcl_size_t i, NumericT v, viennacl::context ctx) : size_(s), index_(std::make_pair(true,i)), value_(v), ctx_(ctx){ }
+ implicit_vector_base(vcl_size_t s, NumericT v, viennacl::context ctx) : size_(s), index_(std::make_pair(false,0)), value_(v), ctx_(ctx){ }
+
+public:
+ typedef NumericT const & const_reference;
+ typedef NumericT cpu_value_type;
+
+ viennacl::context context() const { return ctx_; }
+ vcl_size_t size() const { return size_; }
+ cpu_value_type value() const { return value_; }
+ vcl_size_t index() const { return index_.second; }
+ bool has_index() const { return index_.first; }
+
+ cpu_value_type operator()(vcl_size_t i) const
+ {
+ if (index_.first)
+ return (i==index_.second)?value_:0;
+ return value_;
+ }
+
+ cpu_value_type operator[](vcl_size_t i) const
+ {
+ if (index_.first)
+ return (i==index_.second)?value_:0;
+ return value_;
+ }
+
+protected:
+ vcl_size_t size_;
+ std::pair<bool, vcl_size_t> index_;
+ NumericT value_;
+ viennacl::context ctx_;
+};
+
+/** @brief Represents a vector consisting of 1 at a given index and zeros otherwise.*/
+template<typename NumericT>
+struct unit_vector : public implicit_vector_base<NumericT>
+{
+ unit_vector(vcl_size_t s, vcl_size_t ind, viennacl::context ctx = viennacl::context()) : implicit_vector_base<NumericT>(s, ind, 1, ctx)
+ {
+ assert( (ind < s) && bool("Provided index out of range!") );
+ }
+};
+
+
+/** @brief Represents a vector consisting of scalars 's' only, i.e. v[i] = s for all i. To be used as an initializer for viennacl::vector, vector_range, or vector_slice only. */
+template<typename NumericT>
+struct scalar_vector : public implicit_vector_base<NumericT>
+{
+ scalar_vector(vcl_size_t s, NumericT val, viennacl::context ctx = viennacl::context()) : implicit_vector_base<NumericT>(s, val, ctx) {}
+};
+
+template<typename NumericT>
+struct zero_vector : public scalar_vector<NumericT>
+{
+ zero_vector(vcl_size_t s, viennacl::context ctx = viennacl::context()) : scalar_vector<NumericT>(s, 0, ctx){}
+};
+
+
+/** @brief Common base class for dense vectors, vector ranges, and vector slices.
+ *
+ * @tparam NumericT The floating point type, either 'float' or 'double'
+ */
+template<class NumericT, typename SizeT /* see forwards.h for default type */, typename DistanceT /* see forwards.h for default type */>
+class vector_base
+{
+ typedef vector_base<NumericT, SizeT, DistanceT> self_type;
+
+public:
+ typedef scalar<NumericT> value_type;
+ typedef NumericT cpu_value_type;
+ typedef viennacl::backend::mem_handle handle_type;
+ typedef SizeT size_type;
+ typedef DistanceT difference_type;
+ typedef const_vector_iterator<NumericT, 1> const_iterator;
+ typedef vector_iterator<NumericT, 1> iterator;
+
+ /** @brief Returns the length of the vector (cf. std::vector) */
+ size_type size() const { return size_; }
+ /** @brief Returns the internal length of the vector, which is given by size() plus the extra memory due to padding the memory with zeros up to a multiple of 'AlignmentV' */
+ size_type internal_size() const { return internal_size_; }
+ /** @brief Returns the offset within the buffer */
+ size_type start() const { return start_; }
+ /** @brief Returns the stride within the buffer (in multiples of sizeof(NumericT)) */
+ size_type stride() const { return stride_; }
+ /** @brief Returns true if the size is zero */
+ bool empty() const { return size_ == 0; }
+ /** @brief Returns the memory handle. */
+ const handle_type & handle() const { return elements_; }
+ /** @brief Returns the memory handle. */
+ handle_type & handle() { return elements_; }
+ viennacl::memory_types memory_domain() const { return elements_.get_active_handle_id(); }
+
+ /** @brief Default constructor in order to be compatible with various containers.
+ */
+ explicit vector_base();
+
+ /** @brief An explicit constructor for wrapping an existing vector into a vector_range or vector_slice.
+ *
+ * @param h The existing memory handle from a vector/vector_range/vector_slice
+ * @param vec_size The length (i.e. size) of the buffer
+ * @param vec_start The offset from the beginning of the buffer identified by 'h'
+ * @param vec_stride Increment between two elements in the original buffer (in multiples of NumericT)
+ */
+ explicit vector_base(viennacl::backend::mem_handle & h, size_type vec_size, size_type vec_start, size_type vec_stride);
+
+ /** @brief Creates a vector and allocates the necessary memory */
+ explicit vector_base(size_type vec_size, viennacl::context ctx = viennacl::context());
+
+ // CUDA or host memory:
+ explicit vector_base(NumericT * ptr_to_mem, viennacl::memory_types mem_type, size_type vec_size, vcl_size_t start = 0, size_type stride = 1);
+
+#ifdef VIENNACL_WITH_OPENCL
+ /** @brief Create a vector from existing OpenCL memory
+ *
+ * Note: The provided memory must account for a possible AlignmentV, i.e. existing_mem must be at least of size internal_size()!
+ * This is trivially the case with the default alignment, but should be considered when using vector<> with an alignment parameter not equal to 1.
+ *
+ * @param existing_mem An OpenCL handle representing the memory
+ * @param vec_size The size of the vector.
+ */
+ explicit vector_base(cl_mem existing_mem, size_type vec_size, size_type start = 0, size_type stride = 1, viennacl::context ctx = viennacl::context());
+#endif
+
+ template<typename LHS, typename RHS, typename OP>
+ explicit vector_base(vector_expression<const LHS, const RHS, OP> const & proxy);
+
+ // Copy CTOR:
+ vector_base(const self_type & other);
+
+ // Conversion CTOR:
+ template<typename OtherNumericT>
+ vector_base(const vector_base<OtherNumericT> & v1);
+
+ /** @brief Assignment operator. The other vector must have the same size, unless this vector has not yet been initialized.
+ */
+ self_type & operator=(const self_type & vec);
+ /** @brief Implementation of the operation v1 = v2 @ alpha, where @ denotes either multiplication or division, and alpha is either a CPU or a GPU scalar
+ * @param proxy An expression template proxy class.
+ */
+ template<typename LHS, typename RHS, typename OP>
+ self_type & operator=(const vector_expression<const LHS, const RHS, OP> & proxy);
+ /** @brief Converts a vector of a different numeric type to the current numeric type */
+ template<typename OtherNumericT>
+ self_type & operator = (const vector_base<OtherNumericT> & v1);
+ /** @brief Creates the vector from the supplied unit vector. */
+ self_type & operator = (unit_vector<NumericT> const & v);
+ /** @brief Creates the vector from the supplied zero vector. */
+ self_type & operator = (zero_vector<NumericT> const & v);
+ /** @brief Creates the vector from the supplied scalar vector. */
+ self_type & operator = (scalar_vector<NumericT> const & v);
+
+
+ ///////////////////////////// Matrix Vector interaction start ///////////////////////////////////
+ /** @brief Operator overload for v1 = A * v2, where v1, v2 are vectors and A is a dense matrix.
+ * @param proxy An expression template proxy class
+ */
+ self_type & operator=(const viennacl::vector_expression< const matrix_base<NumericT>, const vector_base<NumericT>, viennacl::op_prod> & proxy);
+
+ //transposed_matrix_proxy:
+ /** @brief Operator overload for v1 = trans(A) * v2, where v1, v2 are vectors and A is a dense matrix.
+ * @param proxy An expression template proxy class
+ */
+ self_type & operator=(const vector_expression< const matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans >,
+ const vector_base<NumericT>,
+ op_prod> & proxy);
+
+ ///////////////////////////// Matrix Vector interaction end ///////////////////////////////////
+
+
+ //read-write access to an element of the vector
+ /** @brief Read-write access to a single element of the vector */
+ entry_proxy<NumericT> operator()(size_type index);
+ /** @brief Read-write access to a single element of the vector */
+ entry_proxy<NumericT> operator[](size_type index);
+ /** @brief Read access to a single element of the vector */
+ const_entry_proxy<NumericT> operator()(size_type index) const;
+ /** @brief Read access to a single element of the vector */
+ const_entry_proxy<NumericT> operator[](size_type index) const;
+ self_type & operator += (const self_type & vec);
+ self_type & operator -= (const self_type & vec);
+
+ /** @brief Scales a vector (or proxy) by a char (8-bit integer) */
+ self_type & operator *= (char val);
+ /** @brief Scales a vector (or proxy) by a short integer */
+ self_type & operator *= (short val);
+ /** @brief Scales a vector (or proxy) by an integer */
+ self_type & operator *= (int val);
+ /** @brief Scales a vector (or proxy) by a long integer */
+ self_type & operator *= (long val);
+ /** @brief Scales a vector (or proxy) by a single precision floating point value */
+ self_type & operator *= (float val);
+ /** @brief Scales a vector (or proxy) by a double precision floating point value */
+ self_type & operator *= (double val);
+
+
+ /** @brief Scales a vector (or proxy) by a char (8-bit integer) */
+ self_type & operator /= (char val);
+ /** @brief Scales a vector (or proxy) by a short integer */
+ self_type & operator /= (short val);
+ /** @brief Scales a vector (or proxy) by an integer */
+ self_type & operator /= (int val);
+ /** @brief Scales a vector (or proxy) by a long integer */
+ self_type & operator /= (long val);
+ /** @brief Scales a vector (or proxy) by a single precision floating point value */
+ self_type & operator /= (float val);
+ /** @brief Scales a vector (or proxy) by a double precision floating point value */
+ self_type & operator /= (double val);
+
+ /** @brief Scales the vector by a char (8-bit integer) 'alpha' and returns an expression template */
+ vector_expression< const self_type, const NumericT, op_mult>
+ operator * (char value) const;
+ /** @brief Scales the vector by a short integer 'alpha' and returns an expression template */
+ vector_expression< const self_type, const NumericT, op_mult>
+ operator * (short value) const;
+ /** @brief Scales the vector by an integer 'alpha' and returns an expression template */
+ vector_expression< const self_type, const NumericT, op_mult>
+ operator * (int value) const;
+ /** @brief Scales the vector by a long integer 'alpha' and returns an expression template */
+ vector_expression< const self_type, const NumericT, op_mult>
+ operator * (long value) const;
+ /** @brief Scales the vector by a single precision floating point value 'alpha' and returns an expression template */
+ vector_expression< const self_type, const NumericT, op_mult>
+ operator * (float value) const;
+ /** @brief Scales the vector by a double precision floating point value 'alpha' and returns an expression template */
+ vector_expression< const self_type, const NumericT, op_mult>
+ operator * (double value) const;
+
+ /** @brief Scales the vector by a char (8-bit integer) 'alpha' and returns an expression template */
+ vector_expression< const self_type, const NumericT, op_div>
+ operator / (char value) const;
+ /** @brief Scales the vector by a short integer 'alpha' and returns an expression template */
+ vector_expression< const self_type, const NumericT, op_div>
+ operator / (short value) const;
+ /** @brief Scales the vector by an integer 'alpha' and returns an expression template */
+ vector_expression< const self_type, const NumericT, op_div>
+ operator / (int value) const;
+ /** @brief Scales the vector by a long integer 'alpha' and returns an expression template */
+ vector_expression< const self_type, const NumericT, op_div>
+ operator / (long value) const;
+ /** @brief Scales the vector by a single precision floating point value 'alpha' and returns an expression template */
+ vector_expression< const self_type, const NumericT, op_div>
+ operator / (float value) const;
+ /** @brief Scales the vector by a double precision floating point value 'alpha' and returns an expression template */
+ vector_expression< const self_type, const NumericT, op_div>
+ operator / (double value) const;
+
+ /** @brief Sign flip for the vector. Emulated to be equivalent to -1.0 * vector */
+ vector_expression<const self_type, const NumericT, op_mult> operator-() const;
+ /** @brief Returns an iterator pointing to the beginning of the vector (STL like)*/
+ iterator begin();
+ /** @brief Returns an iterator pointing to the end of the vector (STL like)*/
+ iterator end();
+ /** @brief Returns a const-iterator pointing to the beginning of the vector (STL like)*/
+ const_iterator begin() const;
+ /** @brief Returns a const-iterator pointing to the end of the vector (STL like)*/
+ const_iterator end() const;
+ /** @brief Swaps the entries of the two vectors */
+ self_type & swap(self_type & other);
+
+ /** @brief Resets all entries to zero. Does not change the size of the vector. */
+ void clear();
+
+protected:
+
+ void set_handle(viennacl::backend::mem_handle const & h) { elements_ = h; }
+
+ /** @brief Swaps the handles of two vectors by swapping the OpenCL handles only, no data copy */
+ self_type & fast_swap(self_type & other);
+
+ /** @brief Pads vectors with alignment > 1 with trailing zeros if the internal size is larger than the visible size */
+ void pad();
+
+ void switch_memory_context(viennacl::context new_ctx);
+
+ //TODO: Think about implementing the following public member functions
+ //void insert_element(unsigned int i, NumericT val){}
+ //void erase_element(unsigned int i){}
+
+ //enlarge or reduce allocated memory and set unused memory to zero
+ /** @brief Resizes the allocated memory for the vector. Pads the memory to be a multiple of 'AlignmentV'
+ *
+ * @param new_size The new size of the vector
+ * @param preserve If true, old entries of the vector are preserved, otherwise they may be discarded.
+ */
+ void resize(size_type new_size, bool preserve = true);
+
+ /** @brief Resizes the allocated memory for the vector. Convenience function for setting an OpenCL context in case reallocation is needed
+ *
+ * @param new_size The new size of the vector
+ * @param ctx The context within which the new memory should be allocated
+ * @param preserve If true, old entries of the vector are preserved, otherwise they may be discarded.
+ */
+ void resize(size_type new_size, viennacl::context ctx, bool preserve = true);
+private:
+
+ void resize_impl(size_type new_size, viennacl::context ctx, bool preserve = true);
+
+ size_type size_;
+ size_type start_;
+ size_type stride_;
+ size_type internal_size_;
+ handle_type elements_;
+}; //vector_base
+
+/** \endcond */
+
+} // namespace viennacl
+
+#endif
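As with the implicit matrix types, the implicit vector types above are lightweight initializer descriptors. A short illustrative sketch (not part of the patch), using only the assignment operators declared in vector_base above:

    #include "viennacl/vector.hpp"

    int main()
    {
      viennacl::vector<float> v(10);

      v = viennacl::unit_vector<float>(10, 3);      // 1 at index 3, zeros elsewhere
      v = viennacl::scalar_vector<float>(10, 2.5f); // every entry set to 2.5
      v = viennacl::zero_vector<float>(10);         // all entries reset to zero

      v[7] = 1.0f;                                  // element access via entry_proxy
      return 0;
    }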
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/common.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/common.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/common.hpp
new file mode 100644
index 0000000..3b6ec76
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/common.hpp
@@ -0,0 +1,219 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_COMMON_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_COMMON_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/device_specific/builtin_database/common.hpp
+*
+* Common routines such as device lookup for the built-in device database.
+*/
+
+#include "viennacl/ocl/device_utils.hpp"
+
+#include "viennacl/scheduler/forwards.h"
+
+#include "viennacl/device_specific/forwards.h"
+
+namespace viennacl
+{
+namespace device_specific
+{
+namespace builtin_database
+{
+
+using scheduler::FLOAT_TYPE;
+using scheduler::DOUBLE_TYPE;
+using namespace viennacl::ocl;
+
+template<class ParamT>
+class database_type
+{
+public:
+
+ // Each nesting level is wrapped in a small struct instead of using nested std::map
+ // typedefs directly, as a workaround for Visual Studio template limitations.
+ struct expression_t{ typedef std::map<scheduler::statement_node_numeric_type, ParamT> map_t; map_t d; };
+ struct device_name_t{ typedef std::map<device_name_type, expression_t> map_t; map_t d; };
+ struct device_architecture_t{ typedef std::map<ocl::device_architecture_family, device_name_t> map_t; map_t d; };
+ struct device_type_t{ typedef std::map<device_type, device_architecture_t> map_t; map_t d; };
+ struct type{ typedef std::map<vendor_id_type, device_type_t> map_t; map_t d; };
+ type map;
+
+ database_type<ParamT> & operator()(vendor_id_type p0, device_type p1, ocl::device_architecture_family p2, device_name_type p3, scheduler::statement_node_numeric_type p4, ParamT const & p5)
+ {
+ map.d[p0].d[p1].d[p2].d[p3].d.insert(std::make_pair(p4, p5));
+ return *this;
+ }
+
+ database_type<ParamT> & add_1B(vendor_id_type p0, device_type p1, ocl::device_architecture_family p2, device_name_type p3, ParamT const & p5)
+ {
+ return (*this)(p0, p1, p2, p3, scheduler::CHAR_TYPE, p5)
+ (p0, p1, p2, p3, scheduler::UCHAR_TYPE, p5);
+ }
+
+ database_type<ParamT> & add_2B(vendor_id_type p0, device_type p1, ocl::device_architecture_family p2, device_name_type p3, ParamT const & p5)
+ {
+ return (*this)(p0, p1, p2, p3, scheduler::SHORT_TYPE, p5)
+ (p0, p1, p2, p3, scheduler::USHORT_TYPE, p5)
+ (p0, p1, p2, p3, scheduler::HALF_TYPE, p5);
+ }
+
+ database_type<ParamT> & add_4B(vendor_id_type p0, device_type p1, ocl::device_architecture_family p2, device_name_type p3, ParamT const & p5)
+ {
+ return (*this)(p0, p1, p2, p3, scheduler::INT_TYPE, p5)
+ (p0, p1, p2, p3, scheduler::UINT_TYPE, p5)
+ (p0, p1, p2, p3, scheduler::FLOAT_TYPE, p5);
+ }
+
+ database_type<ParamT> & add_8B(vendor_id_type p0, device_type p1, ocl::device_architecture_family p2, device_name_type p3, ParamT const & p5)
+ {
+ return (*this)(p0, p1, p2, p3, scheduler::LONG_TYPE, p5)
+ (p0, p1, p2, p3, scheduler::ULONG_TYPE, p5)
+ (p0, p1, p2, p3, scheduler::DOUBLE_TYPE, p5);
+ }
+
+ ParamT const & at(vendor_id_type p0, device_type p1, ocl::device_architecture_family p2, device_name_type p3, scheduler::statement_node_numeric_type p4) const
+ {
+ return viennacl::device_specific::at(
+ viennacl::device_specific::at(
+ viennacl::device_specific::at(
+ viennacl::device_specific::at(
+ viennacl::device_specific::at(map.d, p0).d,
+ p1).d,
+ p2).d,
+ p3).d,
+ p4);
+ }
+
+
+};
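Illustration of how the add_*B helpers above fan a single entry out over every numeric type of the given width. The parameter type int below is a stand-in chosen for brevity; real databases store kernel parameter structs such as matrix_product_template::parameters_type:

    // Hypothetical database with 'int' as the stored parameter type.
    database_type<int> db;

    // One call registers the value 42 for LONG_TYPE, ULONG_TYPE and DOUBLE_TYPE
    // (all 8-byte numeric types) under the generic fallback keys.
    db.add_8B(viennacl::ocl::unknown_id, CL_DEVICE_TYPE_GPU, viennacl::ocl::unknown, "", 42);

    // Retrieval uses the same key tuple plus the concrete numeric type:
    int const & p = db.at(viennacl::ocl::unknown_id, CL_DEVICE_TYPE_GPU,
                          viennacl::ocl::unknown, "", viennacl::scheduler::DOUBLE_TYPE);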
+
+
+template<typename StringT>
+StringT get_mapped_device_name(StringT const & device_name, vendor_id_type vendor_id)
+{
+ if (vendor_id == viennacl::ocl::nvidia_id)
+ {
+ vcl_size_t found=0;
+ if ((found = device_name.find("GeForce",0)) != std::string::npos)
+ {
+ if ((found = device_name.find_first_of("123456789", found)) != std::string::npos)
+ {
+ switch (device_name[found]) // GeForce 400 series mapped to GTX 470, GeForce 500 series mapped to GTX 570:
+ {
+ case '4' : return "GeForce GTX 470";
+ case '5' : return "GeForce GTX 570";
+ default: break; // since there is only one Kepler and one Maxwell device in the database, fallback works properly
+ }
+ }
+ }
+ else if ((found = device_name.find("Tesla",0)) != std::string::npos) // map Kepler-based Teslas to K20m
+ {
+ if (device_name.find("Tesla C10",0) != std::string::npos)
+ return "Tesla C2050";
+ else if (device_name.find("Tesla S10",0) != std::string::npos)
+ return "Tesla C2050";
+ else if (device_name.find("Tesla M20",0) != std::string::npos)
+ return "Tesla C2050";
+ else if (device_name.find("Tesla S20",0) != std::string::npos)
+ return "Tesla C2050";
+ else if (device_name.find("Tesla K",0) != std::string::npos) // all Kepler-based Teslas
+ return "Tesla K20m";
+ }
+ }
+
+ return device_name;
+}
+
+/** @brief Get the profile for a device and a descriptor
+*
+* There are built-in defaults for CPUs, Accelerators, GPUs.
+*/
+template<class NumericT, class ParamT>
+inline ParamT const & get_parameters(database_type<ParamT> const & database, viennacl::ocl::device const & device)
+{
+ scheduler::statement_node_numeric_type numeric_type = scheduler::statement_node_numeric_type(scheduler::result_of::numeric_type_id<NumericT>::value);
+
+ device_type dev_type = device.type() & device_type(0xFE); // chop off 'default' characterization
+ vendor_id_type vendor_id = device.vendor_id();
+ ocl::device_architecture_family device_architecture = device.architecture_family();
+ std::string const & device_name = device.name();
+
+
+ /*-Vendor ID-*/
+ // std::cout << "Looking up vendor ID..." << std::endl;
+ typename database_type<ParamT>::type::map_t::const_iterator vendor_it = database.map.d.find(vendor_id);
+ //Vendor not recognized => device type default
+ if (vendor_it==database.map.d.end())
+ return database.at(ocl::unknown_id, dev_type, ocl::unknown, "", numeric_type);
+
+ /*-Device Type-*/
+ // std::cout << "Looking up device type..." << std::endl;
+ typename database_type<ParamT>::device_type_t::map_t::const_iterator device_type_it = vendor_it->second.d.find(dev_type);
+ //Device type not recognized for this vendor => device type default
+ if (device_type_it==vendor_it->second.d.end())
+ return database.at(ocl::unknown_id, dev_type, ocl::unknown, "", numeric_type);
+
+ /*-Device Architecture-*/
+ // std::cout << "Looking up device architecture..." << std::endl;
+ typename database_type<ParamT>::device_architecture_t::map_t::const_iterator architecture_it = device_type_it->second.d.find(device_architecture);
+ //Architecture not found. We try to find the closest architecture available.
+ if (architecture_it==device_type_it->second.d.end())
+ {
+ typename database_type<ParamT>::device_architecture_t::map_t::const_iterator current_it = device_type_it->second.d.begin();
+ architecture_it = current_it;
+ int closest_arch = std::abs(static_cast<int>(current_it->first) - static_cast<int>(device_architecture));
+ while (current_it!=device_type_it->second.d.end())
+ {
+ int arch_diff = std::abs(static_cast<int>(current_it->first) - static_cast<int>(device_architecture));
+ if (arch_diff < closest_arch)
+ {
+ architecture_it = current_it;
+ closest_arch = arch_diff;
+ }
+ current_it++;
+ }
+ }
+
+ /*-Device Name-*/
+ std::string mapped_device_name = get_mapped_device_name(device_name, device.vendor_id());
+
+ typename database_type<ParamT>::device_name_t::map_t::const_iterator device_name_it = architecture_it->second.d.find(mapped_device_name);
+ //Name not found. We just take the first device for the architecture
+ if (device_name_it==architecture_it->second.d.end())
+ {
+ device_name_it = architecture_it->second.d.begin();
+ }
+
+ // std::cout << "Looking up expression name.." << std::endl;
+ /*-Expression-*/
+ typename database_type<ParamT>::expression_t::map_t::const_iterator expression_it = device_name_it->second.d.find(numeric_type);
+ //Expression not found => Vendor default
+ if (expression_it==device_name_it->second.d.end())
+ return database.at(ocl::unknown_id, dev_type, ocl::unknown, "", numeric_type);
+
+ // std::cout << "Device found in the database! Getting profile..." << std::endl;
+ //Everything okay. Return specific profile//
+ return expression_it->second;
+}
+
+
+}
+}
+}
+#endif
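A sketch of how the lookup above is typically driven, assuming an active OpenCL context and a database db of matrix-product parameters populated elsewhere (for example by the fallback headers that follow); the comments restate the fallback chain implemented in get_parameters:

    using namespace viennacl::device_specific;

    viennacl::ocl::device const & dev = viennacl::ocl::current_device();

    // Lookup order: vendor id -> device type -> architecture family -> device name
    // -> numeric type. An unknown vendor or device type falls back to the entry
    // registered under (unknown_id, device type, unknown, ""); an unknown architecture
    // falls back to the numerically closest one, an unknown name to the first entry.
    matrix_product_template::parameters_type const & params =
        builtin_database::get_parameters<float>(db, dev);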
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/accelerator/fallback.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/accelerator/fallback.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/accelerator/fallback.hpp
new file mode 100644
index 0000000..5eede89
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/accelerator/fallback.hpp
@@ -0,0 +1,85 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_ACCELERATOR_FALLBACK_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_ACCELERATOR_FALLBACK_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace accelerator{
+namespace fallback{
+
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+ db.add_4B(unknown_id, CL_DEVICE_TYPE_ACCELERATOR, unknown, "", matrix_product_template::parameters_type(1,16,32,16,1,1,1,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+ db.add_4B(unknown_id, CL_DEVICE_TYPE_ACCELERATOR, unknown, "", matrix_product_template::parameters_type(1,16,32,16,1,1,1,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+ db.add_4B(unknown_id, CL_DEVICE_TYPE_ACCELERATOR, unknown, "", matrix_product_template::parameters_type(1,16,32,16,1,1,1,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+ db.add_4B(unknown_id, CL_DEVICE_TYPE_ACCELERATOR, unknown, "", matrix_product_template::parameters_type(1,16,32,16,1,1,1,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+ db.add_8B(unknown_id, CL_DEVICE_TYPE_ACCELERATOR, unknown, "", matrix_product_template::parameters_type(1,16,32,16,1,1,1,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+ db.add_8B(unknown_id, CL_DEVICE_TYPE_ACCELERATOR, unknown, "", matrix_product_template::parameters_type(1,16,32,16,1,1,1,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+ db.add_8B(unknown_id, CL_DEVICE_TYPE_ACCELERATOR, unknown, "", matrix_product_template::parameters_type(1,16,32,16,1,1,1,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+ db.add_8B(unknown_id, CL_DEVICE_TYPE_ACCELERATOR, unknown, "", matrix_product_template::parameters_type(1,16,32,16,1,1,1,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+
+}
+}
+}
+}
+}
+}
+
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/cpu/fallback.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/cpu/fallback.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/cpu/fallback.hpp
new file mode 100644
index 0000000..ffaa9db
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/cpu/fallback.hpp
@@ -0,0 +1,84 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_CPU_FALLBACK_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_CPU_FALLBACK_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace cpu{
+namespace fallback{
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+ db.add_4B(unknown_id, CL_DEVICE_TYPE_CPU, unknown, "", matrix_product_template::parameters_type(1,8,8,1,4,4,4,FETCH_FROM_GLOBAL_STRIDED, FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+ db.add_4B(unknown_id, CL_DEVICE_TYPE_CPU, unknown, "", matrix_product_template::parameters_type(1,8,8,1,4,4,4,FETCH_FROM_GLOBAL_STRIDED, FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+ db.add_4B(unknown_id, CL_DEVICE_TYPE_CPU, unknown, "", matrix_product_template::parameters_type(1,8,8,1,4,4,4,FETCH_FROM_GLOBAL_STRIDED, FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+ db.add_4B(unknown_id, CL_DEVICE_TYPE_CPU, unknown, "", matrix_product_template::parameters_type(1,8,8,1,4,4,4,FETCH_FROM_GLOBAL_STRIDED, FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+ db.add_8B(unknown_id, CL_DEVICE_TYPE_CPU, unknown, "", matrix_product_template::parameters_type(1,8,8,1,4,4,4,FETCH_FROM_GLOBAL_STRIDED, FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+ db.add_8B(unknown_id, CL_DEVICE_TYPE_CPU, unknown, "", matrix_product_template::parameters_type(1,8,8,1,4,4,4,FETCH_FROM_GLOBAL_STRIDED, FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+ db.add_8B(unknown_id, CL_DEVICE_TYPE_CPU, unknown, "", matrix_product_template::parameters_type(1,8,8,1,4,4,4,FETCH_FROM_GLOBAL_STRIDED, FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+ db.add_8B(unknown_id, CL_DEVICE_TYPE_CPU, unknown, "", matrix_product_template::parameters_type(1,8,8,1,4,4,4,FETCH_FROM_GLOBAL_STRIDED, FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+
+}
+}
+}
+}
+}
+}
+
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/evergreen/cedar.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/evergreen/cedar.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/evergreen/cedar.hpp
new file mode 100644
index 0000000..b0e3a1c
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/evergreen/cedar.hpp
@@ -0,0 +1,64 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_EVERGREEN_CEDAR_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_EVERGREEN_CEDAR_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace amd{
+namespace evergreen{
+namespace cedar{
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+ db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::evergreen, "Cedar", matrix_product_template::parameters_type(1,8,8,8,4,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+ db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::evergreen, "Cedar", matrix_product_template::parameters_type(1,8,8,8,4,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+ db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::evergreen, "Cedar", matrix_product_template::parameters_type(1,8,8,8,4,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+ db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::evergreen, "Cedar", matrix_product_template::parameters_type(1,8,8,8,4,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/evergreen/cypress.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/evergreen/cypress.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/evergreen/cypress.hpp
new file mode 100644
index 0000000..d1179b8
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/evergreen/cypress.hpp
@@ -0,0 +1,65 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_EVERGREEN_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_EVERGREEN_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace amd{
+namespace evergreen{
+namespace cypress{
+
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+ db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::evergreen, "Cypress", matrix_product_template::parameters_type(1,8,16,32,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,16));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+ db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::evergreen, "Cypress", matrix_product_template::parameters_type(1,8,16,16,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,16));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+ db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::evergreen, "Cypress", matrix_product_template::parameters_type(4,32,4,8,4,1,4,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+ db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::evergreen, "Cypress", matrix_product_template::parameters_type(1,8,16,16,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,8));
+}
+
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/barts.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/barts.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/barts.hpp
new file mode 100644
index 0000000..2805a5c
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/barts.hpp
@@ -0,0 +1,64 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_NORTHERN_ISLANDS_BARTS_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_NORTHERN_ISLANDS_BARTS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace amd{
+namespace northern_islands{
+namespace barts{
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+ db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Barts", matrix_product_template::parameters_type(1,2,2,128,2,2,1,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+ db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Barts", matrix_product_template::parameters_type(1,8,8,16,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,4,32));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+ db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Barts", matrix_product_template::parameters_type(1,2,1,64,2,1,2,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+ db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Barts", matrix_product_template::parameters_type(1,8,8,8,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/devastator.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/devastator.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/devastator.hpp
new file mode 100644
index 0000000..018839e
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/devastator.hpp
@@ -0,0 +1,64 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_NORTHERN_ISLANDS_DEVASTATOR_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_NORTHERN_ISLANDS_DEVASTATOR_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace amd{
+namespace northern_islands{
+namespace devastator{
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+ db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Devastator", matrix_product_template::parameters_type(1,8,16,8,2,1,2,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+ db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Devastator", matrix_product_template::parameters_type(1,16,16,8,2,1,2,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+ db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Devastator", matrix_product_template::parameters_type(2,64,16,4,2,1,2,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+ db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Devastator", matrix_product_template::parameters_type(1,16,16,8,1,2,2,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,16));
+}
+
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/scrapper.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/scrapper.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/scrapper.hpp
new file mode 100644
index 0000000..9e1db25
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/scrapper.hpp
@@ -0,0 +1,64 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_NORTHERN_ISLANDS_SCRAPPER_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_NORTHERN_ISLANDS_SCRAPPER_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace amd{
+namespace northern_islands{
+namespace scrapper{
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+ db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Scrapper", matrix_product_template::parameters_type(1,8,16,32,2,1,2,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,16));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+ db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Scrapper", matrix_product_template::parameters_type(1,8,16,8,2,2,1,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+ db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Scrapper", matrix_product_template::parameters_type(2,32,2,4,2,1,2,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+ db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::northern_islands, "Scrapper", matrix_product_template::parameters_type(1,16,16,8,2,1,2,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,16));
+}
+
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/gmres.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/gmres.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/gmres.hpp
new file mode 100644
index 0000000..fb89742
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/gmres.hpp
@@ -0,0 +1,738 @@
+#ifndef VIENNACL_GMRES_HPP_
+#define VIENNACL_GMRES_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/gmres.hpp
+ @brief Implementations of the generalized minimum residual method are in this file.
+*/
+
+#include <vector>
+#include <cmath>
+#include <limits>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/traits/clear.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/context.hpp"
+#include "viennacl/meta/result_of.hpp"
+
+#include "viennacl/linalg/iterative_operations.hpp"
+#include "viennacl/vector_proxy.hpp"
+
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief A tag class for the GMRES solver. Used for supplying solver parameters and for dispatching the solve() function
+*/
+class gmres_tag //generalized minimum residual
+{
+public:
+ /** @brief The constructor
+ *
+ * @param tol Relative tolerance for the residual (solver quits if ||r|| < tol * ||r_initial||)
+ * @param max_iterations The maximum number of iterations (including restarts)
+ * @param krylov_dim The maximum dimension of the Krylov space before restart (number of restarts is found by max_iterations / krylov_dim)
+ */
+ gmres_tag(double tol = 1e-10, unsigned int max_iterations = 300, unsigned int krylov_dim = 20)
+ : tol_(tol), abs_tol_(0), iterations_(max_iterations), krylov_dim_(krylov_dim), iters_taken_(0), last_error_(0) {}
+
+ /** @brief Returns the relative tolerance */
+ double tolerance() const { return tol_; }
+
+ /** @brief Returns the absolute tolerance */
+ double abs_tolerance() const { return abs_tol_; }
+ /** @brief Sets the absolute tolerance */
+ void abs_tolerance(double new_tol) { if (new_tol >= 0) abs_tol_ = new_tol; }
+
+ /** @brief Returns the maximum number of iterations */
+ unsigned int max_iterations() const { return iterations_; }
+ /** @brief Returns the maximum dimension of the Krylov space before restart */
+ unsigned int krylov_dim() const { return krylov_dim_; }
+ /** @brief Returns the maximum number of GMRES restarts */
+ unsigned int max_restarts() const
+ {
+ unsigned int ret = iterations_ / krylov_dim_;
+ if (ret > 0 && (ret * krylov_dim_ == iterations_) )
+ return ret - 1;
+ return ret;
+ }
+
+ /** @brief Returns the number of solver iterations */
+ unsigned int iters() const { return iters_taken_; }
+ /** @brief Set the number of solver iterations (should only be modified by the solver) */
+ void iters(unsigned int i) const { iters_taken_ = i; }
+
+ /** @brief Returns the estimated relative error at the end of the solver run */
+ double error() const { return last_error_; }
+ /** @brief Sets the estimated relative error at the end of the solver run */
+ void error(double e) const { last_error_ = e; }
+
+private:
+ double tol_;
+ double abs_tol_;
+ unsigned int iterations_;
+ unsigned int krylov_dim_;
+
+ //return values from solver
+ mutable unsigned int iters_taken_;
+ mutable double last_error_;
+};
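For orientation, a minimal usage sketch of this tag (illustrative values; assumes A is e.g. a viennacl::compressed_matrix<double>, b a viennacl::vector<double>, and <iostream> is included):

  viennacl::linalg::gmres_tag my_gmres_tag(1e-8, 500, 30);  // rel. tolerance, max. iterations, Krylov dimension
  viennacl::vector<double> x = viennacl::linalg::solve(A, b, my_gmres_tag);

  std::cout << "iterations taken: " << my_gmres_tag.iters()
            << ", estimated relative error: " << my_gmres_tag.error() << std::endl;
  // max_restarts() evaluates to 16 here (500 / 30 via integer division)

Note that max_restarts() subtracts one when max_iterations is an exact multiple of krylov_dim, so the defaults (300, 20) yield 14 restarts rather than 15.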
+
+namespace detail
+{
+
+ template<typename SrcVectorT, typename DestVectorT>
+ void gmres_copy_helper(SrcVectorT const & src, DestVectorT & dest, vcl_size_t len, vcl_size_t start = 0)
+ {
+ for (vcl_size_t i=0; i<len; ++i)
+ dest[start+i] = src[start+i];
+ }
+
+ template<typename NumericT, typename DestVectorT>
+ void gmres_copy_helper(viennacl::vector<NumericT> const & src, DestVectorT & dest, vcl_size_t len, vcl_size_t start = 0)
+ {
+ typedef typename viennacl::vector<NumericT>::difference_type difference_type;
+ viennacl::copy( src.begin() + static_cast<difference_type>(start),
+ src.begin() + static_cast<difference_type>(start + len),
+ dest.begin() + static_cast<difference_type>(start));
+ }
+
+ /** @brief Computes the Householder vector 'hh_vec' which rotates 'input_vec' such that all entries below the j-th entry of 'input_vec' become zero.
+ *
+ * @param input_vec The input vector
+ * @param hh_vec The Householder vector defining the reflection (I - beta * hh_vec * hh_vec^T)
+ * @param beta The coefficient beta in (I - beta * hh_vec * hh_vec^T)
+ * @param mu The norm of the input vector part relevant for the reflection: norm_2(input_vec[j:size])
+ * @param j Index of the last nonzero entry in 'input_vec' after applying the reflection
+ */
+ template<typename VectorT, typename NumericT>
+ void gmres_setup_householder_vector(VectorT const & input_vec, VectorT & hh_vec, NumericT & beta, NumericT & mu, vcl_size_t j)
+ {
+ NumericT input_j = input_vec(j);
+
+ // copy entries from input vector to householder vector:
+ detail::gmres_copy_helper(input_vec, hh_vec, viennacl::traits::size(hh_vec) - (j+1), j+1);
+
+ NumericT sigma = viennacl::linalg::norm_2(hh_vec);
+ sigma *= sigma;
+
+ if (sigma <= 0)
+ {
+ beta = 0;
+ mu = input_j;
+ }
+ else
+ {
+ mu = std::sqrt(sigma + input_j*input_j);
+
+ NumericT hh_vec_0 = (input_j <= 0) ? (input_j - mu) : (-sigma / (input_j + mu));
+
+ beta = NumericT(2) * hh_vec_0 * hh_vec_0 / (sigma + hh_vec_0 * hh_vec_0);
+
+ //divide hh_vec by its diagonal element hh_vec_0
+ hh_vec /= hh_vec_0;
+ hh_vec[j] = NumericT(1);
+ }
+ }
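To make the construction concrete, here is a self-contained sketch in plain C++ (hypothetical example data, no ViennaCL types) that builds the same Householder vector for j = 0 and applies the reflection; every entry below the j-th one is annihilated and the j-th entry becomes mu:

  #include <cmath>
  #include <cstdio>
  #include <vector>

  int main()
  {
    std::vector<double> x;                      // input vector (3, 4, 0, 12)
    x.push_back(3.0); x.push_back(4.0); x.push_back(0.0); x.push_back(12.0);
    std::size_t j = 0;

    double sigma = 0;                           // squared norm of the part below entry j
    for (std::size_t i = j + 1; i < x.size(); ++i)
      sigma += x[i] * x[i];

    std::vector<double> h(x.size(), 0.0);
    double beta = 0;
    double mu = x[j];
    if (sigma > 0)
    {
      mu = std::sqrt(sigma + x[j] * x[j]);                             // norm of x[j:], here 13
      double h0 = (x[j] <= 0) ? (x[j] - mu) : (-sigma / (x[j] + mu));  // stable evaluation of x[j] - mu
      beta = 2.0 * h0 * h0 / (sigma + h0 * h0);
      for (std::size_t i = j + 1; i < x.size(); ++i)
        h[i] = x[i] / h0;
      h[j] = 1.0;
    }

    // apply (I - beta * h * h^T) to x:
    double h_in_x = 0;
    for (std::size_t i = 0; i < x.size(); ++i) h_in_x += h[i] * x[i];
    for (std::size_t i = 0; i < x.size(); ++i) x[i] -= beta * h_in_x * h[i];

    for (std::size_t i = 0; i < x.size(); ++i) std::printf("%g ", x[i]);  // prints: 13 0 0 0
    std::printf("\n");
    return 0;
  }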
+
+ // Apply (I - beta h h^T) to x (Householder reflection with Householder vector h)
+ template<typename VectorT, typename NumericT>
+ void gmres_householder_reflect(VectorT & x, VectorT const & h, NumericT beta)
+ {
+ NumericT hT_in_x = viennacl::linalg::inner_prod(h, x);
+ x -= (beta * hT_in_x) * h;
+ }
+
+
+ /** @brief Implementation of a pipelined GMRES solver without preconditioner
+ *
+ * Follows Algorithm 2.1 proposed by Walker in "A Simpler GMRES", but uses classical Gram-Schmidt instead of modified Gram-Schmidt for better parallelization.
+ * Uses some pipelining techniques to minimize host-device transfers.
+ *
+ * @param A The system matrix
+ * @param rhs The load vector
+ * @param tag Solver configuration tag
+ * @param monitor A callback routine which is called at each GMRES restart
+ * @param monitor_data Data pointer to be passed to the callback routine to pass on user-specific data
+ * @return The result vector
+ */
+ template <typename MatrixType, typename ScalarType>
+ viennacl::vector<ScalarType> pipelined_solve(MatrixType const & A,
+ viennacl::vector<ScalarType> const & rhs,
+ gmres_tag const & tag,
+ viennacl::linalg::no_precond,
+ bool (*monitor)(viennacl::vector<ScalarType> const &, ScalarType, void*) = NULL,
+ void *monitor_data = NULL)
+ {
+ viennacl::vector<ScalarType> residual(rhs);
+ viennacl::vector<ScalarType> result = viennacl::zero_vector<ScalarType>(rhs.size(), viennacl::traits::context(rhs));
+
+ viennacl::vector<ScalarType> device_krylov_basis(rhs.internal_size() * tag.krylov_dim(), viennacl::traits::context(rhs)); // not using viennacl::matrix here because of spurious padding in column number
+ viennacl::vector<ScalarType> device_buffer_R(tag.krylov_dim()*tag.krylov_dim(), viennacl::traits::context(rhs));
+ std::vector<ScalarType> host_buffer_R(device_buffer_R.size());
+
+ vcl_size_t buffer_size_per_vector = 128;
+ vcl_size_t num_buffer_chunks = 3;
+ viennacl::vector<ScalarType> device_inner_prod_buffer = viennacl::zero_vector<ScalarType>(num_buffer_chunks*buffer_size_per_vector, viennacl::traits::context(rhs)); // temporary buffer
+ viennacl::vector<ScalarType> device_r_dot_vk_buffer = viennacl::zero_vector<ScalarType>(buffer_size_per_vector * tag.krylov_dim(), viennacl::traits::context(rhs)); // holds result of first reduction stage for <r, v_k> on device
+ viennacl::vector<ScalarType> device_vi_in_vk_buffer = viennacl::zero_vector<ScalarType>(buffer_size_per_vector * tag.krylov_dim(), viennacl::traits::context(rhs)); // holds <v_i, v_k> for i=0..k-1 on device
+ viennacl::vector<ScalarType> device_values_xi_k = viennacl::zero_vector<ScalarType>(tag.krylov_dim(), viennacl::traits::context(rhs)); // holds values \xi_k = <r, v_k> on device
+ std::vector<ScalarType> host_r_dot_vk_buffer(device_r_dot_vk_buffer.size());
+ std::vector<ScalarType> host_values_xi_k(tag.krylov_dim());
+ std::vector<ScalarType> host_values_eta_k_buffer(tag.krylov_dim());
+ std::vector<ScalarType> host_update_coefficients(tag.krylov_dim());
+
+ ScalarType norm_rhs = viennacl::linalg::norm_2(residual);
+ ScalarType rho_0 = norm_rhs;
+ ScalarType rho = ScalarType(1);
+
+ tag.iters(0);
+
+ for (unsigned int restart_count = 0; restart_count <= tag.max_restarts(); ++restart_count)
+ {
+ //
+ // prepare restart:
+ //
+ if (restart_count > 0)
+ {
+ // compute new residual without introducing a temporary for A*x:
+ residual = viennacl::linalg::prod(A, result);
+ residual = rhs - residual;
+
+ rho_0 = viennacl::linalg::norm_2(residual);
+ }
+
+ if (rho_0 <= ScalarType(tag.abs_tolerance())) // trivial right hand side?
+ break;
+
+ residual /= rho_0;
+ rho = ScalarType(1);
+
+ // check for convergence:
+ if (rho_0 / norm_rhs < tag.tolerance() || rho_0 < tag.abs_tolerance())
+ break;
+
+ //
+ // minimize in Krylov basis:
+ //
+ vcl_size_t k = 0;
+ for (k = 0; k < static_cast<vcl_size_t>(tag.krylov_dim()); ++k)
+ {
+ if (k == 0)
+ {
+ // compute v0 = A*r and perform first reduction stage for ||v0||
+ viennacl::vector_range<viennacl::vector<ScalarType> > v0(device_krylov_basis, viennacl::range(0, rhs.size()));
+ viennacl::linalg::pipelined_gmres_prod(A, residual, v0, device_inner_prod_buffer);
+
+ // Normalize v_0 and compute first reduction stage for <r, v_0> in device_r_dot_vk_buffer:
+ viennacl::linalg::pipelined_gmres_normalize_vk(v0, residual,
+ device_buffer_R, k*tag.krylov_dim() + k,
+ device_inner_prod_buffer, device_r_dot_vk_buffer,
+ buffer_size_per_vector, k*buffer_size_per_vector);
+ }
+ else
+ {
+ // compute v_k = A * v_{k-1} and perform first reduction stage for ||v_k||
+ viennacl::vector_range<viennacl::vector<ScalarType> > vk (device_krylov_basis, viennacl::range( k *rhs.internal_size(), k *rhs.internal_size() + rhs.size()));
+ viennacl::vector_range<viennacl::vector<ScalarType> > vk_minus_1(device_krylov_basis, viennacl::range((k-1)*rhs.internal_size(), (k-1)*rhs.internal_size() + rhs.size()));
+ viennacl::linalg::pipelined_gmres_prod(A, vk_minus_1, vk, device_inner_prod_buffer);
+
+ //
+ // Gram-Schmidt, stage 1: compute first reduction stage of <v_i, v_k>
+ //
+ viennacl::linalg::pipelined_gmres_gram_schmidt_stage1(device_krylov_basis, rhs.size(), rhs.internal_size(), k, device_vi_in_vk_buffer, buffer_size_per_vector);
+
+ //
+ // Gram-Schmidt, stage 2: compute second reduction stage of <v_i, v_k> and use that to compute v_k -= sum_i <v_i, v_k> v_i.
+ // Store <v_i, v_k> in R-matrix and compute first reduction stage for ||v_k||
+ //
+ viennacl::linalg::pipelined_gmres_gram_schmidt_stage2(device_krylov_basis, rhs.size(), rhs.internal_size(), k,
+ device_vi_in_vk_buffer,
+ device_buffer_R, tag.krylov_dim(),
+ device_inner_prod_buffer, buffer_size_per_vector);
+
+ //
+ // Normalize v_k and compute first reduction stage for <r, v_k> in device_r_dot_vk_buffer:
+ //
+ viennacl::linalg::pipelined_gmres_normalize_vk(vk, residual,
+ device_buffer_R, k*tag.krylov_dim() + k,
+ device_inner_prod_buffer, device_r_dot_vk_buffer,
+ buffer_size_per_vector, k*buffer_size_per_vector);
+ }
+ }
+
+ //
+ // Run reduction to obtain the values \xi_k = <r, v_k>.
+ // Note that unlike Algorithm 2.1 in Walker: "A Simpler GMRES", we do not update the residual
+ //
+ viennacl::fast_copy(device_r_dot_vk_buffer.begin(), device_r_dot_vk_buffer.end(), host_r_dot_vk_buffer.begin());
+ for (std::size_t i=0; i<k; ++i)
+ {
+ host_values_xi_k[i] = ScalarType(0);
+ for (std::size_t j=0; j<buffer_size_per_vector; ++j)
+ host_values_xi_k[i] += host_r_dot_vk_buffer[i*buffer_size_per_vector + j];
+ }
+
+ //
+ // Bring values in R back to host:
+ //
+ viennacl::fast_copy(device_buffer_R.begin(), device_buffer_R.end(), host_buffer_R.begin());
+
+ //
+ // Check for premature convergence: if a diagonal element of R drops too far below the first diagonal entry, we are done and restrict the Krylov size accordingly.
+ //
+ vcl_size_t full_krylov_dim = k; //needed for proper access to R
+ for (std::size_t i=0; i<k; ++i)
+ {
+ if (std::fabs(host_buffer_R[i + i*k]) < tag.tolerance() * host_buffer_R[0])
+ {
+ k = i;
+ break;
+ }
+ }
+
+
+ // Compute error estimator:
+ for (std::size_t i=0; i<k; ++i)
+ {
+ tag.iters( tag.iters() + 1 ); //increase iteration counter
+
+ // check for accumulation of round-off errors for poorly conditioned systems
+ if (host_values_xi_k[i] >= rho || host_values_xi_k[i] <= -rho)
+ {
+ k = i;
+ break; // restrict Krylov space at this point. No gain from using additional basis vectors, since orthogonality is lost.
+ }
+
+ // update error estimator
+ rho *= std::sin( std::acos(host_values_xi_k[i] / rho) );
+ }
+
+ //
+ // Solve minimization problem:
+ //
+ host_values_eta_k_buffer = host_values_xi_k;
+
+ for (int i2=static_cast<int>(k)-1; i2>-1; --i2)
+ {
+ vcl_size_t i = static_cast<vcl_size_t>(i2);
+ for (vcl_size_t j=static_cast<vcl_size_t>(i)+1; j<k; ++j)
+ host_values_eta_k_buffer[i] -= host_buffer_R[i + j*full_krylov_dim] * host_values_eta_k_buffer[j];
+
+ host_values_eta_k_buffer[i] /= host_buffer_R[i + i*full_krylov_dim];
+ }
+
+ //
+ // Update x += rho * z with z = \eta_0 * residual + sum_{i=0}^{k-1} \eta_{i+1} v_i
+ // Note that we have not updated the residual yet, hence this is slightly modified compared to the form given in Algorithm 2.1 in Walker: "A Simpler GMRES"
+ //
+ for (vcl_size_t i=0; i<k; ++i)
+ host_update_coefficients[i] = rho_0 * host_values_eta_k_buffer[i];
+
+ viennacl::fast_copy(host_update_coefficients.begin(), host_update_coefficients.end(), device_values_xi_k.begin()); //reuse device_values_xi_k_buffer here for simplicity
+
+ viennacl::linalg::pipelined_gmres_update_result(result, residual,
+ device_krylov_basis, rhs.size(), rhs.internal_size(),
+ device_values_xi_k, k);
+
+ tag.error( std::fabs(rho*rho_0 / norm_rhs) );
+
+ if (monitor && monitor(result, std::fabs(rho*rho_0 / norm_rhs), monitor_data))
+ break;
+ }
+
+ return result;
+ }
+
+ /** @brief Overload of the pipelined GMRES implementation for the ViennaCL sparse matrix types */
+ template<typename NumericT>
+ viennacl::vector<NumericT> solve_impl(viennacl::compressed_matrix<NumericT> const & A,
+ viennacl::vector<NumericT> const & rhs,
+ gmres_tag const & tag,
+ viennacl::linalg::no_precond,
+ bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+ void *monitor_data = NULL)
+ {
+ return pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+ }
+
+
+ /** @brief Overload of the pipelined GMRES implementation for the ViennaCL sparse matrix types */
+ template<typename NumericT>
+ viennacl::vector<NumericT> solve_impl(viennacl::coordinate_matrix<NumericT> const & A,
+ viennacl::vector<NumericT> const & rhs,
+ gmres_tag const & tag,
+ viennacl::linalg::no_precond,
+ bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+ void *monitor_data = NULL)
+ {
+ return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+ }
+
+
+
+ /** @brief Overload of the pipelined GMRES implementation for the ViennaCL sparse matrix types */
+ template<typename NumericT>
+ viennacl::vector<NumericT> solve_impl(viennacl::ell_matrix<NumericT> const & A,
+ viennacl::vector<NumericT> const & rhs,
+ gmres_tag const & tag,
+ viennacl::linalg::no_precond,
+ bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+ void *monitor_data = NULL)
+ {
+ return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+ }
+
+
+
+ /** @brief Overload of the pipelined GMRES implementation for the ViennaCL sparse matrix types */
+ template<typename NumericT>
+ viennacl::vector<NumericT> solve_impl(viennacl::sliced_ell_matrix<NumericT> const & A,
+ viennacl::vector<NumericT> const & rhs,
+ gmres_tag const & tag,
+ viennacl::linalg::no_precond,
+ bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+ void *monitor_data = NULL)
+ {
+ return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+ }
+
+
+ /** @brief Overload of the pipelined GMRES implementation for the ViennaCL sparse matrix types */
+ template<typename NumericT>
+ viennacl::vector<NumericT> solve_impl(viennacl::hyb_matrix<NumericT> const & A,
+ viennacl::vector<NumericT> const & rhs,
+ gmres_tag const & tag,
+ viennacl::linalg::no_precond,
+ bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
+ void *monitor_data = NULL)
+ {
+ return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
+ }
+
+
+ /** @brief Implementation of the GMRES solver.
+ *
+ * Following the algorithm proposed by Walker in "A Simpler GMRES"
+ *
+ * @param matrix The system matrix
+ * @param rhs The load vector
+ * @param tag Solver configuration tag
+ * @param precond A preconditioner. Precondition operation is done via member function apply()
+ * @param monitor A callback routine which is called at each GMRES restart
+ * @param monitor_data Data pointer to be passed to the callback routine to pass on user-specific data
+ *
+ * @return The result vector
+ */
+ template<typename MatrixT, typename VectorT, typename PreconditionerT>
+ VectorT solve_impl(MatrixT const & matrix,
+ VectorT const & rhs,
+ gmres_tag const & tag,
+ PreconditionerT const & precond,
+ bool (*monitor)(VectorT const &, typename viennacl::result_of::cpu_value_type<typename viennacl::result_of::value_type<VectorT>::type>::type, void*) = NULL,
+ void *monitor_data = NULL)
+ {
+ typedef typename viennacl::result_of::value_type<VectorT>::type NumericType;
+ typedef typename viennacl::result_of::cpu_value_type<NumericType>::type CPU_NumericType;
+
+ unsigned int problem_size = static_cast<unsigned int>(viennacl::traits::size(rhs));
+ VectorT result = rhs;
+ viennacl::traits::clear(result);
+
+ vcl_size_t krylov_dim = static_cast<vcl_size_t>(tag.krylov_dim());
+ if (problem_size < krylov_dim)
+ krylov_dim = problem_size; //A Krylov space larger than the matrix would lead to seg-faults (mathematically, error is certain to be zero already)
+
+ VectorT res = rhs;
+ VectorT v_k_tilde = rhs;
+ VectorT v_k_tilde_temp = rhs;
+
+ std::vector< std::vector<CPU_NumericType> > R(krylov_dim, std::vector<CPU_NumericType>(tag.krylov_dim()));
+ std::vector<CPU_NumericType> projection_rhs(krylov_dim);
+
+ std::vector<VectorT> householder_reflectors(krylov_dim, rhs);
+ std::vector<CPU_NumericType> betas(krylov_dim);
+
+ CPU_NumericType norm_rhs = viennacl::linalg::norm_2(rhs);
+
+ if (norm_rhs <= tag.abs_tolerance()) //solution is zero if RHS norm is zero
+ return result;
+
+ tag.iters(0);
+
+ for (unsigned int it = 0; it <= tag.max_restarts(); ++it)
+ {
+ //
+ // (Re-)Initialize residual: r = b - A*x (without temporary for the result of A*x)
+ //
+ res = viennacl::linalg::prod(matrix, result); //initial guess zero
+ res = rhs - res;
+ precond.apply(res);
+
+ CPU_NumericType rho_0 = viennacl::linalg::norm_2(res);
+
+ //
+ // Check for premature convergence
+ //
+ if (rho_0 / norm_rhs < tag.tolerance() || rho_0 < tag.abs_tolerance()) // norm_rhs is known to be nonzero here
+ {
+ tag.error(rho_0 / norm_rhs);
+ return result;
+ }
+
+ //
+ // Normalize residual and set 'rho' to 1 as requested in 'A Simpler GMRES' by Walker and Zhou.
+ //
+ res /= rho_0;
+ CPU_NumericType rho = static_cast<CPU_NumericType>(1.0);
+
+
+ //
+ // Iterate until the maximal Krylov space dimension is reached:
+ //
+ vcl_size_t k = 0;
+ for (k = 0; k < krylov_dim; ++k)
+ {
+ tag.iters( tag.iters() + 1 ); //increase iteration counter
+
+ // prepare storage:
+ viennacl::traits::clear(R[k]);
+ viennacl::traits::clear(householder_reflectors[k]);
+
+ //compute v_k = A * v_{k-1} via Householder matrices
+ if (k == 0)
+ {
+ v_k_tilde = viennacl::linalg::prod(matrix, res);
+ precond.apply(v_k_tilde);
+ }
+ else
+ {
+ viennacl::traits::clear(v_k_tilde);
+ v_k_tilde[k-1] = CPU_NumericType(1);
+
+ //Householder reflections, part 1: Compute P_1 * P_2 * ... * P_{k-1} * e_{k-1}
+ for (int i = static_cast<int>(k)-1; i > -1; --i)
+ detail::gmres_householder_reflect(v_k_tilde, householder_reflectors[vcl_size_t(i)], betas[vcl_size_t(i)]);
+
+ v_k_tilde_temp = viennacl::linalg::prod(matrix, v_k_tilde);
+ precond.apply(v_k_tilde_temp);
+ v_k_tilde = v_k_tilde_temp;
+
+ //Householder reflections, part 2: Compute P_{k-1} * ... * P_{1} * v_k_tilde
+ for (vcl_size_t i = 0; i < k; ++i)
+ detail::gmres_householder_reflect(v_k_tilde, householder_reflectors[i], betas[i]);
+ }
+
+ //
+ // Compute Householder reflection for v_k_tilde such that all entries below k-th entry are zero:
+ //
+ CPU_NumericType rho_k_k = 0;
+ detail::gmres_setup_householder_vector(v_k_tilde, householder_reflectors[k], betas[k], rho_k_k, k);
+
+ //
+ // copy first k entries from v_k_tilde to R[k] in order to fill k-th column with result of
+ // P_k * v_k_tilde = (v[0], ... , v[k-1], norm(v), 0, 0, ...) =: (rho_{1,k}, rho_{2,k}, ..., rho_{k,k}, 0, ..., 0);
+ //
+ detail::gmres_copy_helper(v_k_tilde, R[k], k);
+ R[k][k] = rho_k_k;
+
+ //
+ // Update residual: r = P_k r
+ // Set zeta_k = r[k] including machine precision considerations: mathematically we have |r[k]| <= rho
+ // Set rho *= sin(acos(r[k] / rho))
+ //
+ detail::gmres_householder_reflect(res, householder_reflectors[k], betas[k]);
+
+ if (res[k] > rho) //machine precision reached
+ res[k] = rho;
+ if (res[k] < -rho) //machine precision reached
+ res[k] = -rho;
+ projection_rhs[k] = res[k];
+
+ rho *= std::sin( std::acos(projection_rhs[k] / rho) );
+
+ if (std::fabs(rho * rho_0 / norm_rhs) < tag.tolerance()) // Residual is sufficiently reduced, stop here
+ {
+ tag.error( std::fabs(rho*rho_0 / norm_rhs) );
+ ++k;
+ break;
+ }
+ } // for k
+
+ //
+ // Triangular solver stage:
+ //
+
+ for (int i2=static_cast<int>(k)-1; i2>-1; --i2)
+ {
+ vcl_size_t i = static_cast<vcl_size_t>(i2);
+ for (vcl_size_t j=i+1; j<k; ++j)
+ projection_rhs[i] -= R[j][i] * projection_rhs[j]; //R is transposed
+
+ projection_rhs[i] /= R[i][i];
+ }
+
+ //
+ // Note: 'projection_rhs' now holds the solution (eta_1, ..., eta_k)
+ //
+
+ res *= projection_rhs[0];
+
+ if (k > 0)
+ {
+ for (unsigned int i = 0; i < k-1; ++i)
+ res[i] += projection_rhs[i+1];
+ }
+
+ //
+ // Form z in place in 'res' by applying P_1 * ... * P_{k}
+ //
+ for (int i=static_cast<int>(k)-1; i>=0; --i)
+ detail::gmres_householder_reflect(res, householder_reflectors[vcl_size_t(i)], betas[vcl_size_t(i)]);
+
+ res *= rho_0;
+ result += res; // x += rho_0 * z in the paper
+
+ //
+ // Check for convergence:
+ //
+ tag.error(std::fabs(rho*rho_0 / norm_rhs));
+
+ if (monitor && monitor(result, std::fabs(rho*rho_0 / norm_rhs), monitor_data))
+ break;
+
+ if ( tag.error() < tag.tolerance() )
+ return result;
+ }
+
+ return result;
+ }
+
+}
+
+template<typename MatrixT, typename VectorT, typename PreconditionerT>
+VectorT solve(MatrixT const & matrix, VectorT const & rhs, gmres_tag const & tag, PreconditionerT const & precond)
+{
+ return detail::solve_impl(matrix, rhs, tag, precond);
+}
+
+/** @brief Convenience overload for calling the preconditioned GMRES solver using types from the C++ STL.
+ *
+ * A std::vector<std::map<T, U> > matrix is convenient for e.g. finite element assembly.
+ * It is not the fastest option for setting up a system, but often it is fast enough - particularly for just trying things out.
+ */
+template<typename IndexT, typename NumericT, typename PreconditionerT>
+std::vector<NumericT> solve(std::vector< std::map<IndexT, NumericT> > const & A, std::vector<NumericT> const & rhs, gmres_tag const & tag, PreconditionerT const & precond)
+{
+ viennacl::compressed_matrix<NumericT> vcl_A;
+ viennacl::copy(A, vcl_A);
+
+ viennacl::vector<NumericT> vcl_rhs(rhs.size());
+ viennacl::copy(rhs, vcl_rhs);
+
+ viennacl::vector<NumericT> vcl_result = solve(vcl_A, vcl_rhs, tag, precond);
+
+ std::vector<NumericT> result(vcl_result.size());
+ viennacl::copy(vcl_result, result);
+ return result;
+}
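A usage sketch for this overload (system size, entries, and tolerances are illustrative only):

  std::vector< std::map<unsigned int, double> > A(4);    // 4x4 system assembled into STL types
  std::vector<double> rhs(4, 1.0);
  for (unsigned int i = 0; i < 4; ++i)
    A[i][i] = 2.0;                                        // simple diagonal test matrix

  viennacl::linalg::gmres_tag my_tag(1e-10, 100, 10);
  std::vector<double> x = viennacl::linalg::solve(A, rhs, my_tag, viennacl::linalg::no_precond());
  // for this diagonal example each x[i] ends up at 0.5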
+
+/** @brief Entry point for the unpreconditioned GMRES method.
+ *
+ * @param A The system matrix
+ * @param rhs Right hand side vector (load vector)
+ * @param tag A GMRES tag providing relative tolerances, etc.
+ */
+
+template<typename MatrixT, typename VectorT>
+VectorT solve(MatrixT const & A, VectorT const & rhs, gmres_tag const & tag)
+{
+ return solve(A, rhs, tag, no_precond());
+}
+
+
+
+template<typename VectorT>
+class gmres_solver
+{
+public:
+ typedef typename viennacl::result_of::cpu_value_type<VectorT>::type numeric_type;
+
+ gmres_solver(gmres_tag const & tag) : tag_(tag), monitor_callback_(NULL), user_data_(NULL) {}
+
+ template<typename MatrixT, typename PreconditionerT>
+ VectorT operator()(MatrixT const & A, VectorT const & b, PreconditionerT const & precond) const
+ {
+ if (viennacl::traits::size(init_guess_) > 0) // take initial guess into account
+ {
+ VectorT mod_rhs = viennacl::linalg::prod(A, init_guess_);
+ mod_rhs = b - mod_rhs;
+ VectorT y = detail::solve_impl(A, mod_rhs, tag_, precond, monitor_callback_, user_data_);
+ return init_guess_ + y;
+ }
+ return detail::solve_impl(A, b, tag_, precond, monitor_callback_, user_data_);
+ }
+
+
+ template<typename MatrixT>
+ VectorT operator()(MatrixT const & A, VectorT const & b) const
+ {
+ return operator()(A, b, viennacl::linalg::no_precond());
+ }
+
+ /** @brief Specifies an initial guess for the iterative solver.
+ *
+ * An iterative solver for Ax = b with initial guess x_0 is equivalent to an iterative solver for Ay = b' := b - Ax_0, where x = x_0 + y.
+ */
+ void set_initial_guess(VectorT const & x) { init_guess_ = x; }
+
+ /** @brief Sets a monitor function pointer to be called in each iteration. Set to NULL to run without monitor.
+ *
+ * The monitor function is called with the current guess for the result as first argument and the current relative residual estimate as second argument.
+ * The third argument is a pointer to user-defined data, through which additional information can be passed.
+ * This pointer needs to be set with set_monitor_data. If not set, NULL is passed.
+ * If the monitor function returns true, the solver terminates (whether due to convergence or divergence).
+ */
+ void set_monitor(bool (*monitor_fun)(VectorT const &, numeric_type, void *), void *user_data)
+ {
+ monitor_callback_ = monitor_fun;
+ user_data_ = user_data;
+ }
+
+ /** @brief Returns the solver tag containing basic configuration such as tolerances, etc. */
+ gmres_tag const & tag() const { return tag_; }
+
+private:
+ gmres_tag tag_;
+ VectorT init_guess_;
+ bool (*monitor_callback_)(VectorT const &, numeric_type, void *);
+ void *user_data_;
+};
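A sketch of how the monitor hook might be used (hypothetical callback and user data; A, b and x0 are assumed to be a ViennaCL matrix and vectors set up elsewhere):

  // hypothetical monitor: count invocations and stop once the relative residual estimate is tiny
  bool my_monitor(viennacl::vector<double> const & /*current_approximation*/, double rel_residual, void *user_data)
  {
    *static_cast<int*>(user_data) += 1;
    return rel_residual < 1e-12;   // returning true terminates the solver
  }

  // ...
  int monitor_calls = 0;
  viennacl::linalg::gmres_solver< viennacl::vector<double> > my_solver(viennacl::linalg::gmres_tag(1e-10, 300, 20));
  my_solver.set_monitor(my_monitor, &monitor_calls);
  my_solver.set_initial_guess(x0);
  viennacl::vector<double> x = my_solver(A, b);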
+
+
+}
+}
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/hankel_matrix_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/hankel_matrix_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/hankel_matrix_operations.hpp
new file mode 100644
index 0000000..43ca928
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/hankel_matrix_operations.hpp
@@ -0,0 +1,66 @@
+#ifndef VIENNACL_LINALG_HANKEL_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_HANKEL_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/hankel_matrix_operations.hpp
+ @brief Implementations of operations using hankel_matrix. Experimental.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/ocl/backend.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/fft.hpp"
+#include "viennacl/linalg/toeplitz_matrix_operations.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+
+// A * x
+
+/** @brief Carries out matrix-vector multiplication with a hankel_matrix
+*
+* Implementation of the convenience expression result = prod(mat, vec);
+*
+* @param A The matrix
+* @param vec The vector
+* @param result The result vector
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::hankel_matrix<NumericT, AlignmentV> const & A,
+ viennacl::vector_base<NumericT> const & vec,
+ viennacl::vector_base<NumericT> & result)
+{
+ assert(A.size1() == result.size() && bool("Dimension mismatch"));
+ assert(A.size2() == vec.size() && bool("Dimension mismatch"));
+
+ prod_impl(A.elements(), vec, result);
+ viennacl::linalg::reverse(result);
+}
+
+} //namespace linalg
+
+
+} //namespace viennacl
+
+
+#endif
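The implementation above delegates the product to the Toeplitz kernel of the underlying element storage and then reverses the result. This matches the identity H*x = reverse(T*x) for any matrix H obtained by reversing the rows of a Toeplitz matrix T; a small dense check with arbitrary values (independent of hankel_matrix's internal layout):

  #include <cstdio>

  int main()
  {
    const int n = 3;
    double T[n][n] = { {1, 2, 3},
                       {4, 1, 2},
                       {5, 4, 1} };   // constant along diagonals (Toeplitz)
    double H[n][n];                   // H(i,j) = T(n-1-i,j): constant along anti-diagonals (Hankel)
    for (int i = 0; i < n; ++i)
      for (int j = 0; j < n; ++j)
        H[i][j] = T[n - 1 - i][j];

    double x[n] = {1, -1, 2};
    double Tx[n], Hx[n];
    for (int i = 0; i < n; ++i)
    {
      Tx[i] = Hx[i] = 0;
      for (int j = 0; j < n; ++j) { Tx[i] += T[i][j] * x[j]; Hx[i] += H[i][j] * x[j]; }
    }

    for (int i = 0; i < n; ++i)       // each Hx[i] equals Tx[n-1-i], i.e. H*x = reverse(T*x)
      std::printf("Hx[%d] = %g, reverse(Tx)[%d] = %g\n", i, Hx[i], i, Tx[n - 1 - i]);
    return 0;
  }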
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/amg_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/amg_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/amg_operations.hpp
new file mode 100644
index 0000000..78bd150
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/amg_operations.hpp
@@ -0,0 +1,1123 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_AMG_OPERATIONS_HPP
+#define VIENNACL_LINALG_HOST_BASED_AMG_OPERATIONS_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/amg_operations.hpp
+ @brief Implementations of routines for AMG using the CPU on the host (with OpenMP if enabled).
+*/
+
+#include <cstdlib>
+#include <cmath>
+#include "viennacl/linalg/detail/amg/amg_base.hpp"
+
+#include <map>
+#include <set>
+#include <functional>
+#include <stdexcept>
+#ifdef VIENNACL_WITH_OPENMP
+#include <omp.h>
+#endif
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+namespace amg
+{
+
+
+///////////////////////////////////////////
+
+/** @brief Routine for taking all connections in the matrix as strong */
+template<typename NumericT>
+void amg_influence_trivial(compressed_matrix<NumericT> const & A,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ (void)tag;
+
+ unsigned int const * A_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+ unsigned int const * A_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+
+ unsigned int *influences_row_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_jumper_.handle());
+ unsigned int *influences_id_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_ids_.handle());
+ unsigned int *influences_values_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_values_.handle());
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i2=0; i2<static_cast<long>(A.size1()); ++i2)
+ {
+ vcl_size_t i = vcl_size_t(i2);
+ influences_row_ptr[i] = A_row_buffer[i];
+ influences_values_ptr[i] = A_row_buffer[i+1] - A_row_buffer[i];
+ }
+ influences_row_ptr[A.size1()] = A_row_buffer[A.size1()];
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i=0; i<long(A.nnz()); ++i)
+ influences_id_ptr[i] = A_col_buffer[i];
+}
+
+
+/** @brief Routine for extracting strongly connected points considering a user-provided threshold value */
+template<typename NumericT>
+void amg_influence_advanced(compressed_matrix<NumericT> const & A,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ NumericT const * A_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
+ unsigned int const * A_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+ unsigned int const * A_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+
+ unsigned int *influences_row_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_jumper_.handle());
+ unsigned int *influences_id_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_ids_.handle());
+
+ //
+ // Step 1: Scan influences in order to allocate the necessary memory
+ //
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i2=0; i2<static_cast<long>(A.size1()); ++i2)
+ {
+ vcl_size_t i = vcl_size_t(i2);
+ unsigned int row_start = A_row_buffer[i];
+ unsigned int row_stop = A_row_buffer[i+1];
+ NumericT diag = 0;
+ NumericT largest_positive = 0;
+ NumericT largest_negative = 0;
+ unsigned int num_influences = 0;
+
+ // obtain diagonal element as well as maximum positive and negative off-diagonal entries
+ for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index)
+ {
+ unsigned int col = A_col_buffer[nnz_index];
+ NumericT value = A_elements[nnz_index];
+
+ if (col == i)
+ diag = value;
+ else if (value > largest_positive)
+ largest_positive = value;
+ else if (value < largest_negative)
+ largest_negative = value;
+ }
+
+ if (largest_positive <= 0 && largest_negative >= 0) // no off-diagonal entries
+ {
+ influences_row_ptr[i] = 0;
+ continue;
+ }
+
+ // Find all points that strongly influence current point (Yang, p.5)
+ //std::cout << "Looking for strongly influencing points for point " << i << std::endl;
+ for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index)
+ {
+ unsigned int col = A_col_buffer[nnz_index];
+
+ if (i == col)
+ continue;
+
+ NumericT value = A_elements[nnz_index];
+
+ if ( (diag > 0 && diag * value <= tag.get_strong_connection_threshold() * diag * largest_negative)
+ || (diag < 0 && diag * value <= tag.get_strong_connection_threshold() * diag * largest_positive))
+ {
+ ++num_influences;
+ }
+ }
+
+ influences_row_ptr[i] = num_influences;
+ }
+
+ //
+ // Step 2: Exclusive scan on number of influences to obtain CSR-like datastructure
+ //
+ unsigned int current_entry = 0;
+ for (std::size_t i=0; i<A.size1(); ++i)
+ {
+ unsigned int tmp = influences_row_ptr[i];
+ influences_row_ptr[i] = current_entry;
+ current_entry += tmp;
+ }
+ influences_row_ptr[A.size1()] = current_entry;
+
+
+ //
+ // Step 3: Write actual influences
+ //
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i2=0; i2<static_cast<long>(A.size1()); ++i2)
+ {
+ unsigned int i = static_cast<unsigned int>(i2);
+ unsigned int row_start = A_row_buffer[i];
+ unsigned int row_stop = A_row_buffer[i+1];
+ NumericT diag = 0;
+ NumericT largest_positive = 0;
+ NumericT largest_negative = 0;
+
+ // obtain diagonal element as well as maximum positive and negative off-diagonal entries
+ for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index)
+ {
+ unsigned int col = A_col_buffer[nnz_index];
+ NumericT value = A_elements[nnz_index];
+
+ if (col == i)
+ diag = value;
+ else if (value > largest_positive)
+ largest_positive = value;
+ else if (value < largest_negative)
+ largest_negative = value;
+ }
+
+ if (largest_positive <= 0 && largest_negative >= 0) // no off-diagonal entries
+ continue;
+
+ // Find all points that strongly influence current point (Yang, p.5)
+ //std::cout << "Looking for strongly influencing points for point " << i << std::endl;
+ unsigned int *influences_id_write_ptr = influences_id_ptr + influences_row_ptr[i];
+ for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index)
+ {
+ unsigned int col = A_col_buffer[nnz_index];
+
+ if (i == col)
+ continue;
+
+ NumericT value = A_elements[nnz_index];
+
+ if ( (diag > 0 && diag * value <= tag.get_strong_connection_threshold() * diag * largest_negative)
+ || (diag < 0 && diag * value <= tag.get_strong_connection_threshold() * diag * largest_positive))
+ {
+ //std::cout << " - Adding influence from point " << col << std::endl;
+ *influences_id_write_ptr = col;
+ ++influences_id_write_ptr;
+ }
+ }
+ }
+
+}
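A worked example of the strong-influence test above, using hypothetical numbers for a single row with positive diagonal (diag = 4, off-diagonal entries -2, -0.1, 1, threshold 0.25). For diag > 0 the condition 'diag * value <= threshold * diag * largest_negative' reduces to 'value <= threshold * largest_negative':

  #include <cstdio>

  int main()
  {
    double threshold = 0.25;
    double off_diag[3] = {-2.0, -0.1, 1.0};

    double largest_negative = 0;
    for (int i = 0; i < 3; ++i)
      if (off_diag[i] < largest_negative)
        largest_negative = off_diag[i];    // -2 here, so the cutoff is 0.25 * (-2) = -0.5

    for (int i = 0; i < 3; ++i)
      std::printf("%5.2f -> %s\n", off_diag[i],
                  (off_diag[i] <= threshold * largest_negative) ? "strong" : "weak");
    // prints: -2.00 -> strong, -0.10 -> weak, 1.00 -> weak
    return 0;
  }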
+
+
+/** @brief Dispatcher for influence processing */
+template<typename NumericT>
+void amg_influence(compressed_matrix<NumericT> const & A,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ // TODO: dispatch based on influence tolerance provided
+ amg_influence_trivial(A, amg_context, tag);
+}
+
+
+
+/** @brief Assign IDs to coarse points */
+inline void enumerate_coarse_points(viennacl::linalg::detail::amg::amg_level_context & amg_context)
+{
+ unsigned int *point_types_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.point_types_.handle());
+ unsigned int *coarse_id_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.coarse_id_.handle());
+
+ unsigned int coarse_id = 0;
+ for (vcl_size_t i=0; i<amg_context.coarse_id_.size(); ++i)
+ {
+ //assert(point_types_ptr[i] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED && bool("Logic error in enumerate_coarse_points(): Undecided points detected!"));
+
+ if (point_types_ptr[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
+ coarse_id_ptr[i] = coarse_id++;
+ }
+
+ //std::cout << "Coarse nodes after enumerate_coarse_points(): " << coarse_id << std::endl;
+ amg_context.num_coarse_ = coarse_id;
+}
+
+
+
+
+//////////////////////////////////////
+
+
+/** @brief Helper struct for sequential classical one-pass coarsening */
+struct amg_id_influence
+{
+ amg_id_influence(std::size_t id2, std::size_t influences2) : id(static_cast<unsigned int>(id2)), influences(static_cast<unsigned int>(influences2)) {}
+
+ unsigned int id;
+ unsigned int influences;
+};
+
+inline bool operator>(amg_id_influence const & a, amg_id_influence const & b)
+{
+ if (a.influences > b.influences)
+ return true;
+ if (a.influences == b.influences)
+ return a.id > b.id;
+ return false;
+}
+
+/** @brief Classical (RS) one-pass coarsening. Single-Threaded! (VIENNACL_AMG_COARSE_CLASSIC_ONEPASS)
+*
+* @param A Operator matrix for the respective level
+* @param amg_context AMG datastructure object for the grid hierarchy
+* @param tag AMG preconditioner tag
+*/
+template<typename NumericT>
+void amg_coarse_classic_onepass(compressed_matrix<NumericT> const & A,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ unsigned int *point_types_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.point_types_.handle());
+ unsigned int *influences_row_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_jumper_.handle());
+ unsigned int *influences_id_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_ids_.handle());
+ unsigned int *influences_values_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_values_.handle());
+
+ std::set<amg_id_influence, std::greater<amg_id_influence> > points_by_influences;
+
+ amg_influence_advanced(A, amg_context, tag);
+
+ for (std::size_t i=0; i<A.size1(); ++i)
+ points_by_influences.insert(amg_id_influence(i, influences_values_ptr[i]));
+
+ //std::cout << "Starting coarsening process..." << std::endl;
+
+ while (!points_by_influences.empty())
+ {
+ amg_id_influence point = *(points_by_influences.begin());
+
+ // remove point from queue:
+ points_by_influences.erase(points_by_influences.begin());
+
+ //std::cout << "Working on point " << point.id << std::endl;
+
+ // point is already a coarse or fine point, so skip it
+ if (point_types_ptr[point.id] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
+ continue;
+
+ //std::cout << " Setting point " << point.id << " to a coarse point." << std::endl;
+ // make this a coarse point:
+ point_types_ptr[point.id] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE;
+
+ // Set strongly influenced points to fine points:
+ unsigned int j_stop = influences_row_ptr[point.id + 1];
+ for (unsigned int j = influences_row_ptr[point.id]; j < j_stop; ++j)
+ {
+ unsigned int influenced_point_id = influences_id_ptr[j];
+
+ //std::cout << "Checking point " << influenced_point_id << std::endl;
+ if (point_types_ptr[influenced_point_id] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
+ continue;
+
+ //std::cout << " Setting point " << influenced_point_id << " to a fine point." << std::endl;
+ point_types_ptr[influenced_point_id] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
+
+ // add one to influence measure for all undecided points strongly influencing this fine point.
+ unsigned int k_stop = influences_row_ptr[influenced_point_id + 1];
+ for (unsigned int k = influences_row_ptr[influenced_point_id]; k < k_stop; ++k)
+ {
+ unsigned int influenced_influenced_point_id = influences_id_ptr[k];
+ if (point_types_ptr[influenced_influenced_point_id] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
+ {
+ // grab and remove from set, increase influence counter, store back:
+ amg_id_influence point_to_find(influenced_influenced_point_id, influences_values_ptr[influenced_influenced_point_id]);
+ points_by_influences.erase(point_to_find);
+
+ point_to_find.influences += 1;
+ influences_values_ptr[influenced_influenced_point_id] += 1; // for consistency
+
+ points_by_influences.insert(point_to_find);
+ }
+ } //for
+ } // for
+
+ } // while
+
+ viennacl::linalg::host_based::amg::enumerate_coarse_points(amg_context);
+}
+
+
+//////////////////////////
+
+
+/** @brief AG (aggregation based) coarsening, single-threaded version of stage 1
+*
+* @param A Operator matrix for the respective level
+* @param amg_context AMG datastructure object for the grid hierarchy
+* @param tag AMG preconditioner tag
+*/
+template<typename NumericT>
+void amg_coarse_ag_stage1_sequential(compressed_matrix<NumericT> const & A,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ (void)tag;
+ unsigned int *point_types_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.point_types_.handle());
+ unsigned int *influences_row_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_jumper_.handle());
+ unsigned int *influences_id_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_ids_.handle());
+
+ for (unsigned int i=0; i<static_cast<unsigned int>(A.size1()); ++i)
+ {
+ // check if node has no aggregates next to it (MIS-2)
+ bool is_new_coarse_node = true;
+
+ // Set strongly influenced points to fine points:
+ unsigned int j_stop = influences_row_ptr[i + 1];
+ for (unsigned int j = influences_row_ptr[i]; j < j_stop; ++j)
+ {
+ unsigned int influenced_point_id = influences_id_ptr[j];
+ if (point_types_ptr[influenced_point_id] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED) // either coarse or fine point
+ {
+ is_new_coarse_node = false;
+ break;
+ }
+ }
+
+ if (is_new_coarse_node)
+ {
+ // make all strongly influenced neighbors fine points:
+ for (unsigned int j = influences_row_ptr[i]; j < j_stop; ++j)
+ {
+ unsigned int influenced_point_id = influences_id_ptr[j];
+ point_types_ptr[influenced_point_id] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
+ }
+
+ //std::cout << "Setting new coarse node: " << i << std::endl;
+ // Note: influences may include diagonal element, so it's important to *first* set fine points above before setting the coarse information here
+ point_types_ptr[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE;
+ }
+ }
+}
+
+
+
+/** @brief AG (aggregation based) coarsening, multi-threaded version of stage 1 using parallel maximum independent sets
+*
+* @param A Operator matrix for the respective level
+* @param amg_context AMG datastructure object for the grid hierarchy
+* @param tag AMG preconditioner tag
+*/
+template<typename NumericT>
+void amg_coarse_ag_stage1_mis2(compressed_matrix<NumericT> const & A,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ (void)tag;
+ unsigned int *point_types_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.point_types_.handle());
+ unsigned int *influences_row_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_jumper_.handle());
+ unsigned int *influences_id_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_ids_.handle());
+
+ std::vector<unsigned int> random_weights(A.size1());
+ for (std::size_t i=0; i<random_weights.size(); ++i)
+ random_weights[i] = static_cast<unsigned int>(rand()) % static_cast<unsigned int>(A.size1());
+
+ std::size_t num_threads = 1;
+#ifdef VIENNACL_WITH_OPENMP
+ num_threads = omp_get_max_threads();
+#endif
+
+ viennacl::vector<unsigned int> work_state(A.size1(), viennacl::traits::context(A));
+ viennacl::vector<unsigned int> work_random(A.size1(), viennacl::traits::context(A));
+ viennacl::vector<unsigned int> work_index(A.size1(), viennacl::traits::context(A));
+
+ unsigned int *work_state_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(work_state.handle());
+ unsigned int *work_random_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(work_random.handle());
+ unsigned int *work_index_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(work_index.handle());
+
+ viennacl::vector<unsigned int> work_state2(A.size1(), viennacl::traits::context(A));
+ viennacl::vector<unsigned int> work_random2(A.size1(), viennacl::traits::context(A));
+ viennacl::vector<unsigned int> work_index2(A.size1(), viennacl::traits::context(A));
+
+ unsigned int *work_state2_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(work_state2.handle());
+ unsigned int *work_random2_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(work_random2.handle());
+ unsigned int *work_index2_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(work_index2.handle());
+
+
+ unsigned int num_undecided = static_cast<unsigned int>(A.size1());
+ unsigned int pmis_iters = 0;
+ while (num_undecided > 0)
+ {
+ ++pmis_iters;
+
+ //
+ // init temporary work data:
+ //
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i2=0; i2<static_cast<long>(A.size1()); ++i2)
+ {
+ unsigned int i = static_cast<unsigned int>(i2);
+ switch (point_types_ptr[i])
+ {
+ case viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED: work_state_ptr[i] = 1; break;
+ case viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE: work_state_ptr[i] = 0; break;
+ case viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE: work_state_ptr[i] = 2; break;
+ default:
+ throw std::runtime_error("Unexpected state encountered in MIS2 setup for AMG.");
+ }
+
+ work_random_ptr[i] = random_weights[i];
+ work_index_ptr[i] = i;
+ }
+
+
+ //
+ // Propagate maximum tuple twice
+ //
+ for (unsigned int r = 0; r < 2; ++r)
+ {
+ // max operation
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i2=0; i2<static_cast<long>(A.size1()); ++i2)
+ {
+ unsigned int i = static_cast<unsigned int>(i2);
+ // load
+ unsigned int state = work_state_ptr[i];
+ unsigned int random = work_random_ptr[i];
+ unsigned int index = work_index_ptr[i];
+
+ // max
+ unsigned int j_stop = influences_row_ptr[i + 1];
+ for (unsigned int j = influences_row_ptr[i]; j < j_stop; ++j)
+ {
+ unsigned int influenced_point_id = influences_id_ptr[j];
+
+ // lexicographical triple-max (not particularly pretty, but does the job):
+ if (state < work_state_ptr[influenced_point_id])
+ {
+ state = work_state_ptr[influenced_point_id];
+ random = work_random_ptr[influenced_point_id];
+ index = work_index_ptr[influenced_point_id];
+ }
+ else if (state == work_state_ptr[influenced_point_id])
+ {
+ if (random < work_random_ptr[influenced_point_id])
+ {
+ state = work_state_ptr[influenced_point_id];
+ random = work_random_ptr[influenced_point_id];
+ index = work_index_ptr[influenced_point_id];
+ }
+ else if (random == work_random_ptr[influenced_point_id])
+ {
+ if (index < work_index_ptr[influenced_point_id])
+ {
+ state = work_state_ptr[influenced_point_id];
+ random = work_random_ptr[influenced_point_id];
+ index = work_index_ptr[influenced_point_id];
+ }
+ } // max(random)
+ } // max(state)
+ } // for
+
+ // store
+ work_state2_ptr[i] = state;
+ work_random2_ptr[i] = random;
+ work_index2_ptr[i] = index;
+ }
+
+ // copy work array
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i2=0; i2<static_cast<long>(A.size1()); ++i2)
+ {
+ unsigned int i = static_cast<unsigned int>(i2);
+ work_state_ptr[i] = work_state2_ptr[i];
+ work_random_ptr[i] = work_random2_ptr[i];
+ work_index_ptr[i] = work_index2_ptr[i];
+ }
+ }
+
+ //
+ // mark MIS and non-MIS nodes:
+ //
+ std::vector<unsigned int> thread_buffer(num_threads);
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i2=0; i2<static_cast<long>(A.size1()); ++i2)
+ {
+ unsigned int i = static_cast<unsigned int>(i2);
+ unsigned int max_state = work_state_ptr[i];
+ unsigned int max_index = work_index_ptr[i];
+
+ if (point_types_ptr[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
+ {
+ if (i == max_index) // make this a MIS node
+ point_types_ptr[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE;
+ else if (max_state == 2) // mind the mapping of viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE above!
+ point_types_ptr[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
+ else
+#ifdef VIENNACL_WITH_OPENMP
+ thread_buffer[omp_get_thread_num()] += 1;
+#else
+ thread_buffer[0] += 1;
+#endif
+ }
+ }
+
+ num_undecided = 0;
+ for (std::size_t i=0; i<thread_buffer.size(); ++i)
+ num_undecided += thread_buffer[i];
+ } // while
+
+ // consistency with sequential MIS: reset state for non-coarse points, so that coarse indices are correctly picked up later
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i=0; i<static_cast<long>(A.size1()); ++i)
+ if (point_types_ptr[i] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
+ point_types_ptr[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED;
+
+}
+
+
+
+/** @brief AG (aggregation based) coarsening. Partially single-threaded version (VIENNACL_AMG_COARSE_AG)
+*
+* @param A Operator matrix for the respective level
+* @param amg_context AMG datastructure object for the grid hierarchy
+* @param tag AMG preconditioner tag
+*/
+template<typename NumericT>
+void amg_coarse_ag(compressed_matrix<NumericT> const & A,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ unsigned int *point_types_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.point_types_.handle());
+ unsigned int *influences_row_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_jumper_.handle());
+ unsigned int *influences_id_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_ids_.handle());
+ unsigned int *coarse_id_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.coarse_id_.handle());
+
+ amg_influence_trivial(A, amg_context, tag);
+
+ //
+ // Stage 1: Build aggregates:
+ //
+ if (tag.get_coarsening_method() == viennacl::linalg::AMG_COARSENING_METHOD_AGGREGATION) amg_coarse_ag_stage1_sequential(A, amg_context, tag);
+ if (tag.get_coarsening_method() == viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION) amg_coarse_ag_stage1_mis2(A, amg_context, tag);
+
+ viennacl::linalg::host_based::amg::enumerate_coarse_points(amg_context);
+
+ //
+ // Stage 2: Propagate coarse aggregate indices to neighbors:
+ //
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i2=0; i2<static_cast<long>(A.size1()); ++i2)
+ {
+ unsigned int i = static_cast<unsigned int>(i2);
+ if (point_types_ptr[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
+ {
+ unsigned int coarse_index = coarse_id_ptr[i];
+
+ unsigned int j_stop = influences_row_ptr[i + 1];
+ for (unsigned int j = influences_row_ptr[i]; j < j_stop; ++j)
+ {
+ unsigned int influenced_point_id = influences_id_ptr[j];
+ coarse_id_ptr[influenced_point_id] = coarse_index; // Set aggregate index for fine point
+
+ if (influenced_point_id != i) // Note: Any write races between threads are harmless here
+ point_types_ptr[influenced_point_id] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
+ }
+ }
+ }
+
+
+ //
+ // Stage 3: Merge remaining undecided points (merging to the first aggregate found when cycling over the neighbors)
+ //
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i2=0; i2<static_cast<long>(A.size1()); ++i2)
+ {
+ unsigned int i = static_cast<unsigned int>(i2);
+ if (point_types_ptr[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
+ {
+ unsigned int j_stop = influences_row_ptr[i + 1];
+ for (unsigned int j = influences_row_ptr[i]; j < j_stop; ++j)
+ {
+ unsigned int influenced_point_id = influences_id_ptr[j];
+ if (point_types_ptr[influenced_point_id] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED) // either coarse or fine point
+ {
+ //std::cout << "Setting fine node " << i << " to be aggregated with node " << *influence_iter << "/" << pointvector.get_coarse_index(*influence_iter) << std::endl;
+ coarse_id_ptr[i] = coarse_id_ptr[influenced_point_id];
+ break;
+ }
+ }
+ }
+ }
+
+ //
+ // Stage 4: Set undecided points to fine points (coarse ID already set in Stage 3)
+ // Note: Stage 3 and Stage 4 were initially fused, but are now split in order to avoid race conditions (or a fallback to sequential execution).
+ //
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i=0; i<static_cast<long>(A.size1()); ++i)
+ if (point_types_ptr[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
+ point_types_ptr[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
+
+}
+
+
+
+
+/** @brief Entry point and dispatcher for coarsening procedures
+*
+* @param A Operator matrix for the respective level
+* @param amg_context AMG datastructure object for the grid hierarchy
+* @param tag AMG preconditioner tag
+*/
+template<typename MatrixT>
+void amg_coarse(MatrixT & A,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ switch (tag.get_coarsening_method())
+ {
+ case viennacl::linalg::AMG_COARSENING_METHOD_ONEPASS: amg_coarse_classic_onepass(A, amg_context, tag); break;
+ case viennacl::linalg::AMG_COARSENING_METHOD_AGGREGATION:
+ case viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION: amg_coarse_ag(A, amg_context, tag); break;
+ //default: throw std::runtime_error("not implemented yet");
+ }
+}
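For reference, selecting the MIS-2 aggregation path dispatched here might look as follows; the setter names are assumed to mirror the getters used in this file (get_coarsening_method(), get_strong_connection_threshold()) and should be checked against amg_base.hpp:

  viennacl::linalg::amg_tag my_amg_tag;
  my_amg_tag.set_coarsening_method(viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION);
  my_amg_tag.set_strong_connection_threshold(0.25);   // only relevant for the classical influence measure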
+
+
+
+
+////////////////////////////////////// Interpolation /////////////////////////////
+
+
+/** @brief Direct interpolation. Multi-threaded! (VIENNACL_AMG_INTERPOL_DIRECT)
+ *
+ * @param A Operator matrix
+ * @param P Prolongation matrix
+ * @param amg_context AMG hierarchy datastructures
+ * @param tag AMG preconditioner tag
+*/
+template<typename NumericT>
+void amg_interpol_direct(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & P,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ NumericT const * A_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
+ unsigned int const * A_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+ unsigned int const * A_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+
+ unsigned int *point_types_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.point_types_.handle());
+ unsigned int *influences_row_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_jumper_.handle());
+ unsigned int *influences_id_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_ids_.handle());
+ unsigned int *coarse_id_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.coarse_id_.handle());
+
+ P.resize(A.size1(), amg_context.num_coarse_, false);
+
+ std::vector<std::map<unsigned int, NumericT> > P_setup(A.size1());
+
+ // Iterate over all points to build the interpolation matrix row-by-row
+ // Coarse points interpolate to themselves with weight 1.
+ // Fine points receive row weights computed from their strongly influencing coarse neighbors (cf. Yang paper, p. 14)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long row2=0; row2<static_cast<long>(A.size1()); ++row2)
+ {
+ unsigned int row = static_cast<unsigned int>(row2);
+ std::map<unsigned int, NumericT> & P_setup_row = P_setup[row];
+ //std::cout << "Row " << row << ": " << std::endl;
+ if (point_types_ptr[row] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
+ {
+ //std::cout << " Setting value 1.0 at " << coarse_id_ptr[row] << std::endl;
+ P_setup_row[coarse_id_ptr[row]] = NumericT(1);
+ }
+ else if (point_types_ptr[row] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE)
+ {
+ //std::cout << "Building interpolant for fine point " << row << std::endl;
+
+ NumericT row_sum = 0;
+ NumericT row_coarse_sum = 0;
+ NumericT diag = 0;
+
+ // Compute the row sum of off-diagonal coefficients and the sum of coefficients of strongly influencing coarse points
+ unsigned int row_A_start = A_row_buffer[row];
+ unsigned int row_A_end = A_row_buffer[row + 1];
+ unsigned int const *influence_iter = influences_id_ptr + influences_row_ptr[row];
+ unsigned int const *influence_end = influences_id_ptr + influences_row_ptr[row + 1];
+ for (unsigned int index = row_A_start; index < row_A_end; ++index)
+ {
+ unsigned int col = A_col_buffer[index];
+ NumericT value = A_elements[index];
+
+ if (col == row)
+ {
+ diag = value;
+ continue;
+ }
+ else if (point_types_ptr[col] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
+ {
+ // Note: influence_iter never needs to rewind, because it traverses an ordered subset of the column indices in this row
+ while (influence_iter != influence_end && *influence_iter < col)
+ ++influence_iter;
+
+ if (influence_iter != influence_end && *influence_iter == col)
+ row_coarse_sum += value;
+ }
+
+ row_sum += value;
+ }
+
+ NumericT temp_res = -row_sum/(row_coarse_sum*diag);
+ //std::cout << "row_sum: " << row_sum << ", row_coarse_sum: " << row_coarse_sum << ", diag: " << diag << std::endl;
+
+ if (std::fabs(temp_res) > 1e-2 * std::fabs(diag))
+ {
+ // Iterate over all strongly influencing points to build the interpolant
+ influence_iter = influences_id_ptr + influences_row_ptr[row];
+ for (unsigned int index = row_A_start; index < row_A_end; ++index)
+ {
+ unsigned int col = A_col_buffer[index];
+ if (point_types_ptr[col] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
+ continue;
+ NumericT value = A_elements[index];
+
+ // Advance influence_iter to the current column index:
+ while (influence_iter != influence_end && *influence_iter < col)
+ ++influence_iter;
+
+ if (influence_iter != influence_end && *influence_iter == col)
+ {
+ //std::cout << " Setting entry " << temp_res * value << " at " << coarse_id_ptr[col] << " for point " << col << std::endl;
+ P_setup_row[coarse_id_ptr[col]] = temp_res * value;
+ }
+ }
+ }
+
+ // TODO truncate interpolation if specified by the user.
+ (void)tag;
+ }
+ else
+ throw std::runtime_error("Logic error in direct interpolation: Point is neither coarse-point nor fine-point!");
+ }
+
+ // TODO: P_setup can be avoided without sacrificing parallelism.
+ viennacl::tools::sparse_matrix_adapter<NumericT> P_adapter(P_setup, P.size1(), P.size2());
+ viennacl::copy(P_adapter, P);
+}
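For reference, the weights assembled above realize the classical direct interpolation formula (cf. the Yang reference cited in the comments): for a fine point i with strongly influencing coarse neighbors C_i,

    P_{ij} = -\frac{\sum_{k \neq i} a_{ik}}{a_{ii} \sum_{k \in C_i} a_{ik}} \, a_{ij}, \qquad j \in C_i,

while a coarse point i receives the single entry P_{i,\mathrm{coarse\_id}(i)} = 1. The variable temp_res holds the common prefactor; the guard against |temp_res| <= 0.01 |a_{ii}| skips rows whose prefactor is negligible.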
+
+
+/** @brief AG (aggregation-based) interpolation. Multi-threaded! (VIENNACL_INTERPOL_AG)
+ *
+ * @param A Operator matrix
+ * @param P Prolongation matrix
+ * @param amg_context AMG hierarchy datastructures
+ * @param tag AMG configuration tag
+*/
+template<typename NumericT>
+void amg_interpol_ag(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & P,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ (void)tag;
+ P = compressed_matrix<NumericT>(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A));
+
+ NumericT * P_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(P.handle());
+ unsigned int * P_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(P.handle1());
+ unsigned int * P_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(P.handle2());
+
+ unsigned int *coarse_id_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.coarse_id_.handle());
+
+ // Build interpolation matrix:
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long row2 = 0; row2 < long(A.size1()); ++row2)
+ {
+ unsigned int row = static_cast<unsigned int>(row2);
+ P_elements[row] = NumericT(1);
+ P_row_buffer[row] = row;
+ P_col_buffer[row] = coarse_id_ptr[row];
+ }
+ P_row_buffer[A.size1()] = static_cast<unsigned int>(A.size1()); // don't forget finalizer
+
+ P.generate_row_block_information();
+}
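The resulting P is the piecewise-constant prolongation of plain (unsmoothed) aggregation: each row contains exactly one entry with value 1 in the column of its aggregate. For example, if points {0, 1} form aggregate 0 and points {2, 3} form aggregate 1, then

    P = [ 1 0 ]
        [ 1 0 ]
        [ 0 1 ]
        [ 0 1 ]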
+
+
+/** @brief Smoothed aggregation interpolation. Multi-threaded! (VIENNACL_INTERPOL_SA)
+ *
+ * @param A Operator matrix
+ * @param P Prolongation matrix
+ * @param amg_context AMG hierarchy datastructures
+ * @param tag AMG configuration tag
+*/
+template<typename NumericT>
+void amg_interpol_sa(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & P,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ (void)tag;
+ viennacl::compressed_matrix<NumericT> P_tentative(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A));
+
+ // form tentative operator:
+ amg_interpol_ag(A, P_tentative, amg_context, tag);
+
+ unsigned int const * A_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+ unsigned int const * A_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+ NumericT const * A_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
+
+ viennacl::compressed_matrix<NumericT> Jacobi(A.size1(), A.size1(), A.nnz(), viennacl::traits::context(A));
+ unsigned int * Jacobi_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(Jacobi.handle1());
+ unsigned int * Jacobi_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(Jacobi.handle2());
+ NumericT * Jacobi_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(Jacobi.handle());
+
+
+ // Build Jacobi matrix:
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long row2=0; row2<static_cast<long>(A.size1()); ++row2)
+ {
+ unsigned int row = static_cast<unsigned int>(row2);
+ unsigned int row_begin = A_row_buffer[row];
+ unsigned int row_end = A_row_buffer[row+1];
+
+ Jacobi_row_buffer[row] = row_begin;
+
+ // Step 1: Extract diagonal:
+ NumericT diag = 0;
+ for (unsigned int j = row_begin; j < row_end; ++j)
+ {
+ if (A_col_buffer[j] == row)
+ {
+ diag = A_elements[j];
+ break;
+ }
+ }
+
+ // Step 2: Write entries:
+ for (unsigned int j = row_begin; j < row_end; ++j)
+ {
+ unsigned int col_index = A_col_buffer[j];
+ Jacobi_col_buffer[j] = col_index;
+
+ if (col_index == row)
+ Jacobi_elements[j] = NumericT(1) - NumericT(tag.get_jacobi_weight());
+ else
+ Jacobi_elements[j] = - NumericT(tag.get_jacobi_weight()) * A_elements[j] / diag;
+ }
+ }
+ Jacobi_row_buffer[A.size1()] = static_cast<unsigned int>(Jacobi.nnz()); // don't forget finalizer
+
+ P = viennacl::linalg::prod(Jacobi, P_tentative);
+
+ P.generate_row_block_information();
+}
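In matrix notation, the loop above assembles the weighted Jacobi smoother J = I - \omega D^{-1} A, with D = diag(A) and \omega the Jacobi weight taken from the tag, so the smoothed prolongation computed here is

    P = (I - \omega D^{-1} A) \, P_{\mathrm{tentative}}.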
+
+
+/** @brief Dispatcher for building the interpolation matrix
+ *
+ * @param A Operator matrix
+ * @param P Prolongation matrix
+ * @param amg_context AMG hierarchy datastructures
+ * @param tag AMG configuration tag
+*/
+template<typename MatrixT>
+void amg_interpol(MatrixT const & A,
+ MatrixT & P,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ switch (tag.get_interpolation_method())
+ {
+ case viennacl::linalg::AMG_INTERPOLATION_METHOD_DIRECT: amg_interpol_direct (A, P, amg_context, tag); break;
+ case viennacl::linalg::AMG_INTERPOLATION_METHOD_AGGREGATION: amg_interpol_ag (A, P, amg_context, tag); break;
+ case viennacl::linalg::AMG_INTERPOLATION_METHOD_SMOOTHED_AGGREGATION: amg_interpol_sa (A, P, amg_context, tag); break;
+ default: throw std::runtime_error("Not implemented yet!");
+ }
+}
+
+
+/** @brief Computes B = trans(A).
+ *
+ * To be replaced by native functionality in ViennaCL.
+ */
+template<typename NumericT>
+void amg_transpose(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & B)
+{
+ NumericT const * A_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
+ unsigned int const * A_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+ unsigned int const * A_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+
+ // initialize datastructures for B:
+ B = compressed_matrix<NumericT>(A.size2(), A.size1(), A.nnz(), viennacl::traits::context(A));
+
+ NumericT * B_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(B.handle());
+ unsigned int * B_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(B.handle1());
+ unsigned int * B_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(B.handle2());
+
+ // prepare uninitialized B_row_buffer:
+ for (std::size_t i = 0; i < B.size1(); ++i)
+ B_row_buffer[i] = 0;
+
+ //
+ // Stage 1: Compute pattern for B
+ //
+ for (std::size_t row = 0; row < A.size1(); ++row)
+ {
+ unsigned int row_start = A_row_buffer[row];
+ unsigned int row_stop = A_row_buffer[row+1];
+
+ for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index)
+ B_row_buffer[A_col_buffer[nnz_index]] += 1;
+ }
+
+ // Convert the per-row counts into CSR row offsets of B via an exclusive scan:
+ unsigned int offset = B_row_buffer[0];
+ B_row_buffer[0] = 0;
+ for (std::size_t row = 1; row < B.size1(); ++row)
+ {
+ unsigned int tmp = B_row_buffer[row];
+ B_row_buffer[row] = offset;
+ offset += tmp;
+ }
+ B_row_buffer[B.size1()] = offset;
+
+ //
+ // Stage 2: Fill with data
+ //
+
+ std::vector<unsigned int> B_row_offsets(B.size1()); //number of elements already written per row
+
+ for (std::size_t row = 0; row < A.size1(); ++row)
+ {
+ //std::cout << "Row " << row << ": ";
+ unsigned int row_start = A_row_buffer[row];
+ unsigned int row_stop = A_row_buffer[row+1];
+
+ for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index)
+ {
+ unsigned int col_in_A = A_col_buffer[nnz_index];
+ unsigned int B_nnz_index = B_row_buffer[col_in_A] + B_row_offsets[col_in_A];
+ B_col_buffer[B_nnz_index] = static_cast<unsigned int>(row);
+ B_elements[B_nnz_index] = A_elements[nnz_index];
+ ++B_row_offsets[col_in_A];
+ //B_temp.at(A_col_buffer[nnz_index])[row] = A_elements[nnz_index];
+ }
+ }
+
+ // Stage 3: Generate row-block information to make the data structure consistent
+ B.generate_row_block_information();
+}
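In short: Stage 1 counts, for each column j of A, the number of nonzeros it holds (i.e. the length of row j of B); the exclusive scan turns these counts into the CSR row offsets of B; and Stage 2 scatters each entry a_{ij} to position (j, i) of B, with B_row_offsets tracking how far each target row has been filled. As a small worked example, for

    A = [ 1 0 2 ]        column counts (1, 1, 1)  ->  row offsets of B: (0, 1, 2, 3)
        [ 0 3 0 ]

Stage 2 then writes B = trans(A): row 0 of B receives the value 1 (column 0), row 1 receives 3 (column 1), and row 2 receives 2 (column 0).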
+
+/** @brief Assigns the sparse matrix A to the dense matrix B. Only positions present in A's sparsity pattern are written; all other entries of B are left untouched. */
+template<typename NumericT, unsigned int AlignmentV>
+void assign_to_dense(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
+ viennacl::matrix_base<NumericT> & B)
+{
+ NumericT const * A_elements = detail::extract_raw_pointer<NumericT>(A.handle());
+ unsigned int const * A_row_buffer = detail::extract_raw_pointer<unsigned int>(A.handle1());
+ unsigned int const * A_col_buffer = detail::extract_raw_pointer<unsigned int>(A.handle2());
+
+ NumericT * B_elements = detail::extract_raw_pointer<NumericT>(B.handle());
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long row = 0; row < static_cast<long>(A.size1()); ++row)
+ {
+ unsigned int row_stop = A_row_buffer[row+1];
+
+ for (unsigned int nnz_index = A_row_buffer[row]; nnz_index < row_stop; ++nnz_index)
+ B_elements[static_cast<unsigned int>(row) * static_cast<unsigned int>(B.internal_size2()) + A_col_buffer[nnz_index]] = A_elements[nnz_index];
+ }
+
+}
+
+/** @brief Damped Jacobi smoother (host-based version; multi-threaded via OpenMP if enabled)
+*
+* @param iterations   Number of smoother iterations
+* @param A            Operator matrix used for smoothing
+* @param x            The vector the smoother is applied to
+* @param x_backup     Auxiliary vector (distinct from x) holding the previous iterate
+* @param rhs_smooth   Right-hand side of the smoothing equation
+* @param weight       Damping factor: 0 leaves x unchanged, 1 yields an undamped Jacobi iteration
+*/
+template<typename NumericT>
+void smooth_jacobi(unsigned int iterations,
+ compressed_matrix<NumericT> const & A,
+ vector<NumericT> & x,
+ vector<NumericT> & x_backup,
+ vector<NumericT> const & rhs_smooth,
+ NumericT weight)
+{
+
+ NumericT const * A_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
+ unsigned int const * A_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+ unsigned int const * A_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+ NumericT const * rhs_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(rhs_smooth.handle());
+
+ NumericT * x_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(x.handle());
+ NumericT const * x_old_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(x_backup.handle());
+
+ for (unsigned int i=0; i<iterations; ++i)
+ {
+ x_backup = x;
+
+ #ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+ #endif
+ for (long row2 = 0; row2 < static_cast<long>(A.size1()); ++row2)
+ {
+ unsigned int row = static_cast<unsigned int>(row2);
+ unsigned int col_end = A_row_buffer[row+1];
+
+ NumericT sum = NumericT(0);
+ NumericT diag = NumericT(1);
+ for (unsigned int index = A_row_buffer[row]; index != col_end; ++index)
+ {
+ unsigned int col = A_col_buffer[index];
+ if (col == row)
+ diag = A_elements[index];
+ else
+ sum += A_elements[index] * x_old_elements[col];
+ }
+
+ x_elements[row] = weight * (rhs_elements[row] - sum) / diag + (NumericT(1) - weight) * x_old_elements[row];
+ }
+ }
+}
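Each sweep therefore performs the standard damped Jacobi update

    x_i^{\mathrm{new}} = (1 - \omega) \, x_i^{\mathrm{old}} + \frac{\omega}{a_{ii}} \Big( b_i - \sum_{j \neq i} a_{ij} x_j^{\mathrm{old}} \Big),

with \omega = weight, b = rhs_smooth, and the previous iterate read from x_backup.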
+
+} //namespace amg
+} //namespace host_based
+} //namespace linalg
+} //namespace viennacl
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/common.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/common.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/common.hpp
new file mode 100644
index 0000000..8ddb4c1
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/common.hpp
@@ -0,0 +1,149 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_COMMON_HPP_
+#define VIENNACL_LINALG_HOST_BASED_COMMON_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/common.hpp
+ @brief Common routines for single-threaded or OpenMP-enabled execution on CPU
+*/
+
+#include "viennacl/traits/handle.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+namespace detail
+{
+
+template<typename ResultT, typename VectorT>
+ResultT * extract_raw_pointer(VectorT & vec)
+{
+ return reinterpret_cast<ResultT *>(viennacl::traits::ram_handle(vec).get());
+}
+
+template<typename ResultT, typename VectorT>
+ResultT const * extract_raw_pointer(VectorT const & vec)
+{
+ return reinterpret_cast<ResultT const *>(viennacl::traits::ram_handle(vec).get());
+}
+
+/** @brief Helper class for accessing a strided subvector of a larger vector. */
+template<typename NumericT>
+class vector_array_wrapper
+{
+public:
+ typedef NumericT value_type;
+
+ vector_array_wrapper(value_type * A,
+ vcl_size_t start,
+ vcl_size_t inc)
+ : A_(A),
+ start_(start),
+ inc_(inc) {}
+
+ value_type & operator()(vcl_size_t i) { return A_[i * inc_ + start_]; }
+
+private:
+ value_type * A_;
+ vcl_size_t start_;
+ vcl_size_t inc_;
+};
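A minimal illustration of this wrapper (buffer contents and stride values are hypothetical, not taken from the surrounding code): with start offset 1 and increment 2, v(i) addresses element 1 + 2*i of the underlying storage.

    double data[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    viennacl::linalg::host_based::detail::vector_array_wrapper<double> v(data, 1, 2);
    // v(0) == data[1] == 1,  v(1) == data[3] == 3,  v(3) == data[7] == 7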
+
+
+/** @brief Helper class for accessing a strided submatrix embedded in a larger matrix. */
+template<typename NumericT, typename LayoutT, bool is_transposed>
+class matrix_array_wrapper
+{
+ public:
+ typedef NumericT value_type;
+
+ matrix_array_wrapper(value_type * A,
+ vcl_size_t start1, vcl_size_t start2,
+ vcl_size_t inc1, vcl_size_t inc2,
+ vcl_size_t internal_size1, vcl_size_t internal_size2)
+ : A_(A),
+ start1_(start1), start2_(start2),
+ inc1_(inc1), inc2_(inc2),
+ internal_size1_(internal_size1), internal_size2_(internal_size2) {}
+
+ value_type & operator()(vcl_size_t i, vcl_size_t j)
+ {
+ return A_[LayoutT::mem_index(i * inc1_ + start1_,
+ j * inc2_ + start2_,
+ internal_size1_, internal_size2_)];
+ }
+
+ // convenience overloads to address signed index types for OpenMP:
+ value_type & operator()(vcl_size_t i, long j) { return operator()(i, static_cast<vcl_size_t>(j)); }
+ value_type & operator()(long i, vcl_size_t j) { return operator()(static_cast<vcl_size_t>(i), j); }
+ value_type & operator()(long i, long j) { return operator()(static_cast<vcl_size_t>(i), static_cast<vcl_size_t>(j)); }
+
+ private:
+ value_type * A_;
+ vcl_size_t start1_, start2_;
+ vcl_size_t inc1_, inc2_;
+ vcl_size_t internal_size1_, internal_size2_;
+};
+
+/** \cond */
+template<typename NumericT, typename LayoutT>
+class matrix_array_wrapper<NumericT, LayoutT, true>
+{
+public:
+ typedef NumericT value_type;
+
+ matrix_array_wrapper(value_type * A,
+ vcl_size_t start1, vcl_size_t start2,
+ vcl_size_t inc1, vcl_size_t inc2,
+ vcl_size_t internal_size1, vcl_size_t internal_size2)
+ : A_(A),
+ start1_(start1), start2_(start2),
+ inc1_(inc1), inc2_(inc2),
+ internal_size1_(internal_size1), internal_size2_(internal_size2) {}
+
+ value_type & operator()(vcl_size_t i, vcl_size_t j)
+ {
+ //swapping row and column indices here
+ return A_[LayoutT::mem_index(j * inc1_ + start1_,
+ i * inc2_ + start2_,
+ internal_size1_, internal_size2_)];
+ }
+
+ // convenience overloads to address signed index types for OpenMP:
+ value_type & operator()(vcl_size_t i, long j) { return operator()(i, static_cast<vcl_size_t>(j)); }
+ value_type & operator()(long i, vcl_size_t j) { return operator()(static_cast<vcl_size_t>(i), j); }
+ value_type & operator()(long i, long j) { return operator()(static_cast<vcl_size_t>(i), static_cast<vcl_size_t>(j)); }
+
+private:
+ value_type * A_;
+ vcl_size_t start1_, start2_;
+ vcl_size_t inc1_, inc2_;
+ vcl_size_t internal_size1_, internal_size2_;
+};
+/** \endcond */
+
+} //namespace detail
+} //namespace host_based
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas1_opencl.cu
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas1_opencl.cu b/native-viennaCL/src/main/cpp/libviennacl/src/blas1_opencl.cu
new file mode 100644
index 0000000..617b128
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas1_opencl.cu
@@ -0,0 +1,297 @@
+/* =========================================================================
+ Copyright (c) 2010-2014, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+//include the generic inner product functions of ViennaCL
+#include "viennacl/linalg/inner_prod.hpp"
+
+//include the generic norm functions of ViennaCL
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+
+// IxAMAX
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLiSamax(ViennaCLBackend backend, ViennaCLInt n,
+ ViennaCLInt *index,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLiDamax(ViennaCLBackend backend, ViennaCLInt n,
+ ViennaCLInt *index,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+ return ViennaCLSuccess;
+}
+
+
+
+
+// xASUM
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSasum(ViennaCLBackend backend, ViennaCLInt n,
+ float *alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ *alpha = viennacl::linalg::norm_1(v1);
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDasum(ViennaCLBackend backend, ViennaCLInt n,
+ double *alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ *alpha = viennacl::linalg::norm_1(v1);
+ return ViennaCLSuccess;
+}
+
+
+
+// xAXPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSaxpy(ViennaCLBackend backend, ViennaCLInt n,
+ float alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::vector_base<float> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ v2 += alpha * v1;
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDaxpy(ViennaCLBackend backend, ViennaCLInt n,
+ double alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::vector_base<double> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ v2 += alpha * v1;
+ return ViennaCLSuccess;
+}
+
+
+// xCOPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLScopy(ViennaCLBackend backend, ViennaCLInt n,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::vector_base<float> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ v2 = v1;
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDcopy(ViennaCLBackend backend, ViennaCLInt n,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::vector_base<double> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ v2 = v1;
+ return ViennaCLSuccess;
+}
+
+// xDOT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSdot(ViennaCLBackend backend, ViennaCLInt n,
+ float *alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::vector_base<float> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ *alpha = viennacl::linalg::inner_prod(v1, v2);
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDdot(ViennaCLBackend backend, ViennaCLInt n,
+ double *alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::vector_base<double> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ *alpha = viennacl::linalg::inner_prod(v1, v2);
+ return ViennaCLSuccess;
+}
+
+
+// xNRM2
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSnrm2(ViennaCLBackend backend, ViennaCLInt n,
+ float *alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ *alpha = viennacl::linalg::norm_2(v1);
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDnrm2(ViennaCLBackend backend, ViennaCLInt n,
+ double *alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ *alpha = viennacl::linalg::norm_2(v1);
+ return ViennaCLSuccess;
+}
+
+
+// xROT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSrot(ViennaCLBackend backend, ViennaCLInt n,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+ float c, float s)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::vector_base<float> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ viennacl::linalg::plane_rotation(v1, v2, c, s);
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDrot(ViennaCLBackend backend, ViennaCLInt n,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+ double c, double s)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::vector_base<double> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ viennacl::linalg::plane_rotation(v1, v2, c, s);
+ return ViennaCLSuccess;
+}
+
+
+
+// xSCAL
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSscal(ViennaCLBackend backend, ViennaCLInt n,
+ float alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ v1 *= alpha;
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDscal(ViennaCLBackend backend, ViennaCLInt n,
+ double alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ v1 *= alpha;
+ return ViennaCLSuccess;
+}
+
+// xSWAP
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSswap(ViennaCLBackend backend, ViennaCLInt n,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::vector_base<float> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ viennacl::swap(v1, v2);
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDswap(ViennaCLBackend backend, ViennaCLInt n,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::vector_base<double> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ viennacl::swap(v1, v2);
+ return ViennaCLSuccess;
+}
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas2.cpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas2.cpp b/native-viennaCL/src/main/cpp/libviennacl/src/blas2.cpp
new file mode 100644
index 0000000..bc2c095
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas2.cpp
@@ -0,0 +1,231 @@
+/* =========================================================================
+ Copyright (c) 2010-2014, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "init_vector.hpp"
+#include "init_matrix.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+// GEMV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLgemv(ViennaCLHostScalar alpha, ViennaCLMatrix A, ViennaCLVector x, ViennaCLHostScalar beta, ViennaCLVector y)
+{
+ viennacl::backend::mem_handle v1_handle;
+ viennacl::backend::mem_handle v2_handle;
+ viennacl::backend::mem_handle A_handle;
+
+ if (init_vector(v1_handle, x) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ if (init_vector(v2_handle, y) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ if (init_matrix(A_handle, A) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ switch (x->precision)
+ {
+ case ViennaCLFloat:
+ {
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+
+ viennacl::vector_base<float> v1(v1_handle, size_type(x->size), size_type(x->offset), difference_type(x->inc));
+ viennacl::vector_base<float> v2(v2_handle, size_type(y->size), size_type(y->offset), difference_type(y->inc));
+
+ viennacl::matrix_base<float> mat(A_handle,
+ size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+ size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+ v2 *= beta->value_float;
+ if (A->trans == ViennaCLTrans)
+ v2 += alpha->value_float * viennacl::linalg::prod(viennacl::trans(mat), v1);
+ else
+ v2 += alpha->value_float * viennacl::linalg::prod(mat, v1);
+
+ return ViennaCLSuccess;
+ }
+
+ case ViennaCLDouble:
+ {
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+
+ viennacl::vector_base<double> v1(v1_handle, size_type(x->size), size_type(x->offset), difference_type(x->inc));
+ viennacl::vector_base<double> v2(v2_handle, size_type(y->size), size_type(y->offset), difference_type(y->inc));
+
+ viennacl::matrix_base<double> mat(A_handle,
+ size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+ size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+ v2 *= beta->value_double;
+ if (A->trans == ViennaCLTrans)
+ v2 += alpha->value_double * viennacl::linalg::prod(viennacl::trans(mat), v1);
+ else
+ v2 += alpha->value_double * viennacl::linalg::prod(mat, v1);
+
+ return ViennaCLSuccess;
+ }
+
+ default:
+ return ViennaCLGenericFailure;
+ }
+}
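Put briefly, this routine performs the usual BLAS level-2 update y <- alpha * op(A) * x + beta * y, where op(A) is A or its transpose depending on A->trans, in either single or double precision.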
+
+
+// xTRSV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLtrsv(ViennaCLMatrix A, ViennaCLVector x, ViennaCLUplo uplo)
+{
+ viennacl::backend::mem_handle v1_handle;
+ viennacl::backend::mem_handle A_handle;
+
+ if (init_vector(v1_handle, x) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ if (init_matrix(A_handle, A) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ switch (x->precision)
+ {
+ case ViennaCLFloat:
+ {
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+
+ viennacl::vector_base<float> v1(v1_handle, size_type(x->size), size_type(x->offset), difference_type(x->inc));
+
+ viennacl::matrix_base<float> mat(A_handle,
+ size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+ size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+ if (A->trans == ViennaCLTrans)
+ {
+ if (uplo == ViennaCLUpper)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v1, viennacl::linalg::upper_tag());
+ else
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v1, viennacl::linalg::lower_tag());
+ }
+ else
+ {
+ if (uplo == ViennaCLUpper)
+ viennacl::linalg::inplace_solve(mat, v1, viennacl::linalg::upper_tag());
+ else
+ viennacl::linalg::inplace_solve(mat, v1, viennacl::linalg::lower_tag());
+ }
+
+ return ViennaCLSuccess;
+ }
+ case ViennaCLDouble:
+ {
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+
+ viennacl::vector_base<double> v1(v1_handle, size_type(x->size), size_type(x->offset), difference_type(x->inc));
+
+ viennacl::matrix_base<double> mat(A_handle,
+ size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+ size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+ if (A->trans == ViennaCLTrans)
+ {
+ if (uplo == ViennaCLUpper)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v1, viennacl::linalg::upper_tag());
+ else
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v1, viennacl::linalg::lower_tag());
+ }
+ else
+ {
+ if (uplo == ViennaCLUpper)
+ viennacl::linalg::inplace_solve(mat, v1, viennacl::linalg::upper_tag());
+ else
+ viennacl::linalg::inplace_solve(mat, v1, viennacl::linalg::lower_tag());
+ }
+
+ return ViennaCLSuccess;
+ }
+
+ default:
+ return ViennaCLGenericFailure;
+ }
+}
+
+
+// xGER
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLger(ViennaCLHostScalar alpha, ViennaCLVector x, ViennaCLVector y, ViennaCLMatrix A)
+{
+ viennacl::backend::mem_handle v1_handle;
+ viennacl::backend::mem_handle v2_handle;
+ viennacl::backend::mem_handle A_handle;
+
+ if (init_vector(v1_handle, x) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ if (init_vector(v2_handle, y) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ if (init_matrix(A_handle, A) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ switch (x->precision)
+ {
+ case ViennaCLFloat:
+ {
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+
+ viennacl::vector_base<float> v1(v1_handle, size_type(x->size), size_type(x->offset), difference_type(x->inc));
+ viennacl::vector_base<float> v2(v2_handle, size_type(y->size), size_type(y->offset), difference_type(y->inc));
+
+ viennacl::matrix_base<float> mat(A_handle,
+ size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+ size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+
+ mat += alpha->value_float * viennacl::linalg::outer_prod(v1, v2);
+
+ return ViennaCLSuccess;
+ }
+ case ViennaCLDouble:
+ {
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+
+ viennacl::vector_base<double> v1(v1_handle, size_type(x->size), size_type(x->offset), difference_type(x->inc));
+ viennacl::vector_base<double> v2(v2_handle, size_type(y->size), size_type(y->offset), difference_type(y->inc));
+
+ viennacl::matrix_base<double> mat(A_handle,
+ size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+ size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+
+ mat += alpha->value_double * viennacl::linalg::outer_prod(v1, v2);
+
+ return ViennaCLSuccess;
+ }
+ default:
+ return ViennaCLGenericFailure;
+ }
+}
+
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas2.cu
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas2.cu b/native-viennaCL/src/main/cpp/libviennacl/src/blas2.cu
new file mode 100644
index 0000000..bc2c095
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas2.cu
@@ -0,0 +1,231 @@
+/* =========================================================================
+ Copyright (c) 2010-2014, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "init_vector.hpp"
+#include "init_matrix.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+// GEMV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLgemv(ViennaCLHostScalar alpha, ViennaCLMatrix A, ViennaCLVector x, ViennaCLHostScalar beta, ViennaCLVector y)
+{
+ viennacl::backend::mem_handle v1_handle;
+ viennacl::backend::mem_handle v2_handle;
+ viennacl::backend::mem_handle A_handle;
+
+ if (init_vector(v1_handle, x) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ if (init_vector(v2_handle, y) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ if (init_matrix(A_handle, A) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ switch (x->precision)
+ {
+ case ViennaCLFloat:
+ {
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+
+ viennacl::vector_base<float> v1(v1_handle, size_type(x->size), size_type(x->offset), difference_type(x->inc));
+ viennacl::vector_base<float> v2(v2_handle, size_type(y->size), size_type(y->offset), difference_type(y->inc));
+
+ viennacl::matrix_base<float> mat(A_handle,
+ size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+ size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+ v2 *= beta->value_float;
+ if (A->trans == ViennaCLTrans)
+ v2 += alpha->value_float * viennacl::linalg::prod(viennacl::trans(mat), v1);
+ else
+ v2 += alpha->value_float * viennacl::linalg::prod(mat, v1);
+
+ return ViennaCLSuccess;
+ }
+
+ case ViennaCLDouble:
+ {
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+
+ viennacl::vector_base<double> v1(v1_handle, size_type(x->size), size_type(x->offset), difference_type(x->inc));
+ viennacl::vector_base<double> v2(v2_handle, size_type(y->size), size_type(y->offset), difference_type(y->inc));
+
+ viennacl::matrix_base<double> mat(A_handle,
+ size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+ size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+ v2 *= beta->value_double;
+ if (A->trans == ViennaCLTrans)
+ v2 += alpha->value_double * viennacl::linalg::prod(viennacl::trans(mat), v1);
+ else
+ v2 += alpha->value_double * viennacl::linalg::prod(mat, v1);
+
+ return ViennaCLSuccess;
+ }
+
+ default:
+ return ViennaCLGenericFailure;
+ }
+}
+
+
+// xTRSV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLtrsv(ViennaCLMatrix A, ViennaCLVector x, ViennaCLUplo uplo)
+{
+ viennacl::backend::mem_handle v1_handle;
+ viennacl::backend::mem_handle A_handle;
+
+ if (init_vector(v1_handle, x) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ if (init_matrix(A_handle, A) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ switch (x->precision)
+ {
+ case ViennaCLFloat:
+ {
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+
+ viennacl::vector_base<float> v1(v1_handle, size_type(x->size), size_type(x->offset), difference_type(x->inc));
+
+ viennacl::matrix_base<float> mat(A_handle,
+ size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+ size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+ if (A->trans == ViennaCLTrans)
+ {
+ if (uplo == ViennaCLUpper)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v1, viennacl::linalg::upper_tag());
+ else
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v1, viennacl::linalg::lower_tag());
+ }
+ else
+ {
+ if (uplo == ViennaCLUpper)
+ viennacl::linalg::inplace_solve(mat, v1, viennacl::linalg::upper_tag());
+ else
+ viennacl::linalg::inplace_solve(mat, v1, viennacl::linalg::lower_tag());
+ }
+
+ return ViennaCLSuccess;
+ }
+ case ViennaCLDouble:
+ {
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+
+ viennacl::vector_base<double> v1(v1_handle, size_type(x->size), size_type(x->offset), difference_type(x->inc));
+
+ viennacl::matrix_base<double> mat(A_handle,
+ size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+ size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+ if (A->trans == ViennaCLTrans)
+ {
+ if (uplo == ViennaCLUpper)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v1, viennacl::linalg::upper_tag());
+ else
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v1, viennacl::linalg::lower_tag());
+ }
+ else
+ {
+ if (uplo == ViennaCLUpper)
+ viennacl::linalg::inplace_solve(mat, v1, viennacl::linalg::upper_tag());
+ else
+ viennacl::linalg::inplace_solve(mat, v1, viennacl::linalg::lower_tag());
+ }
+
+ return ViennaCLSuccess;
+ }
+
+ default:
+ return ViennaCLGenericFailure;
+ }
+}
+
+
+// xGER
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLger(ViennaCLHostScalar alpha, ViennaCLVector x, ViennaCLVector y, ViennaCLMatrix A)
+{
+ viennacl::backend::mem_handle v1_handle;
+ viennacl::backend::mem_handle v2_handle;
+ viennacl::backend::mem_handle A_handle;
+
+ if (init_vector(v1_handle, x) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ if (init_vector(v2_handle, y) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ if (init_matrix(A_handle, A) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ switch (x->precision)
+ {
+ case ViennaCLFloat:
+ {
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+
+ viennacl::vector_base<float> v1(v1_handle, size_type(x->size), size_type(x->offset), difference_type(x->inc));
+ viennacl::vector_base<float> v2(v2_handle, size_type(y->size), size_type(y->offset), difference_type(y->inc));
+
+ viennacl::matrix_base<float> mat(A_handle,
+ size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+ size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+
+ mat += alpha->value_float * viennacl::linalg::outer_prod(v1, v2);
+
+ return ViennaCLSuccess;
+ }
+ case ViennaCLDouble:
+ {
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+
+ viennacl::vector_base<double> v1(v1_handle, size_type(x->size), size_type(x->offset), difference_type(x->inc));
+ viennacl::vector_base<double> v2(v2_handle, size_type(y->size), size_type(y->offset), difference_type(y->inc));
+
+ viennacl::matrix_base<double> mat(A_handle,
+ size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+ size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+
+ mat += alpha->value_double * viennacl::linalg::outer_prod(v1, v2);
+
+ return ViennaCLSuccess;
+ }
+ default:
+ return ViennaCLGenericFailure;
+ }
+}
+
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas2_cuda.cu
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas2_cuda.cu b/native-viennaCL/src/main/cpp/libviennacl/src/blas2_cuda.cu
new file mode 100644
index 0000000..60c9293
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas2_cuda.cu
@@ -0,0 +1,204 @@
+/* =========================================================================
+ Copyright (c) 2010-2014, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+#ifdef VIENNACL_WITH_CUDA
+
+// xGEMV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASgemv(ViennaCLBackend /*backend*/,
+ ViennaCLOrder order, ViennaCLTranspose transA,
+ ViennaCLInt m, ViennaCLInt n, float alpha, float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ float *x, ViennaCLInt offx, ViennaCLInt incx,
+ float beta,
+ float *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+ viennacl::vector_base<float> v2(y, viennacl::CUDA_MEMORY, m, offy, incy);
+ viennacl::matrix_base<float> mat(A, viennacl::CUDA_MEMORY,
+ m, offA_row, incA_row, m,
+ n, offA_col, incA_col, lda, order == ViennaCLRowMajor);
+ v2 *= beta;
+ if (transA == ViennaCLTrans)
+ v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+ else
+ v2 += alpha * viennacl::linalg::prod(mat, v1);
+
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADgemv(ViennaCLBackend /*backend*/,
+ ViennaCLOrder order, ViennaCLTranspose transA,
+ ViennaCLInt m, ViennaCLInt n, double alpha, double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ double *x, ViennaCLInt offx, ViennaCLInt incx,
+ double beta,
+ double *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+ viennacl::vector_base<double> v2(y, viennacl::CUDA_MEMORY, m, offy, incy);
+ viennacl::matrix_base<double> mat(A, viennacl::CUDA_MEMORY,
+ m, offA_row, incA_row, m,
+ n, offA_col, incA_col, lda, order == ViennaCLRowMajor);
+ v2 *= beta;
+ if (transA == ViennaCLTrans)
+ v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+ else
+ v2 += alpha * viennacl::linalg::prod(mat, v1);
+
+ return ViennaCLSuccess;
+}
+
+
+
+// xTRSV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDAStrsv(ViennaCLBackend /*backend*/,
+ ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+ ViennaCLInt n, float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ float *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ viennacl::vector_base<float> v(x, viennacl::CUDA_MEMORY, n, offx, incx);
+ viennacl::matrix_base<float> mat(A, viennacl::CUDA_MEMORY,
+ n, offA_row, incA_row, n,
+ n, offA_col, incA_col, lda, order == ViennaCLRowMajor);
+ if (transA == ViennaCLTrans)
+ {
+ if (uplo == ViennaCLUpper)
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_upper_tag());
+ else
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+ else
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_lower_tag());
+ else
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+ }
+ else
+ {
+ if (uplo == ViennaCLUpper)
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_upper_tag());
+ else
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+ else
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_lower_tag());
+ else
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+ }
+
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADtrsv(ViennaCLBackend /*backend*/,
+ ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+ ViennaCLInt n, double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ double *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ viennacl::vector_base<double> v(x, viennacl::CUDA_MEMORY, n, offx, incx);
+ viennacl::matrix_base<double> mat(A, viennacl::CUDA_MEMORY,
+ n, offA_row, incA_row, n,
+ n, offA_col, incA_col, lda, order == ViennaCLRowMajor);
+ if (transA == ViennaCLTrans)
+ {
+ if (uplo == ViennaCLUpper)
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_upper_tag());
+ else
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+ else
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_lower_tag());
+ else
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+ }
+ else
+ {
+ if (uplo == ViennaCLUpper)
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_upper_tag());
+ else
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+ else
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_lower_tag());
+ else
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+ }
+
+ return ViennaCLSuccess;
+}
+
+
+
+// xGER
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASger(ViennaCLBackend /*backend*/,
+ ViennaCLOrder order,
+ ViennaCLInt m, ViennaCLInt n,
+ float alpha,
+ float *x, ViennaCLInt offx, ViennaCLInt incx,
+ float *y, ViennaCLInt offy, ViennaCLInt incy,
+ float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda)
+{
+ viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+ viennacl::vector_base<float> v2(y, viennacl::CUDA_MEMORY, m, offy, incy);
+ viennacl::matrix_base<float> mat(A, viennacl::CUDA_MEMORY,
+ m, offA_row, incA_row, m,
+ n, offA_col, incA_col, lda, order == ViennaCLRowMajor);
+
+ mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADger(ViennaCLBackend /*backend*/,
+ ViennaCLOrder order,
+ ViennaCLInt m, ViennaCLInt n,
+ double alpha,
+ double *x, ViennaCLInt offx, ViennaCLInt incx,
+ double *y, ViennaCLInt offy, ViennaCLInt incy,
+ double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda)
+{
+ viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+ viennacl::vector_base<double> v2(y, viennacl::CUDA_MEMORY, m, offy, incy);
+ viennacl::matrix_base<double> mat(A, viennacl::CUDA_MEMORY,
+ m, offA_row, incA_row, m,
+ n, offA_col, incA_col, lda, order == ViennaCLRowMajor);
+
+ mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+
+ return ViennaCLSuccess;
+}
+
+#endif
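For orientation, a minimal caller of the CUDA GEMV wrapper above might look like the sketch below (illustrative only: it assumes the ViennaCLNoTrans enumerator that pairs with ViennaCLTrans, and it passes a null backend handle because this code path ignores the backend argument, as the commented-out parameter name indicates). The wrappers take raw device pointers and wrap them in viennacl::vector_base / viennacl::matrix_base handles without copying.

  #include <cuda_runtime.h>
  #include "viennacl.hpp"

  int main()
  {
    // y = 1.0 * A * x + 0.0 * y for a tightly packed 2x3 row-major matrix (lda = 3)
    float hA[6] = {1, 2, 3, 4, 5, 6}, hx[3] = {1, 1, 1}, hy[2] = {0, 0};
    float *dA, *dx, *dy;
    cudaMalloc(&dA, sizeof(hA)); cudaMalloc(&dx, sizeof(hx)); cudaMalloc(&dy, sizeof(hy));
    cudaMemcpy(dA, hA, sizeof(hA), cudaMemcpyHostToDevice);
    cudaMemcpy(dx, hx, sizeof(hx), cudaMemcpyHostToDevice);
    cudaMemcpy(dy, hy, sizeof(hy), cudaMemcpyHostToDevice);

    ViennaCLCUDASgemv(NULL, ViennaCLRowMajor, ViennaCLNoTrans,
                      2, 3, 1.0f, dA, 0, 0, 1, 1, 3,
                      dx, 0, 1,
                      0.0f,
                      dy, 0, 1);

    cudaMemcpy(hy, dy, sizeof(hy), cudaMemcpyDeviceToHost);   // hy is now {6, 15}
    cudaFree(dA); cudaFree(dx); cudaFree(dy);
    return 0;
  }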
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas2_host.cpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas2_host.cpp b/native-viennaCL/src/main/cpp/libviennacl/src/blas2_host.cpp
new file mode 100644
index 0000000..b1ef39a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas2_host.cpp
@@ -0,0 +1,219 @@
+/* =========================================================================
+ Copyright (c) 2010-2014, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+// xGEMV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSgemv(ViennaCLBackend /*backend*/,
+ ViennaCLOrder order, ViennaCLTranspose transA,
+ ViennaCLInt m, ViennaCLInt n, float alpha, float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ float *x, ViennaCLInt offx, ViennaCLInt incx,
+ float beta,
+ float *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+
+ viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, size_type(m), size_type(offy), difference_type(incy));
+ viennacl::matrix_base<float> mat(A, viennacl::MAIN_MEMORY,
+ size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+ size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+ v2 *= beta;
+ if (transA == ViennaCLTrans)
+ v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+ else
+ v2 += alpha * viennacl::linalg::prod(mat, v1);
+
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDgemv(ViennaCLBackend /*backend*/,
+ ViennaCLOrder order, ViennaCLTranspose transA,
+ ViennaCLInt m, ViennaCLInt n, double alpha, double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ double *x, ViennaCLInt offx, ViennaCLInt incx,
+ double beta,
+ double *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+
+ viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, size_type(m), size_type(offy), difference_type(incy));
+ viennacl::matrix_base<double> mat(A, viennacl::MAIN_MEMORY,
+ size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+ size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+ v2 *= beta;
+ if (transA == ViennaCLTrans)
+ v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+ else
+ v2 += alpha * viennacl::linalg::prod(mat, v1);
+
+ return ViennaCLSuccess;
+}
+
+
+
+// xTRSV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostStrsv(ViennaCLBackend /*backend*/,
+ ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+ ViennaCLInt n, float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ float *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+
+ viennacl::vector_base<float> v(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::matrix_base<float> mat(A, viennacl::MAIN_MEMORY,
+ size_type(n), size_type(offA_row), difference_type(incA_row), size_type(n),
+ size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+ if (transA == ViennaCLTrans)
+ {
+ if (uplo == ViennaCLUpper)
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_upper_tag());
+ else
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+ else
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_lower_tag());
+ else
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+ }
+ else
+ {
+ if (uplo == ViennaCLUpper)
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_upper_tag());
+ else
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+ else
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_lower_tag());
+ else
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+ }
+
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDtrsv(ViennaCLBackend /*backend*/,
+ ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+ ViennaCLInt n, double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ double *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+
+ viennacl::vector_base<double> v(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::matrix_base<double> mat(A, viennacl::MAIN_MEMORY,
+ size_type(n), size_type(offA_row), difference_type(incA_row), size_type(n),
+ size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+ if (transA == ViennaCLTrans)
+ {
+ if (uplo == ViennaCLUpper)
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_upper_tag());
+ else
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+ else
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_lower_tag());
+ else
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+ }
+ else
+ {
+ if (uplo == ViennaCLUpper)
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_upper_tag());
+ else
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+ else
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_lower_tag());
+ else
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+ }
+
+ return ViennaCLSuccess;
+}
+
+
+
+// xGER
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSger(ViennaCLBackend /*backend*/,
+ ViennaCLOrder order,
+ ViennaCLInt m, ViennaCLInt n,
+ float alpha,
+ float *x, ViennaCLInt offx, ViennaCLInt incx,
+ float *y, ViennaCLInt offy, ViennaCLInt incy,
+ float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+
+ viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, size_type(m), size_type(offy), difference_type(incy));
+ viennacl::matrix_base<float> mat(A, viennacl::MAIN_MEMORY,
+ size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+ size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+
+ mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDger(ViennaCLBackend /*backend*/,
+ ViennaCLOrder order,
+ ViennaCLInt m, ViennaCLInt n,
+ double alpha,
+ double *x, ViennaCLInt offx, ViennaCLInt incx,
+ double *y, ViennaCLInt offy, ViennaCLInt incy,
+ double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+
+ viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, size_type(m), size_type(offy), difference_type(incy));
+ viennacl::matrix_base<double> mat(A, viennacl::MAIN_MEMORY,
+ size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+ size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+
+ mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+
+ return ViennaCLSuccess;
+}
+
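A corresponding host-memory sketch for the in-place triangular solve path (again illustrative; ViennaCLNoTrans and ViennaCLNonUnit are assumed as the counterparts of the ViennaCLTrans and ViennaCLUnit enumerators tested above, and the unused backend argument is passed as NULL):

  #include "viennacl.hpp"

  int main()
  {
    // Solve the upper-triangular system A * z = b in place: x holds b on entry, z on exit.
    float A[4] = {2, 1,
                  0, 4};          // 2x2, row-major, lda = 2
    float x[2] = {5, 8};

    ViennaCLHostStrsv(NULL, ViennaCLUpper, ViennaCLRowMajor, ViennaCLNoTrans, ViennaCLNonUnit,
                      2, A, 0, 0, 1, 1, 2,
                      x, 0, 1);   // x becomes {1.5, 2}
    return 0;
  }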
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas2_host.cu
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas2_host.cu b/native-viennaCL/src/main/cpp/libviennacl/src/blas2_host.cu
new file mode 100644
index 0000000..b1ef39a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas2_host.cu
@@ -0,0 +1,219 @@
+/* =========================================================================
+ Copyright (c) 2010-2014, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+// xGEMV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSgemv(ViennaCLBackend /*backend*/,
+ ViennaCLOrder order, ViennaCLTranspose transA,
+ ViennaCLInt m, ViennaCLInt n, float alpha, float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ float *x, ViennaCLInt offx, ViennaCLInt incx,
+ float beta,
+ float *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+
+ viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, size_type(m), size_type(offy), difference_type(incy));
+ viennacl::matrix_base<float> mat(A, viennacl::MAIN_MEMORY,
+ size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+ size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+ v2 *= beta;
+ if (transA == ViennaCLTrans)
+ v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+ else
+ v2 += alpha * viennacl::linalg::prod(mat, v1);
+
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDgemv(ViennaCLBackend /*backend*/,
+ ViennaCLOrder order, ViennaCLTranspose transA,
+ ViennaCLInt m, ViennaCLInt n, double alpha, double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ double *x, ViennaCLInt offx, ViennaCLInt incx,
+ double beta,
+ double *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+
+ viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, size_type(m), size_type(offy), difference_type(incy));
+ viennacl::matrix_base<double> mat(A, viennacl::MAIN_MEMORY,
+ size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+ size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+ v2 *= beta;
+ if (transA == ViennaCLTrans)
+ v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+ else
+ v2 += alpha * viennacl::linalg::prod(mat, v1);
+
+ return ViennaCLSuccess;
+}
+
+
+
+// xTRSV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostStrsv(ViennaCLBackend /*backend*/,
+ ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+ ViennaCLInt n, float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ float *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+
+ viennacl::vector_base<float> v(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::matrix_base<float> mat(A, viennacl::MAIN_MEMORY,
+ size_type(n), size_type(offA_row), difference_type(incA_row), size_type(n),
+ size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+ if (transA == ViennaCLTrans)
+ {
+ if (uplo == ViennaCLUpper)
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_upper_tag());
+ else
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+ else
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_lower_tag());
+ else
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+ }
+ else
+ {
+ if (uplo == ViennaCLUpper)
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_upper_tag());
+ else
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+ else
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_lower_tag());
+ else
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+ }
+
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDtrsv(ViennaCLBackend /*backend*/,
+ ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+ ViennaCLInt n, double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ double *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+
+ viennacl::vector_base<double> v(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::matrix_base<double> mat(A, viennacl::MAIN_MEMORY,
+ size_type(n), size_type(offA_row), difference_type(incA_row), size_type(n),
+ size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+ if (transA == ViennaCLTrans)
+ {
+ if (uplo == ViennaCLUpper)
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_upper_tag());
+ else
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+ else
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_lower_tag());
+ else
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+ }
+ else
+ {
+ if (uplo == ViennaCLUpper)
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_upper_tag());
+ else
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+ else
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_lower_tag());
+ else
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+ }
+
+ return ViennaCLSuccess;
+}
+
+
+
+// xGER
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSger(ViennaCLBackend /*backend*/,
+ ViennaCLOrder order,
+ ViennaCLInt m, ViennaCLInt n,
+ float alpha,
+ float *x, ViennaCLInt offx, ViennaCLInt incx,
+ float *y, ViennaCLInt offy, ViennaCLInt incy,
+ float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+
+ viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, size_type(m), size_type(offy), difference_type(incy));
+ viennacl::matrix_base<float> mat(A, viennacl::MAIN_MEMORY,
+ size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+ size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+
+ mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDger(ViennaCLBackend /*backend*/,
+ ViennaCLOrder order,
+ ViennaCLInt m, ViennaCLInt n,
+ double alpha,
+ double *x, ViennaCLInt offx, ViennaCLInt incx,
+ double *y, ViennaCLInt offy, ViennaCLInt incy,
+ double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+
+ viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, size_type(m), size_type(offy), difference_type(incy));
+ viennacl::matrix_base<double> mat(A, viennacl::MAIN_MEMORY,
+ size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+ size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+
+ mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+
+ return ViennaCLSuccess;
+}
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas2_opencl.cpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas2_opencl.cpp b/native-viennaCL/src/main/cpp/libviennacl/src/blas2_opencl.cpp
new file mode 100644
index 0000000..20c4994
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas2_opencl.cpp
@@ -0,0 +1,219 @@
+/* =========================================================================
+ Copyright (c) 2010-2014, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+// xGEMV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSgemv(ViennaCLBackend backend,
+ ViennaCLOrder order, ViennaCLTranspose transA,
+ ViennaCLInt m, ViennaCLInt n, float alpha, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ float beta,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+
+ viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::vector_base<float> v2(y, size_type(m), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::matrix_base<float> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+ size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+ size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+ v2 *= beta;
+ if (transA == ViennaCLTrans)
+ v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+ else
+ v2 += alpha * viennacl::linalg::prod(mat, v1);
+
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDgemv(ViennaCLBackend backend,
+ ViennaCLOrder order, ViennaCLTranspose transA,
+ ViennaCLInt m, ViennaCLInt n, double alpha, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ double beta,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+
+ viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::vector_base<double> v2(y, size_type(m), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::matrix_base<double> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+ size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+ size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+ v2 *= beta;
+ if (transA == ViennaCLTrans)
+ v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+ else
+ v2 += alpha * viennacl::linalg::prod(mat, v1);
+
+ return ViennaCLSuccess;
+}
+
+
+
+// xTRSV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLStrsv(ViennaCLBackend backend,
+ ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+ ViennaCLInt n, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+
+ viennacl::vector_base<float> v(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::matrix_base<float> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+ size_type(n), size_type(offA_row), difference_type(incA_row), size_type(n),
+ size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+ if (transA == ViennaCLTrans)
+ {
+ if (uplo == ViennaCLUpper)
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_upper_tag());
+ else
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+ else
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_lower_tag());
+ else
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+ }
+ else
+ {
+ if (uplo == ViennaCLUpper)
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_upper_tag());
+ else
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+ else
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_lower_tag());
+ else
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+ }
+
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDtrsv(ViennaCLBackend backend,
+ ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+ ViennaCLInt n, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+
+ viennacl::vector_base<double> v(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::matrix_base<double> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+ size_type(n), size_type(offA_row), difference_type(incA_row), size_type(n),
+ size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+ if (transA == ViennaCLTrans)
+ {
+ if (uplo == ViennaCLUpper)
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_upper_tag());
+ else
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+ else
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_lower_tag());
+ else
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+ }
+ else
+ {
+ if (uplo == ViennaCLUpper)
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_upper_tag());
+ else
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+ else
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_lower_tag());
+ else
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+ }
+
+ return ViennaCLSuccess;
+}
+
+
+
+// xGER
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSger(ViennaCLBackend backend,
+ ViennaCLOrder order,
+ ViennaCLInt m, ViennaCLInt n,
+ float alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+ cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+
+ viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::vector_base<float> v2(y, size_type(m), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::matrix_base<float> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+ size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+ size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+
+ mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDger(ViennaCLBackend backend,
+ ViennaCLOrder order,
+ ViennaCLInt m, ViennaCLInt n,
+ double alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+ cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+
+ viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::vector_base<double> v2(y, size_type(m), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::matrix_base<double> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+ size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+ size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+
+ mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+
+ return ViennaCLSuccess;
+}
+
[07/51] [partial] mahout git commit: (nojira) add native-viennaCL
module to codebase. closes apache/mahout#241
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/matrix.hpp
new file mode 100644
index 0000000..120f636
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/matrix.hpp
@@ -0,0 +1,1193 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/scheduler/preset.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+#include "viennacl/device_specific/execution_handler.hpp"
+#include "viennacl/device_specific/builtin_database/matrix_product.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/matrix.hpp
+ * @brief Runtime generation of OpenCL kernels for matrix operations */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+//////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+/** @brief Enumeration for the scalar type in ambm-like operations */
+enum ambm_scalar_type
+{
+ VIENNACL_AMBM_NONE = 0, // matrix does not exist/contribute
+ VIENNACL_AMBM_CPU,
+ VIENNACL_AMBM_GPU
+};
+
+/** @brief Configuration struct for generating OpenCL kernels for linear combinations of matrices */
+struct ambm_config
+{
+ ambm_config() : with_stride_and_range(true), is_row_major(true), a(VIENNACL_AMBM_CPU), b(VIENNACL_AMBM_NONE) {}
+
+ bool with_stride_and_range;
+ bool is_row_major;
+ std::string assign_op;
+ ambm_scalar_type a;
+ ambm_scalar_type b;
+};
+
+
+// appends the for-loop and the element-wise update statement to the kernel source
+template <typename StringType>
+void generate_ambm_impl2(StringType & source, ambm_config const & cfg, bool mult_alpha, bool mult_beta)
+{
+ if (cfg.is_row_major)
+ {
+ source.append(" unsigned int row_gid = get_global_id(0) / get_local_size(0);\n");
+ source.append(" unsigned int col_gid = get_global_id(0) % get_local_size(0);\n");
+ source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
+ source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
+ }
+ else
+ {
+ source.append(" unsigned int col_gid = get_global_id(0) / get_local_size(0);\n");
+ source.append(" unsigned int row_gid = get_global_id(0) % get_local_size(0);\n");
+ source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
+ source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
+ }
+
+ if (cfg.with_stride_and_range)
+ {
+ if (cfg.is_row_major)
+ source.append(" A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] ");
+ else
+ source.append(" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] ");
+ source.append(cfg.assign_op);
+ if (cfg.is_row_major)
+ source.append(" B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] ");
+ else
+ source.append(" B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] ");
+
+ if (mult_alpha)
+ source.append("* alpha ");
+ else
+ source.append("/ alpha ");
+ if (cfg.b != VIENNACL_AMBM_NONE)
+ {
+ if (cfg.is_row_major)
+ source.append("+ C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] ");
+ else
+ source.append("+ C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] ");
+ if (mult_beta)
+ source.append("* beta");
+ else
+ source.append("/ beta");
+ }
+ }
+ else
+ {
+ if (cfg.is_row_major)
+ source.append(" A[row * A_internal_size2 + col] ");
+ else
+ source.append(" A[row + col * A_internal_size1] ");
+ source.append(cfg.assign_op);
+ if (cfg.is_row_major)
+ source.append(" B[row * B_internal_size2 + col] ");
+ else
+ source.append(" B[row + col * B_internal_size1] ");
+
+ if (mult_alpha)
+ source.append("* alpha ");
+ else
+ source.append("/ alpha ");
+ if (cfg.b != VIENNACL_AMBM_NONE)
+ {
+ if (cfg.is_row_major)
+ source.append("+ C[row * C_internal_size2 + col] ");
+ else
+ source.append("+ C[row + col * C_internal_size2] ");
+ if (mult_beta)
+ source.append("* beta");
+ else
+ source.append("/ beta");
+ }
+ }
+ source.append("; \n");
+}
+
+template <typename StringType>
+void generate_ambm_impl(StringType & source, std::string const & numeric_string, ambm_config const & cfg)
+{
+ source.append("__kernel void am");
+ if (cfg.b != VIENNACL_AMBM_NONE)
+ source.append("bm");
+ if (cfg.assign_op != "=")
+ source.append("_m");
+
+ if (cfg.a == VIENNACL_AMBM_CPU)
+ source.append("_cpu");
+ else if (cfg.a == VIENNACL_AMBM_GPU)
+ source.append("_gpu");
+
+ if (cfg.b == VIENNACL_AMBM_CPU)
+ source.append("_cpu");
+ else if (cfg.b == VIENNACL_AMBM_GPU)
+ source.append("_gpu");
+ source.append("( \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * A, \n");
+ source.append(" unsigned int A_start1, unsigned int A_start2, \n");
+ source.append(" unsigned int A_inc1, unsigned int A_inc2, \n");
+ source.append(" unsigned int A_size1, unsigned int A_size2, \n");
+ source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
+ if (cfg.a == VIENNACL_AMBM_CPU)
+ {
+ source.append(" "); source.append(numeric_string); source.append(" fac2, \n");
+ }
+ else if (cfg.a == VIENNACL_AMBM_GPU)
+ {
+ source.append(" __global "); source.append(numeric_string); source.append(" * fac2, \n");
+ }
+ source.append(" unsigned int options2, \n"); // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
+ source.append(" __global const "); source.append(numeric_string); source.append(" * B, \n");
+ source.append(" unsigned int B_start1, unsigned int B_start2, \n");
+ source.append(" unsigned int B_inc1, unsigned int B_inc2, \n");
+ source.append(" unsigned int B_internal_size1, unsigned int B_internal_size2");
+
+ if (cfg.b != VIENNACL_AMBM_NONE)
+ {
+ source.append(", \n\n");
+ if (cfg.b == VIENNACL_AMBM_CPU)
+ {
+ source.append(" "); source.append(numeric_string); source.append(" fac3, \n");
+ }
+ else if (cfg.b == VIENNACL_AMBM_GPU)
+ {
+ source.append(" __global "); source.append(numeric_string); source.append(" * fac3, \n");
+ }
+ source.append(" unsigned int options3, \n"); // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
+ source.append(" __global const "); source.append(numeric_string); source.append(" * C, \n");
+ source.append(" unsigned int C_start1, unsigned int C_start2, \n");
+ source.append(" unsigned int C_inc1, unsigned int C_inc2, \n");
+ source.append(" unsigned int C_internal_size1, unsigned int C_internal_size2 \n");
+ }
+ source.append(") { \n");
+
+ if (cfg.a == VIENNACL_AMBM_CPU)
+ {
+ source.append(" "); source.append(numeric_string); source.append(" alpha = fac2; \n");
+ }
+ else if (cfg.a == VIENNACL_AMBM_GPU)
+ {
+ source.append(" "); source.append(numeric_string); source.append(" alpha = fac2[0]; \n");
+ }
+ source.append(" if (options2 & (1 << 0)) \n");
+ source.append(" alpha = -alpha; \n");
+ source.append(" \n");
+
+ if (cfg.b == VIENNACL_AMBM_CPU)
+ {
+ source.append(" "); source.append(numeric_string); source.append(" beta = fac3; \n");
+ }
+ else if (cfg.b == VIENNACL_AMBM_GPU)
+ {
+ source.append(" "); source.append(numeric_string); source.append(" beta = fac3[0]; \n");
+ }
+ if (cfg.b != VIENNACL_AMBM_NONE)
+ {
+ source.append(" if (options3 & (1 << 0)) \n");
+ source.append(" beta = -beta; \n");
+ source.append(" \n");
+ }
+ source.append(" if (options2 & (1 << 1)) { \n");
+ if (cfg.b != VIENNACL_AMBM_NONE)
+ {
+ source.append(" if (options3 & (1 << 1)) {\n");
+ generate_ambm_impl2(source, cfg, false, false);
+ source.append(" } else {\n");
+ generate_ambm_impl2(source, cfg, false, true);
+ source.append(" } \n");
+ }
+ else
+ generate_ambm_impl2(source, cfg, false, true);
+ source.append(" } else { \n");
+ if (cfg.b != VIENNACL_AMBM_NONE)
+ {
+ source.append(" if (options3 & (1 << 1)) {\n");
+ generate_ambm_impl2(source, cfg, true, false);
+ source.append(" } else {\n");
+ generate_ambm_impl2(source, cfg, true, true);
+ source.append(" } \n");
+ }
+ else
+ generate_ambm_impl2(source, cfg, true, true);
+ source.append(" } \n");
+ source.append("} \n");
+}
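+// Note on the options2/options3 flags consumed above: bit 0 flips the sign of the
+// scalar and bit 1 switches the generated kernel from multiplying by the scalar to
+// dividing by it, matching the "flip sign" / "take inverse" encoding in the comments.
+// A hypothetical caller would therefore pack the flags as
+//   unsigned int options = (flip_sign ? 1u : 0u) | (use_reciprocal ? 2u : 0u);
+// so that options == 3 makes the kernel compute, e.g., B / (-alpha) instead of B * alpha.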
+
+template <typename StringType>
+void generate_ambm(StringType & source, std::string const & numeric_string, bool is_row_major)
+{
+ ambm_config cfg;
+ cfg.assign_op = "=";
+ cfg.with_stride_and_range = true;
+ cfg.is_row_major = is_row_major;
+
+ // am
+ cfg.b = VIENNACL_AMBM_NONE; cfg.a = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg);
+ cfg.b = VIENNACL_AMBM_NONE; cfg.a = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg);
+
+ // ambm
+ cfg.a = VIENNACL_AMBM_CPU; cfg.b = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg);
+ cfg.a = VIENNACL_AMBM_CPU; cfg.b = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg);
+ cfg.a = VIENNACL_AMBM_GPU; cfg.b = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg);
+ cfg.a = VIENNACL_AMBM_GPU; cfg.b = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg);
+
+ // ambm_m
+ cfg.assign_op = "+=";
+
+ cfg.a = VIENNACL_AMBM_CPU; cfg.b = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg);
+ cfg.a = VIENNACL_AMBM_CPU; cfg.b = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg);
+ cfg.a = VIENNACL_AMBM_GPU; cfg.b = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg);
+ cfg.a = VIENNACL_AMBM_GPU; cfg.b = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg);
+}
+
+template <typename StringType>
+void generate_assign_cpu(StringType & source, std::string const & numeric_string, bool is_row_major)
+{
+ source.append("__kernel void assign_cpu( \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * A, \n");
+ source.append(" unsigned int A_start1, unsigned int A_start2, \n");
+ source.append(" unsigned int A_inc1, unsigned int A_inc2, \n");
+ source.append(" unsigned int A_size1, unsigned int A_size2, \n");
+ source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
+ source.append(" "); source.append(numeric_string); source.append(" alpha) \n");
+ source.append("{ \n");
+ if (is_row_major)
+ {
+ source.append(" unsigned int row_gid = get_global_id(0) / get_local_size(0);\n");
+ source.append(" unsigned int col_gid = get_global_id(0) % get_local_size(0);\n");
+ source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
+ source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
+ source.append(" A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = alpha; \n");
+ }
+ else
+ {
+ source.append(" unsigned int row_gid = get_global_id(0) % get_local_size(0);\n");
+ source.append(" unsigned int col_gid = get_global_id(0) / get_local_size(0);\n");
+ source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
+ source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
+ source.append(" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = alpha; \n");
+ }
+ source.append("} \n");
+}
+
+template <typename StringType>
+void generate_diagonal_assign_cpu(StringType & source, std::string const & numeric_string, bool is_row_major)
+{
+ source.append("__kernel void diagonal_assign_cpu( \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * A, \n");
+ source.append(" unsigned int A_start1, unsigned int A_start2, \n");
+ source.append(" unsigned int A_inc1, unsigned int A_inc2, \n");
+ source.append(" unsigned int A_size1, unsigned int A_size2, \n");
+ source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
+ source.append(" "); source.append(numeric_string); source.append(" alpha) \n");
+ source.append("{ \n");
+ source.append(" for (unsigned int idx = get_global_id(0); idx < min(A_size1, A_size2); idx += get_global_size(0))\n");
+ if (is_row_major)
+ source.append(" A[(idx * A_inc1 + A_start1) * A_internal_size2 + (idx * A_inc2 + A_start2)] = alpha; \n");
+ else
+ source.append(" A[(idx * A_inc1 + A_start1) + (idx * A_inc2 + A_start2) * A_internal_size1] = alpha; \n");
+ source.append("} \n");
+}
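+// For reference, with numeric_string == "float" and is_row_major == true the routine
+// above emits the following OpenCL kernel (whitespace condensed):
+//   __kernel void diagonal_assign_cpu(__global float * A,
+//                                     unsigned int A_start1, unsigned int A_start2,
+//                                     unsigned int A_inc1,   unsigned int A_inc2,
+//                                     unsigned int A_size1,  unsigned int A_size2,
+//                                     unsigned int A_internal_size1, unsigned int A_internal_size2,
+//                                     float alpha)
+//   {
+//     for (unsigned int idx = get_global_id(0); idx < min(A_size1, A_size2); idx += get_global_size(0))
+//       A[(idx * A_inc1 + A_start1) * A_internal_size2 + (idx * A_inc2 + A_start2)] = alpha;
+//   }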
+
+template <typename StringType>
+void generate_element_op(StringType & source, std::string const & numeric_string, bool is_row_major)
+{
+ source.append("__kernel void element_op( \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * A, \n");
+ source.append(" unsigned int A_start1, unsigned int A_start2, \n");
+ source.append(" unsigned int A_inc1, unsigned int A_inc2, \n");
+ source.append(" unsigned int A_size1, unsigned int A_size2, \n");
+ source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * B, \n");
+ source.append(" unsigned int B_start1, unsigned int B_start2, \n");
+ source.append(" unsigned int B_inc1, unsigned int B_inc2, \n");
+ source.append(" unsigned int B_internal_size1, unsigned int B_internal_size2, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * C, \n");
+ source.append(" unsigned int C_start1, unsigned int C_start2, \n");
+ source.append(" unsigned int C_inc1, unsigned int C_inc2, \n");
+ source.append(" unsigned int C_internal_size1, unsigned int C_internal_size2, \n");
+ source.append(" unsigned int op_type) \n"); //0: product, 1: division, 2: pow
+ source.append("{ \n");
+ if (is_row_major)
+ {
+ source.append(" unsigned int row_gid = get_global_id(0) / get_local_size(0);\n");
+ source.append(" unsigned int col_gid = get_global_id(0) % get_local_size(0);\n");
+ source.append(" if (op_type == 2) {");
+ if (numeric_string == "float" || numeric_string == "double")
+ {
+ source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
+ source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
+ source.append(" A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = \n");
+ source.append(" pow(B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)], \n");
+ source.append(" C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]); \n");
+ }
+ source.append(" } else if (op_type == 1) {");
+ source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
+ source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
+ source.append(" A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = \n");
+ source.append(" B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / \n");
+ source.append(" C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]; \n");
+ source.append(" } else if (op_type == 0) {");
+ source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
+ source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
+ source.append(" A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = \n");
+ source.append(" B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * \n");
+ source.append(" C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]; \n");
+ source.append(" }");
+ }
+ else
+ {
+ source.append(" unsigned int row_gid = get_global_id(0) % get_local_size(0);\n");
+ source.append(" unsigned int col_gid = get_global_id(0) / get_local_size(0);\n");
+ source.append(" if (op_type == 2) {");
+ if (numeric_string == "float" || numeric_string == "double")
+ {
+ source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
+ source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
+ source.append(" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = \n");
+ source.append(" pow(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1], \n");
+ source.append(" C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1]); \n");
+ }
+ source.append(" } else if (op_type == 1) {");
+ source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
+ source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
+ source.append(" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = \n");
+ source.append(" B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / \n");
+ source.append(" C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1]; \n");
+ source.append(" } else if (op_type == 0) {");
+ source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
+ source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
+ source.append(" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = \n");
+ source.append(" B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * \n");
+ source.append(" C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1]; \n");
+ source.append(" }");
+ }
+ source.append("} \n");
+}
+
+
+template<typename StringT>
+void generate_fft(StringT & source, std::string const & numeric_string, bool is_row_major)
+{
+ // naive Fourier transform (quadratic complexity, use for reference only)
+ source.append("__kernel void fft_direct(__global "); source.append(numeric_string); source.append("2 *input, \n");
+ source.append(" __global "); source.append(numeric_string); source.append("2 *output, \n");
+ source.append(" unsigned int size, \n");
+ source.append(" unsigned int stride, \n");
+ source.append(" unsigned int batch_num, \n");
+ source.append(" "); source.append(numeric_string); source.append(" sign) { \n");
+ source.append(" const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
+ source.append(" \n");
+ source.append(" for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
+ source.append(" for (unsigned int k = get_global_id(0); k < size; k += get_global_size(0)) { \n");
+ source.append(" "); source.append(numeric_string); source.append("2 f = 0.0f; \n");
+ source.append(" \n");
+ source.append(" for (unsigned int n = 0; n < size; n++) { \n");
+ source.append(" "); source.append(numeric_string); source.append("2 in = ");
+ if (is_row_major)
+ source.append("input[batch_id * stride + n]; \n"); //input index here
+ else
+ source.append("input[n * stride + batch_id]; \n"); //input index here
+ source.append(" \n");
+ source.append(" "); source.append(numeric_string); source.append(" sn, cs; \n");
+ source.append(" "); source.append(numeric_string); source.append(" arg = sign * 2 * NUM_PI * k / size * n; \n");
+ source.append(" sn = sincos(arg, &cs); \n");
+ source.append(" \n");
+ source.append(" "); source.append(numeric_string); source.append("2 ex = ("); source.append(numeric_string); source.append("2)(cs, sn); \n");
+ source.append(" f = f + ("); source.append(numeric_string); source.append("2)(in.x * ex.x - in.y * ex.y, in.x * ex.y + in.y * ex.x); \n");
+ source.append(" } \n");
+ source.append(" \n");
+ if (is_row_major)
+ source.append(" output[batch_id * stride + k] = f; \n"); // output index here
+ else
+ source.append(" output[k * stride + batch_id] = f; \n"); // output index here
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+ source.append(" \n"); //////////////////////////////
+
+ source.append("__kernel void fft_radix2(__global "); source.append(numeric_string); source.append("2* input, \n");
+ source.append(" unsigned int s, \n");
+ source.append(" unsigned int bit_size, \n");
+ source.append(" unsigned int size, \n");
+ source.append(" unsigned int stride, \n");
+ source.append(" unsigned int batch_num, \n");
+ source.append(" "); source.append(numeric_string); source.append(" sign) { \n");
+ source.append(" \n");
+ source.append(" unsigned int ss = 1 << s; \n");
+ source.append(" unsigned int half_size = size >> 1; \n");
+ source.append(" \n");
+ source.append(" "); source.append(numeric_string); source.append(" cs, sn; \n");
+ source.append(" const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
+ source.append(" \n");
+ source.append(" unsigned int glb_id = get_global_id(0); \n");
+ source.append(" unsigned int glb_sz = get_global_size(0); \n");
+
+ source.append(" for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
+ source.append(" for (unsigned int tid = glb_id; tid < half_size; tid += glb_sz) { \n");
+ source.append(" unsigned int group = (tid & (ss - 1)); \n");
+ source.append(" unsigned int pos = ((tid >> s) << (s + 1)) + group; \n");
+
+ if (is_row_major)
+ {
+ source.append(" unsigned int offset = batch_id * stride + pos; \n");
+ source.append(" "); source.append(numeric_string); source.append("2 in1 = input[offset]; \n"); //index
+ source.append(" "); source.append(numeric_string); source.append("2 in2 = input[offset + ss]; \n");//index
+ }
+ else
+ {
+ source.append(" unsigned int offset = pos * stride + batch_id; \n");
+ source.append(" "); source.append(numeric_string); source.append("2 in1 = input[offset]; \n"); //index
+ source.append(" "); source.append(numeric_string); source.append("2 in2 = input[offset + ss * stride]; \n");//index
+ }
+
+ source.append(" "); source.append(numeric_string); source.append(" arg = group * sign * NUM_PI / ss; \n");
+
+ source.append(" sn = sincos(arg, &cs); \n");
+
+ source.append(" "); source.append(numeric_string); source.append("2 ex = ("); source.append(numeric_string); source.append("2)(cs, sn); \n");
+
+ source.append(" "); source.append(numeric_string); source.append("2 tmp = ("); source.append(numeric_string); source.append("2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x); \n");
+
+ if (is_row_major)
+ source.append(" input[offset + ss] = in1 - tmp; \n");//index
+ else
+ source.append(" input[offset + ss * stride] = in1 - tmp; \n");//index
+ source.append(" input[offset] = in1 + tmp; \n");//index
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+ source.append(" \n"); //////////////////////////////
+
+ source.append(" unsigned int get_reorder_num(unsigned int v, unsigned int bit_size) { \n");
+ source.append(" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); \n");
+ source.append(" v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); \n");
+ source.append(" v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); \n");
+ source.append(" v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); \n");
+ source.append(" v = (v >> 16) | (v << 16); \n");
+ source.append(" \n");
+ source.append(" v = v >> (32 - bit_size); \n");
+ source.append(" \n");
+ source.append(" return v; \n");
+ source.append(" } \n");
+
+ source.append(" __kernel void fft_radix2_local(__global "); source.append(numeric_string); source.append("2* input, \n");
+ source.append(" __local "); source.append(numeric_string); source.append("2* lcl_input, \n");
+ source.append(" unsigned int bit_size, \n");
+ source.append(" unsigned int size, \n");
+ source.append(" unsigned int stride, \n");
+ source.append(" unsigned int batch_num, \n");
+ source.append(" "); source.append(numeric_string); source.append(" sign) { \n");
+
+ source.append(" unsigned int grp_id = get_group_id(0); \n");
+ source.append(" unsigned int grp_num = get_num_groups(0); \n");
+
+ source.append(" unsigned int lcl_sz = get_local_size(0); \n");
+ source.append(" unsigned int lcl_id = get_local_id(0); \n");
+ source.append(" const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
+
+ source.append(" for (unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num) { \n");
+  //unsigned int base_offset = stride * batch_id;
+  //copy chunk of global memory to local
+ source.append(" for (unsigned int p = lcl_id; p < size; p += lcl_sz) { \n");
+ source.append(" unsigned int v = get_reorder_num(p, bit_size); \n");
+ if (is_row_major)
+ source.append(" lcl_input[v] = input[batch_id * stride + p]; \n"); //index
+ else
+ source.append(" lcl_input[v] = input[p * stride + batch_id]; \n"); //index
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ //performs Cooley-Tukey FFT on local array
+ source.append(" for (unsigned int s = 0; s < bit_size; s++) { \n");
+ source.append(" unsigned int ss = 1 << s; \n");
+
+ source.append(" "); source.append(numeric_string); source.append(" cs, sn; \n");
+
+ source.append(" for (unsigned int tid = lcl_id; tid < size; tid += lcl_sz) { \n");
+ source.append(" unsigned int group = (tid & (ss - 1)); \n");
+ source.append(" unsigned int pos = ((tid >> s) << (s + 1)) + group; \n");
+
+ source.append(" "); source.append(numeric_string); source.append("2 in1 = lcl_input[pos]; \n");
+ source.append(" "); source.append(numeric_string); source.append("2 in2 = lcl_input[pos + ss]; \n");
+
+ source.append(" "); source.append(numeric_string); source.append(" arg = group * sign * NUM_PI / ss; \n");
+
+ source.append(" sn = sincos(arg, &cs); \n");
+ source.append(" "); source.append(numeric_string); source.append("2 ex = ("); source.append(numeric_string); source.append("2)(cs, sn); \n");
+
+ source.append(" "); source.append(numeric_string); source.append("2 tmp = ("); source.append(numeric_string); source.append("2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x); \n");
+
+ source.append(" lcl_input[pos + ss] = in1 - tmp; \n");
+ source.append(" lcl_input[pos] = in1 + tmp; \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" } \n");
+
+ //copy local array back to global memory
+ source.append(" for (unsigned int p = lcl_id; p < size; p += lcl_sz) { \n");
+ if (is_row_major)
+ source.append(" input[batch_id * stride + p] = lcl_input[p]; \n");//index
+ else
+ source.append(" input[p * stride + batch_id] = lcl_input[p]; \n");//index
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ source.append(" \n"); //////////////////////////////
+
+  //
+  // Reorders the input data into bit-reversal order.
+  // It would probably be better to do this on the host side.
+  //
+ source.append("unsigned int get_reorder_num_2(unsigned int v, unsigned int bit_size) { \n");
+ source.append(" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); \n");
+ source.append(" v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); \n");
+ source.append(" v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); \n");
+ source.append(" v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); \n");
+ source.append(" v = (v >> 16) | (v << 16); \n");
+
+ source.append(" v = v >> (32 - bit_size); \n");
+
+ source.append(" return v; \n");
+ source.append("} \n");
+
+ source.append("__kernel void fft_reorder(__global "); source.append(numeric_string); source.append("2* input, \n");
+ source.append(" unsigned int bit_size, \n");
+ source.append(" unsigned int size, \n");
+ source.append(" unsigned int stride, \n");
+ source.append(" int batch_num) { \n");
+
+ source.append(" unsigned int glb_id = get_global_id(0); \n");
+ source.append(" unsigned int glb_sz = get_global_size(0); \n");
+
+ source.append(" for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
+ source.append(" for (unsigned int i = glb_id; i < size; i += glb_sz) { \n");
+ source.append(" unsigned int v = get_reorder_num_2(i, bit_size); \n");
+
+ source.append(" if (i < v) {\n");
+ if (is_row_major)
+ {
+ source.append(" "); source.append(numeric_string); source.append("2 tmp = input[batch_id * stride + i]; \n"); // index
+ source.append(" input[batch_id * stride + i] = input[batch_id * stride + v]; \n"); //index
+ source.append(" input[batch_id * stride + v] = tmp; \n"); //index
+ }
+ else
+ {
+ source.append(" "); source.append(numeric_string); source.append("2 tmp = input[i * stride + batch_id]; \n"); // index
+ source.append(" input[i * stride + batch_id] = input[v * stride + batch_id]; \n"); //index
+ source.append(" input[v * stride + batch_id] = tmp; \n"); //index
+ }
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
+
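For orientation, with numeric_string == "float" and is_row_major == true, the appends for fft_reorder above expand to roughly the following OpenCL kernel (a hand-expanded sketch; whitespace differs from the generated string):

  __kernel void fft_reorder(__global float2* input,
                            unsigned int bit_size,
                            unsigned int size,
                            unsigned int stride,
                            int batch_num) {
    unsigned int glb_id = get_global_id(0);
    unsigned int glb_sz = get_global_size(0);
    for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {
      for (unsigned int i = glb_id; i < size; i += glb_sz) {
        unsigned int v = get_reorder_num_2(i, bit_size);
        if (i < v) {
          // swap the two entries so the data ends up in bit-reversed order
          float2 tmp = input[batch_id * stride + i];
          input[batch_id * stride + i] = input[batch_id * stride + v];
          input[batch_id * stride + v] = tmp;
        }
      }
    }
  }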
+template<typename StringT>
+void generate_lu(StringT & source, std::string const & numeric_string, bool is_row_major)
+{
+ source.append("__kernel void lu_factorize( \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * matrix, \n");
+ source.append(" unsigned int matrix_rows, \n");
+ source.append(" unsigned int matrix_cols, \n");
+ source.append(" unsigned int matrix_internal_rows, \n");
+ source.append(" unsigned int matrix_internal_cols) \n");
+ source.append("{ \n");
+ source.append(" "); source.append(numeric_string); source.append(" temp; \n");
+
+ if (is_row_major)
+ {
+ source.append(" unsigned rowi; \n");
+ source.append(" unsigned rowk; \n");
+ source.append(" for (unsigned int i=1; i<matrix_rows; ++i) \n");
+ source.append(" { \n");
+ source.append(" rowi = i * matrix_internal_cols; \n");
+ source.append(" for (unsigned int k=0; k<i; ++k) \n");
+ source.append(" { \n");
+ source.append(" rowk = k * matrix_internal_cols; \n");
+ source.append(" if (get_global_id(0) == 0) \n");
+ source.append(" matrix[rowi + k] /= matrix[rowk + k]; \n");
+
+ source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
+ source.append(" temp = matrix[rowi + k]; \n");
+
+ //parallel subtraction:
+ source.append(" for (unsigned int j=k+1 + get_global_id(0); j<matrix_rows; j += get_global_size(0)) \n");
+ source.append(" matrix[rowi + j] -= temp * matrix[rowk + j]; \n");
+ }
+ else
+ {
+ source.append(" for (unsigned int i=1; i<matrix_rows; ++i) \n");
+ source.append(" { \n");
+ source.append(" for (unsigned int k=0; k<i; ++k) \n");
+ source.append(" { \n");
+
+ source.append(" if (get_global_id(0) == 0) \n");
+ source.append(" matrix[i + k*matrix_internal_rows] /= matrix[k + k*matrix_internal_rows]; \n");
+
+ source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
+ source.append(" temp = matrix[i + k*matrix_internal_rows]; \n");
+
+ //parallel subtraction:
+ source.append(" for (unsigned int j=k+1 + get_global_id(0); j<matrix_cols; j += get_global_size(0)) \n");
+ source.append(" matrix[i + j*matrix_internal_rows] -= temp * matrix[k + j*matrix_internal_rows]; \n");
+ }
+ source.append(" }");
+ source.append(" }");
+ source.append("}");
+}
+
+
+template<typename StringT>
+void generate_scaled_rank1_update(StringT & source, std::string const & numeric_string, bool is_row_major, bool alpha_on_cpu)
+{
+  source.append("__kernel void scaled_rank1_update_"); source.append(alpha_on_cpu ? "cpu" : "gpu"); source.append("( \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * A, \n");
+ source.append(" unsigned int A_start1, unsigned int A_start2, \n");
+ source.append(" unsigned int A_inc1, unsigned int A_inc2, \n");
+ source.append(" unsigned int A_size1, unsigned int A_size2, \n");
+ source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
+
+ if (alpha_on_cpu) {
+ source.append(" "); source.append(numeric_string); source.append(" val, \n");
+ } else {
+ source.append(" __global const "); source.append(numeric_string); source.append(" *val, \n");
+ }
+ source.append(" unsigned int options2, \n");
+
+ source.append(" __global const "); source.append(numeric_string); source.append(" * vec1, \n");
+ source.append(" unsigned int start1, \n");
+ source.append(" unsigned int inc1, \n");
+ source.append(" unsigned int size1, \n");
+
+ source.append(" __global const "); source.append(numeric_string); source.append(" * vec2, \n");
+ source.append(" unsigned int start2, \n");
+ source.append(" unsigned int inc2, \n");
+ source.append(" unsigned int size2) \n");
+ source.append("{ \n");
+
+ if (alpha_on_cpu) {
+ source.append(" "); source.append(numeric_string); source.append(" alpha = val; \n");
+ } else {
+ source.append(" "); source.append(numeric_string); source.append(" alpha = val[0]; \n");
+ }
+ source.append(" if (options2 & (1 << 0)) \n");
+ source.append(" alpha = -alpha; \n");
+
+ source.append(" unsigned int row_gid = get_global_id(0) / get_local_size(0); \n");
+ source.append(" unsigned int col_gid = get_global_id(0) % get_local_size(0); \n");
+
+ source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) \n");
+ source.append(" { \n");
+ source.append(" "); source.append(numeric_string); source.append(" tmp = vec1[row * inc1 + start1];");
+ source.append(" tmp = (options2 & (1 << 1)) ? tmp / alpha : tmp * alpha;");
+ source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) \n");
+ if (is_row_major)
+ source.append(" A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2]; \n");
+ else
+ source.append(" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2]; \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
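Reading the kernel body above, the options2 bit field controls how the packed scalar is applied; a summary inferred from the generated code itself (not from separate documentation):

  // options2 bit layout as used by scaled_rank1_update_{cpu,gpu}:
  //   bit 0 (1 << 0): negate alpha before use
  //   bit 1 (1 << 1): divide by alpha instead of multiplying (tmp / alpha rather than tmp * alpha)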
+
+template <typename StringType>
+void generate_trans_vec_mul(StringType & source, std::string const & numeric_string, bool is_row_major)
+{
+ source.append("__kernel void trans_vec_mul( \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * A, \n");
+ source.append(" unsigned int A_row_start, unsigned int A_col_start, \n");
+ source.append(" unsigned int A_row_inc, unsigned int A_col_inc, \n");
+ source.append(" unsigned int A_row_size, unsigned int A_col_size, \n");
+ source.append(" unsigned int A_internal_rows, unsigned int A_internal_cols, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * v, \n");
+ source.append(" unsigned int v_start, unsigned int v_inc, unsigned int v_size, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
+ source.append(" unsigned int result_start, unsigned int result_inc, unsigned int result_size, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * work) \n");
+ source.append("{ \n");
+ if (is_row_major)
+ {
+ source.append(" for (unsigned int row = get_global_id(0); row < A_col_size; row += get_global_size(0)) \n");
+ source.append(" { \n");
+ source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
+ source.append(" for (unsigned int col = 0; col < A_row_size; ++col) \n");
+ source.append(" dot_prod += A[(row * A_col_inc + A_col_start) + (col * A_row_inc + A_row_start) * A_internal_cols] * v[v_start + v_inc * col]; \n");
+ source.append(" result[row * result_inc + result_start] = dot_prod; \n");
+ }
+ else
+ {
+ source.append(" unsigned int row_gid = get_global_id(0) / get_local_size(0); \n");
+ source.append(" unsigned int col_gid = get_global_id(0) % get_local_size(0); \n");
+ source.append(" unsigned int lid = get_local_id(0); \n");
+
+ source.append(" for (unsigned int row = row_gid; row < A_col_size; row += get_num_groups(0)) \n");
+ source.append(" { \n");
+ source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
+ source.append(" for (unsigned int col = col_gid; col < A_row_size; col+=get_local_size(0)) \n");
+ source.append(" dot_prod += A[(row * A_col_inc + A_col_start) * A_internal_rows + col * A_row_inc + A_row_start] * v[v_start + v_inc * col]; \n");
+ source.append(" work[lid] = dot_prod; \n");
+
+ source.append(" for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1){ \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if(lid < stride) \n");
+ source.append(" work[lid] += work[lid+stride]; \n");
+ source.append(" } \n");
+
+ source.append(" if(lid == 0) \n");
+ source.append(" result[row * result_inc + result_start] = work[0]; \n");
+ }
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+template<typename StringT>
+void generate_triangular_substitute_inplace(StringT & source, std::string const & numeric_string, bool is_row_major)
+{
+ source.append("__kernel void triangular_substitute_inplace( \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * A, \n");
+ source.append(" unsigned int A_start1, unsigned int A_start2, \n");
+ source.append(" unsigned int A_inc1, unsigned int A_inc2, \n");
+ source.append(" unsigned int A_size1, unsigned int A_size2, \n");
+ source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * v, \n");
+ source.append(" unsigned int v_start, \n");
+ source.append(" unsigned int v_inc, \n");
+ source.append(" unsigned int v_size, \n");
+ source.append(" unsigned int options) \n");
+ source.append("{ \n");
+ source.append(" "); source.append(numeric_string); source.append(" temp; \n");
+ source.append(" unsigned int unit_diagonal_flag = (options & (1 << 0)); \n");
+ source.append(" unsigned int transposed_access_A = (options & (1 << 1)); \n");
+ source.append(" unsigned int is_lower_solve = (options & (1 << 2)); \n");
+ source.append(" unsigned int row; \n");
+ source.append(" for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed) \n"); //Note: A required to be square
+ source.append(" { \n");
+ source.append(" row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1); \n");
+ source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
+ source.append(" if (!unit_diagonal_flag) \n");
+ source.append(" { \n");
+ source.append(" if (get_global_id(0) == 0) \n");
+ if (is_row_major)
+ source.append(" v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]; \n");
+ else
+ source.append(" v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1]; \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
+
+ source.append(" temp = v[row * v_inc + v_start]; \n");
+
+ source.append(" for (int elim = (is_lower_solve ? (row + get_global_id(0) + 1) : get_global_id(0)); \n");
+ source.append(" elim < (is_lower_solve ? A_size1 : row); \n");
+ source.append(" elim += get_global_size(0)) \n");
+ if (is_row_major)
+ {
+ source.append(" v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2)) \n");
+ source.append(" : ((elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2))]; \n");
+ }
+ else
+ {
+ source.append(" v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1) \n");
+ source.append(" : ((elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1)]; \n");
+ }
+ source.append(" } \n");
+ source.append("} \n");
+}
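The options argument packs three flags; read off from the kernel body above:

  // 'options' bit layout for triangular_substitute_inplace:
  //   bit 0 (1 << 0): A has a unit diagonal, so the division by the diagonal entry is skipped
  //   bit 1 (1 << 1): access A as its transpose
  //   bit 2 (1 << 2): lower-triangular (forward) solve; otherwise upper-triangular (backward) solve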
+
+template <typename StringT>
+void generate_trans_kernel(StringT & source, std::string const & numeric_string, bool is_row_major)
+{
+ source.append("__kernel void trans_kernel(\n");
+ source.append(" __global const ");source.append(numeric_string);source.append(" * A, \n");
+ source.append(" unsigned int A_start1, unsigned int A_start2, \n");
+ source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
+ source.append(" unsigned int A_size1, unsigned int A_size2, \n");
+ source.append(" unsigned int A_stride1, unsigned int A_stride2, \n");
+ source.append(" __global ");source.append(numeric_string);source.append(" * B, \n");
+ source.append(" unsigned int B_start1, unsigned int B_start2, \n");
+ source.append(" unsigned int B_internal_size1, unsigned int B_internal_size2, \n");
+ source.append(" unsigned int B_stride1, unsigned int B_stride2) \n");
+ source.append("{ \n");
+ source.append(" for(unsigned int row = get_group_id(0); row < A_size1; row += get_num_groups(0))\n");
+ source.append(" { \n");
+ source.append(" for(unsigned int col = get_local_id(0); col < A_size2; col += get_local_size(0))\n");
+ source.append(" { \n");
+ if(is_row_major)
+ source.append(" B[(B_start1 + B_stride1 * col) * B_internal_size2 + (B_start2 + B_stride2 * row)] = A[(A_start1 + A_stride1 * row) * A_internal_size2 + (A_start2 + A_stride2 * col)]; \n");
+ else
+ source.append(" B[(B_start1 + B_stride1 * col) + (B_start2 + B_stride2 * row) * B_internal_size1] = A[(A_start1 + A_stride1 * row) + (A_start2 + A_stride2 * col) * A_internal_size1]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+template <typename StringType>
+void generate_vec_mul(StringType & source, std::string const & numeric_string, bool is_row_major)
+{
+ source.append("__kernel void vec_mul( \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * A, \n");
+ source.append(" unsigned int A_row_start, unsigned int A_col_start, \n");
+ source.append(" unsigned int A_row_inc, unsigned int A_col_inc, \n");
+ source.append(" unsigned int A_row_size, unsigned int A_col_size, \n");
+ source.append(" unsigned int A_internal_rows, unsigned int A_internal_cols, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * v, \n");
+ source.append(" unsigned int v_start, unsigned int v_inc, unsigned int v_size, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
+ source.append(" unsigned int result_start, unsigned int result_inc, unsigned int result_size, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * work) \n");
+ source.append("{ \n");
+ if (is_row_major)
+ {
+ source.append(" unsigned int row_gid = get_global_id(0) / get_local_size(0); \n");
+ source.append(" unsigned int col_gid = get_global_id(0) % get_local_size(0); \n");
+ source.append(" unsigned int lid = get_local_id(0); \n");
+
+ source.append(" for (unsigned int row = row_gid; row < A_row_size; row += get_num_groups(0)) \n");
+ source.append(" { \n");
+ source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
+ source.append(" for (unsigned int col = col_gid; col < A_col_size; col+=get_local_size(0)) \n");
+ source.append(" dot_prod += A[(row * A_row_inc + A_row_start) * A_internal_cols + col * A_col_inc + A_col_start] * v[v_start + v_inc * col]; \n");
+ source.append(" work[lid] = dot_prod; \n");
+
+ source.append(" for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1){ \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if(lid < stride) \n");
+ source.append(" work[lid] += work[lid+stride]; \n");
+ source.append(" } \n");
+
+ source.append(" if(lid == 0) \n");
+ source.append(" result[row * result_inc + result_start] = work[0]; \n");
+
+ }
+ else
+ {
+ source.append(" for (unsigned int row = get_global_id(0); row < A_row_size; row += get_global_size(0)) \n");
+ source.append(" { \n");
+ source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
+ source.append(" for (unsigned int col = 0; col < A_col_size; ++col) \n");
+ source.append(" dot_prod += A[(row * A_row_inc + A_row_start) + (col * A_col_inc + A_col_start) * A_internal_rows] * v[v_start + v_inc * col]; \n");
+ source.append(" result[row * result_inc + result_start] = dot_prod; \n");
+ }
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+namespace detail
+{
+ inline std::string type_to_string(viennacl::row_major) { return "row"; }
+ inline std::string type_to_string(viennacl::column_major) { return "col"; }
+}
+
+//////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for operations on/with dense matrix objects of type viennacl::matrix<>. */
+template <typename NumericT, typename F>
+struct matrix
+{
+ static std::string program_name()
+ {
+ return viennacl::ocl::type_to_string<NumericT>::apply() + "_matrix_" + detail::type_to_string(F());
+ }
+
+ static void init(viennacl::ocl::context & ctx)
+ {
+ viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+ std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+ bool is_row_major = viennacl::is_row_major<F>::value;
+
+ static std::map<cl_context, bool> init_done;
+ if (!init_done[ctx.handle().get()])
+ {
+ std::string source;
+ source.reserve(8192);
+
+ viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+ // fully parametrized kernels:
+ generate_ambm(source, numeric_string, is_row_major);
+
+ // kernels with mostly predetermined skeleton:
+ generate_assign_cpu(source, numeric_string, is_row_major);
+ generate_diagonal_assign_cpu(source, numeric_string, is_row_major);
+ generate_element_op(source, numeric_string, is_row_major);
+ generate_trans_vec_mul(source, numeric_string, is_row_major);
+ generate_vec_mul(source, numeric_string, is_row_major);
+
+ std::string prog_name = program_name();
+ #ifdef VIENNACL_BUILD_INFO
+ std::cout << "Creating program " << prog_name << std::endl;
+ #endif
+ ctx.add_program(source, prog_name);
+ init_done[ctx.handle().get()] = true;
+ } //if
+ } //init
+};
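Host-side usage of this kernel class follows the same pattern as the sparse-matrix code further below: init() compiles the program once per context, after which kernels are fetched by program and kernel name. A minimal sketch, assuming an already configured context and omitting error handling:

  // assumed: ViennaCL's current OpenCL context is already set up
  viennacl::ocl::context & ctx = viennacl::ocl::current_context();

  typedef viennacl::linalg::opencl::kernels::matrix<float, viennacl::row_major> MatrixKernels;
  MatrixKernels::init(ctx);   // no-op after the first call for this context

  // fetch one of the kernels generated above, e.g. the matrix-vector product:
  viennacl::ocl::kernel & k = ctx.get_kernel(MatrixKernels::program_name(), "vec_mul");
  // k can now be enqueued with the argument list expected by generate_vec_mul()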
+
+/** @brief Main kernel class for generating OpenCL kernels for dense matrix-matrix products (C = prod(A, B)) with matrix objects of type viennacl::matrix<>. */
+template<typename NumericT>
+class matrix_prod
+{
+public:
+ static device_specific::execution_handler & execution_handler(bool is_row_major, viennacl::ocl::context & ctx)
+ {
+ static std::map<std::pair<bool, cl_context>, device_specific::execution_handler> handlers_map;
+ cl_context h = ctx.handle().get();
+ std::pair<bool, cl_context> key(is_row_major, h);
+ if (handlers_map.find(key) == handlers_map.end())
+ {
+ viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+
+ namespace ds = viennacl::device_specific;
+ viennacl::ocl::device const & device = ctx.current_device();
+ std::string program_name = viennacl::ocl::type_to_string<NumericT>::apply() + (is_row_major?"_matrix_prod_row":"_matrix_prod_col");
+ handlers_map.insert(std::make_pair(key, ds::execution_handler(program_name, ctx, device)));
+ ds::execution_handler & handler = viennacl::device_specific::at(handlers_map, key);
+
+ ds::matrix_product_template::parameters_type matrix_product_params_NN = ds::builtin_database::matrix_product_params<NumericT>(device, 'N', 'N');
+ ds::matrix_product_template::parameters_type matrix_product_params_TN = ds::builtin_database::matrix_product_params<NumericT>(device, 'T', 'N');
+ ds::matrix_product_template::parameters_type matrix_product_params_NT = ds::builtin_database::matrix_product_params<NumericT>(device, 'N', 'T');
+ ds::matrix_product_template::parameters_type matrix_product_params_TT = ds::builtin_database::matrix_product_params<NumericT>(device, 'T', 'T');
+
+ tools::shared_ptr<viennacl::matrix_base<NumericT> > pC;
+ if (is_row_major)
+ pC.reset(new viennacl::matrix<NumericT, viennacl::row_major>());
+ else
+ pC.reset(new viennacl::matrix<NumericT, viennacl::column_major>());
+
+ //Dummy types. The values don't matter for the kernel generation.
+ viennacl::matrix_base<NumericT>& C = *pC;
+ viennacl::matrix<NumericT, viennacl::column_major> A;
+ viennacl::matrix<NumericT, viennacl::column_major> B;
+ NumericT alpha = 1;
+ NumericT beta = 0;
+
+ handler.add("prod_NN", ds::matrix_product_template(matrix_product_params_NN, 'N', 'N'), scheduler::preset::mat_mat_prod(alpha, &A, false, &B, false, beta, &C));
+ handler.add("prod_TN", ds::matrix_product_template(matrix_product_params_TN, 'T', 'N'), scheduler::preset::mat_mat_prod(alpha, &A, true, &B, false, beta, &C));
+ handler.add("prod_NT", ds::matrix_product_template(matrix_product_params_NT, 'N', 'T'), scheduler::preset::mat_mat_prod(alpha, &A, false, &B, true, beta, &C));
+ handler.add("prod_TT", ds::matrix_product_template(matrix_product_params_TT, 'T', 'T'), scheduler::preset::mat_mat_prod(alpha, &A, true, &B, true, beta, &C));
+
+ }
+ return viennacl::device_specific::at(handlers_map, key);
+ }
+};
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for operations on/with dense matrix objects of type viennacl::matrix<>. */
+template<typename NumericT, typename LayoutT>
+struct matrix_legacy
+{
+ static std::string program_name()
+ {
+ return viennacl::ocl::type_to_string<NumericT>::apply() + "_matrix_legacy_" + detail::type_to_string(LayoutT());
+ }
+
+ static void init(viennacl::ocl::context & ctx)
+ {
+ static std::map<cl_context, bool> init_done;
+ if (!init_done[ctx.handle().get()])
+ {
+ viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+ std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+ bool is_row_major = viennacl::is_row_major<LayoutT>::value;
+
+ std::string source;
+ source.reserve(8192);
+
+ viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+ // kernels with mostly predetermined skeleton:
+ generate_scaled_rank1_update(source, numeric_string, is_row_major, true);
+ generate_scaled_rank1_update(source, numeric_string, is_row_major, false);
+
+ if (numeric_string == "float" || numeric_string == "double")
+ {
+ generate_fft(source, numeric_string, is_row_major);
+ generate_lu(source, numeric_string, is_row_major);
+ generate_triangular_substitute_inplace(source, numeric_string, is_row_major);
+ generate_trans_kernel(source, numeric_string, is_row_major);
+ }
+
+ std::string prog_name = program_name();
+ #ifdef VIENNACL_BUILD_INFO
+ std::cout << "Creating program " << prog_name << std::endl;
+ #endif
+ ctx.add_program(source, prog_name);
+ init_done[ctx.handle().get()] = true;
+ } //if
+ } //init
+};
+
+
+
+
+template<typename StringT>
+void generate_matrix_convert_row(StringT & source, std::string const & dest_type, std::string const & src_type)
+{
+ source.append(" __kernel void convert_row_" + dest_type + "_" + src_type + "( \n");
+ source.append(" __global " + dest_type + " * dest, \n");
+ source.append(" unsigned int start1_dest, unsigned int inc1_dest, unsigned int size1_dest, unsigned int internal_size1_dest, \n");
+ source.append(" unsigned int start2_dest, unsigned int inc2_dest, unsigned int size2_dest, unsigned int internal_size2_dest, \n");
+ source.append(" __global const " + src_type + " * src, \n");
+ source.append(" unsigned int start1_src, unsigned int inc1_src, unsigned int size1_src, unsigned int internal_size1_src, \n");
+ source.append(" unsigned int start2_src, unsigned int inc2_src, unsigned int size2_src, unsigned int internal_size2_src) \n");
+ source.append(" { \n");
+ source.append(" for (unsigned int i = get_group_id(0); i < size1_dest; i += get_num_groups(0)) \n");
+ source.append(" for (unsigned int j = get_local_id(0); j < size2_dest; j += get_local_size(0)) \n");
+ source.append(" dest[(start1_dest + i * inc1_dest) * internal_size2_dest + (start2_dest + j * inc2_dest)] = src[(start1_src + i * inc1_src) * internal_size2_src + (start2_src + j * inc2_src)]; \n");
+ source.append(" } \n");
+}
+
+template<typename StringT>
+void generate_matrix_convert_col(StringT & source, std::string const & dest_type, std::string const & src_type)
+{
+ source.append(" __kernel void convert_col_" + dest_type + "_" + src_type + "( \n");
+ source.append(" __global " + dest_type + " * dest, \n");
+ source.append(" unsigned int start1_dest, unsigned int inc1_dest, unsigned int size1_dest, unsigned int internal_size1_dest, \n");
+ source.append(" unsigned int start2_dest, unsigned int inc2_dest, unsigned int size2_dest, unsigned int internal_size2_dest, \n");
+ source.append(" __global const " + src_type + " * src, \n");
+ source.append(" unsigned int start1_src, unsigned int inc1_src, unsigned int size1_src, unsigned int internal_size1_src, \n");
+ source.append(" unsigned int start2_src, unsigned int inc2_src, unsigned int size2_src, unsigned int internal_size2_src) \n");
+ source.append(" { \n");
+ source.append(" for (unsigned int j = get_group_id(0); j < size2_dest; j += get_num_groups(0)) \n");
+ source.append(" for (unsigned int i = get_local_id(0); i < size1_dest; i += get_local_size(0)) \n");
+ source.append(" dest[(start1_dest + i * inc1_dest) + (start2_dest + j * inc2_dest) * internal_size1_dest] = src[(start1_src + i * inc1_src) + (start2_src + j * inc2_src) * internal_size1_src]; \n");
+ source.append(" } \n");
+}
+
+template<typename StringT>
+void generate_matrix_convert(StringT & source, std::string const & dest_type, std::string const & src_type)
+{
+ generate_matrix_convert_row(source, dest_type, src_type);
+ generate_matrix_convert_col(source, dest_type, src_type);
+}
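As a concrete instance, generate_matrix_convert_row called with dest_type "float" and src_type "int" emits roughly the kernel below; the element-type conversion itself happens implicitly in the assignment:

  __kernel void convert_row_float_int(
    __global float * dest,
    unsigned int start1_dest, unsigned int inc1_dest, unsigned int size1_dest, unsigned int internal_size1_dest,
    unsigned int start2_dest, unsigned int inc2_dest, unsigned int size2_dest, unsigned int internal_size2_dest,
    __global const int * src,
    unsigned int start1_src, unsigned int inc1_src, unsigned int size1_src, unsigned int internal_size1_src,
    unsigned int start2_src, unsigned int inc2_src, unsigned int size2_src, unsigned int internal_size2_src)
  {
    // one work-group per block of rows, one work-item per block of columns
    for (unsigned int i = get_group_id(0); i < size1_dest; i += get_num_groups(0))
      for (unsigned int j = get_local_id(0); j < size2_dest; j += get_local_size(0))
        dest[(start1_dest + i * inc1_dest) * internal_size2_dest + (start2_dest + j * inc2_dest)]
          = src[(start1_src + i * inc1_src) * internal_size2_src + (start2_src + j * inc2_src)];
  }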
+
+/** @brief Main kernel class for matrix conversion routines (e.g. convert matrix<int> to matrix<float>). */
+struct matrix_convert
+{
+
+public:
+ static std::string program_name()
+ {
+ return "matrix_convert";
+ }
+
+ static void init(viennacl::ocl::context & ctx)
+ {
+ static std::map<cl_context, bool> init_done;
+ if (!init_done[ctx.handle().get()])
+ {
+ std::string source;
+ source.reserve(4096);
+
+ // int
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<int>::apply(), viennacl::ocl::type_to_string<int>::apply());
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<int>::apply(), viennacl::ocl::type_to_string<unsigned int>::apply());
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<int>::apply(), viennacl::ocl::type_to_string<long>::apply());
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<int>::apply(), viennacl::ocl::type_to_string<unsigned long>::apply());
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<int>::apply(), viennacl::ocl::type_to_string<float>::apply());
+
+ // unsigned int
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<unsigned int>::apply(), viennacl::ocl::type_to_string<int>::apply());
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<unsigned int>::apply(), viennacl::ocl::type_to_string<unsigned int>::apply());
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<unsigned int>::apply(), viennacl::ocl::type_to_string<long>::apply());
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<unsigned int>::apply(), viennacl::ocl::type_to_string<unsigned long>::apply());
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<unsigned int>::apply(), viennacl::ocl::type_to_string<float>::apply());
+
+ // long
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<long>::apply(), viennacl::ocl::type_to_string<int>::apply());
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<long>::apply(), viennacl::ocl::type_to_string<unsigned int>::apply());
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<long>::apply(), viennacl::ocl::type_to_string<long>::apply());
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<long>::apply(), viennacl::ocl::type_to_string<unsigned long>::apply());
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<long>::apply(), viennacl::ocl::type_to_string<float>::apply());
+
+ // unsigned long
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<unsigned long>::apply(), viennacl::ocl::type_to_string<int>::apply());
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<unsigned long>::apply(), viennacl::ocl::type_to_string<unsigned int>::apply());
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<unsigned long>::apply(), viennacl::ocl::type_to_string<long>::apply());
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<unsigned long>::apply(), viennacl::ocl::type_to_string<unsigned long>::apply());
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<unsigned long>::apply(), viennacl::ocl::type_to_string<float>::apply());
+
+ // float
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<float>::apply(), viennacl::ocl::type_to_string<int>::apply());
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<float>::apply(), viennacl::ocl::type_to_string<unsigned int>::apply());
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<float>::apply(), viennacl::ocl::type_to_string<long>::apply());
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<float>::apply(), viennacl::ocl::type_to_string<unsigned long>::apply());
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<float>::apply(), viennacl::ocl::type_to_string<float>::apply());
+
+ if (ctx.current_device().double_support())
+ {
+ viennacl::ocl::append_double_precision_pragma<double>(ctx, source);
+
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<int>::apply(), viennacl::ocl::type_to_string<double>::apply());
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<unsigned int>::apply(), viennacl::ocl::type_to_string<double>::apply());
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<long>::apply(), viennacl::ocl::type_to_string<double>::apply());
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<unsigned long>::apply(), viennacl::ocl::type_to_string<double>::apply());
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<float>::apply(), viennacl::ocl::type_to_string<double>::apply());
+
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<double>::apply(), viennacl::ocl::type_to_string<int>::apply());
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<double>::apply(), viennacl::ocl::type_to_string<unsigned int>::apply());
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<double>::apply(), viennacl::ocl::type_to_string<long>::apply());
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<double>::apply(), viennacl::ocl::type_to_string<unsigned long>::apply());
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<double>::apply(), viennacl::ocl::type_to_string<float>::apply());
+ generate_matrix_convert(source, viennacl::ocl::type_to_string<double>::apply(), viennacl::ocl::type_to_string<double>::apply());
+ }
+
+ std::string prog_name = program_name();
+ #ifdef VIENNACL_BUILD_INFO
+ std::cout << "Creating program " << prog_name << std::endl;
+ #endif
+ ctx.add_program(source, prog_name);
+ init_done[ctx.handle().get()] = true;
+ } //if
+ } //init
+
+};
+
+
+} // namespace kernels
+} // namespace opencl
+} // namespace linalg
+} // namespace viennacl
+#endif
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/matrix_element.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/matrix_element.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/matrix_element.hpp
new file mode 100644
index 0000000..d3b684f
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/matrix_element.hpp
@@ -0,0 +1,138 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_ELEMENT_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_ELEMENT_HPP
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+#include "viennacl/linalg/opencl/kernels/matrix.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/matrix_element.hpp
+ * @brief OpenCL kernel file for element-wise matrix operations */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+//////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+
+// generates code for unary element-wise operations A op func(B) (e.g. A = element_sin(B)); A and B may each use row- or column-major storage
+template <typename StringType>
+void generate_matrix_unary_element_ops(StringType & source, std::string const & numeric_string,
+ std::string const & funcname, std::string const & op, std::string const & op_name, bool is_row_major)
+{
+ source.append("__kernel void "); source.append(funcname); source.append("_"); source.append(op_name); source.append("(\n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * A, \n");
+ source.append(" unsigned int A_start1, unsigned int A_start2, \n");
+ source.append(" unsigned int A_inc1, unsigned int A_inc2, \n");
+ source.append(" unsigned int A_size1, unsigned int A_size2, \n");
+ source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
+
+ source.append(" __global const "); source.append(numeric_string); source.append(" * B, \n");
+ source.append(" unsigned int B_start1, unsigned int B_start2, \n");
+ source.append(" unsigned int B_inc1, unsigned int B_inc2, \n");
+ source.append(" unsigned int B_internal_size1, unsigned int B_internal_size2) { \n");
+
+ if (is_row_major)
+ {
+ source.append(" unsigned int row_gid = get_global_id(0) / get_local_size(0); \n");
+ source.append(" unsigned int col_gid = get_global_id(0) % get_local_size(0); \n");
+
+ source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) \n");
+ source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) \n");
+ source.append(" A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] \n");
+ source.append(" "); source.append(op); source.append(" "); source.append(funcname); source.append("(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]); \n");
+ }
+ else
+ {
+ source.append(" unsigned int row_gid = get_global_id(0) % get_local_size(0); \n");
+ source.append(" unsigned int col_gid = get_global_id(0) / get_local_size(0); \n");
+
+ source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0)) \n");
+ source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0)) \n");
+ source.append(" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] \n");
+ source.append(" "); source.append(op); source.append(" "); source.append(funcname); source.append("(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]); \n");
+ }
+ source.append("} \n");
+}
+
+template <typename StringType>
+void generate_matrix_unary_element_ops(StringType & source, std::string const & numeric_string, std::string const & funcname, bool is_row_major)
+{
+ generate_matrix_unary_element_ops(source, numeric_string, funcname, "=", "assign", is_row_major);
+ //generate_matrix_unary_element_ops(source, numeric_string, funcname, "+=", "plus", is_row_major);
+ //generate_matrix_unary_element_ops(source, numeric_string, funcname, "-=", "minus", is_row_major);
+}
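Only the assignment variant is generated at the moment (the "+=" and "-=" variants are commented out). For funcname "sin" with a float, row-major matrix the emitted kernel reads roughly as the following sketch:

  __kernel void sin_assign(
    __global float * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,
    __global const float * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1, unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2) {
    unsigned int row_gid = get_global_id(0) / get_local_size(0);
    unsigned int col_gid = get_global_id(0) % get_local_size(0);
    // apply the unary function element-wise to the submatrix described by start/inc/size
    for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))
      for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))
        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
          = sin(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
  }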
+
+//////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for element-wise operations such as element_sin() on/with dense matrix objects of type viennacl::matrix<>. */
+template <typename NumericT, typename F>
+struct matrix_element
+{
+ static std::string program_name()
+ {
+ return viennacl::ocl::type_to_string<NumericT>::apply() + "_matrix_element_" + detail::type_to_string(F());
+ }
+
+ static void init(viennacl::ocl::context & ctx)
+ {
+ viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+ std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+ static std::map<cl_context, bool> init_done;
+ if (!init_done[ctx.handle().get()])
+ {
+ std::string source;
+ source.reserve(8192);
+ bool is_row_major = viennacl::is_row_major<F>::value;
+
+ viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+ // unary operations
+ if (numeric_string == "float" || numeric_string == "double")
+ {
+ generate_matrix_unary_element_ops(source, numeric_string, "acos", is_row_major);
+ generate_matrix_unary_element_ops(source, numeric_string, "asin", is_row_major);
+ generate_matrix_unary_element_ops(source, numeric_string, "atan", is_row_major);
+ generate_matrix_unary_element_ops(source, numeric_string, "ceil", is_row_major);
+ generate_matrix_unary_element_ops(source, numeric_string, "cos", is_row_major);
+ generate_matrix_unary_element_ops(source, numeric_string, "cosh", is_row_major);
+ generate_matrix_unary_element_ops(source, numeric_string, "exp", is_row_major);
+ generate_matrix_unary_element_ops(source, numeric_string, "fabs", is_row_major);
+ generate_matrix_unary_element_ops(source, numeric_string, "floor", is_row_major);
+ generate_matrix_unary_element_ops(source, numeric_string, "log", is_row_major);
+ generate_matrix_unary_element_ops(source, numeric_string, "log10", is_row_major);
+ generate_matrix_unary_element_ops(source, numeric_string, "sin", is_row_major);
+ generate_matrix_unary_element_ops(source, numeric_string, "sinh", is_row_major);
+ generate_matrix_unary_element_ops(source, numeric_string, "sqrt", is_row_major);
+ generate_matrix_unary_element_ops(source, numeric_string, "tan", is_row_major);
+ generate_matrix_unary_element_ops(source, numeric_string, "tanh", is_row_major);
+ }
+ else
+ {
+ generate_matrix_unary_element_ops(source, numeric_string, "abs", is_row_major);
+ }
+
+ std::string prog_name = program_name();
+ #ifdef VIENNACL_BUILD_INFO
+ std::cout << "Creating program " << prog_name << std::endl;
+ #endif
+ ctx.add_program(source, prog_name);
+ init_done[ctx.handle().get()] = true;
+ } //if
+ } //init
+};
+
+} // namespace kernels
+} // namespace opencl
+} // namespace linalg
+} // namespace viennacl
+#endif
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/matrix_solve.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/matrix_solve.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/matrix_solve.hpp
new file mode 100644
index 0000000..f25a7a7
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/matrix_solve.hpp
@@ -0,0 +1,180 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_SOLVE_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_SOLVE_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+#include "viennacl/linalg/opencl/kernels/matrix.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/matrix_solve.hpp
+ * @brief OpenCL kernel file for dense matrix solves with multiple right-hand sides (BLAS level 3) */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+template<typename StringT>
+void generate_matrix_solve_blas3(StringT & source, std::string const & numeric_string,
+ bool row_major_A, bool row_major_B,
+ bool upper_solve, bool unit_diagonal)
+{
+ //start OpenCL code:
+ source.append("__kernel void ");
+ if (unit_diagonal)
+ source.append("unit_");
+ if (upper_solve)
+ source.append("upper_");
+ else
+ source.append("lower_");
+ source.append("solve");
+
+ source.append("( \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * A, \n");
+ source.append(" unsigned int A_start1, unsigned int A_start2, \n");
+ source.append(" unsigned int A_inc1, unsigned int A_inc2, \n");
+ source.append(" unsigned int A_size1, unsigned int A_size2, \n");
+ source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * B, \n");
+ source.append(" unsigned int B_start1, unsigned int B_start2, \n");
+ source.append(" unsigned int B_inc1, unsigned int B_inc2, \n");
+ source.append(" unsigned int B_size1, unsigned int B_size2, \n");
+ source.append(" unsigned int B_internal_size1, unsigned int B_internal_size2) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" temp; \n");
+ if (upper_solve)
+ {
+ //Note: A is square, thus A_rows == A_cols and no dispatch for transposedness needed
+ source.append(" for (unsigned int row_cnt = 0; row_cnt < A_size1; ++row_cnt) \n");
+ source.append(" { \n");
+ source.append(" unsigned int row = A_size1 - 1 - row_cnt; \n");
+ }
+ else //lower triangular solve
+ {
+ source.append(" for (unsigned int row = 0; row < A_size1; ++row) \n");
+ source.append(" { \n");
+ }
+
+ if (!unit_diagonal)
+ {
+ source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) == 0) \n");
+ //Note: A is square, thus A_internal_rows == A_internal_cols and no dispatch for transposedness needed
+ if (row_major_B)
+ source.append(" B[(row * B_inc1 + B_start1) * B_internal_size2 + (get_group_id(0) * B_inc2 + B_start2)] /= ");
+ else
+ source.append(" B[(row * B_inc1 + B_start1) + (get_group_id(0) * B_inc2 + B_start2) * B_internal_size1] /= ");
+
+ if (row_major_A)
+ source.append("A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]; \n");
+ else
+ source.append("A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1]; \n");
+ }
+
+ source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
+
+ if (row_major_B)
+ source.append(" temp = B[(row * B_inc1 + B_start1) * B_internal_size2 + (get_group_id(0) * B_inc2 + B_start2)]; \n");
+ else
+ source.append(" temp = B[(row * B_inc1 + B_start1) + (get_group_id(0) * B_inc2 + B_start2) * B_internal_size1]; \n");
+
+ source.append(" //eliminate column of op(A) with index 'row' in parallel: \n");
+ if (upper_solve)
+ source.append(" for (unsigned int elim = get_local_id(0); elim < row; elim += get_local_size(0)) \n");
+ else
+ source.append(" for (unsigned int elim = row + get_local_id(0) + 1; elim < A_size1; elim += get_local_size(0)) \n");
+
+ if (row_major_B)
+ source.append(" B[(elim * B_inc1 + B_start1) * B_internal_size2 + (get_group_id(0) * B_inc2 + B_start2)] -= temp * ");
+ else
+ source.append(" B[(elim * B_inc1 + B_start1) + (get_group_id(0) * B_inc2 + B_start2) * B_internal_size1] -= temp * ");
+
+ if (row_major_A)
+ source.append("A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]; \n");
+ else
+ source.append("A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1]; \n");
+
+ source.append(" } \n");
+ source.append("} \n");
+}
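Each call to generate_matrix_solve_blas3 emits exactly one kernel, named from the two boolean flags; the four calls in matrix_solve::init below therefore produce, for every (LayoutT1, LayoutT2) combination:

  lower_solve        // upper_solve == false, unit_diagonal == false
  unit_lower_solve   // upper_solve == false, unit_diagonal == true
  upper_solve        // upper_solve == true,  unit_diagonal == false
  unit_upper_solve   // upper_solve == true,  unit_diagonal == true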
+
+
+// main kernel class
+/** @brief Main kernel class for the generation of matrix solve kernels.
+ *
+ * @tparam LayoutT1   Row/column-major tag for the system matrix
+ * @tparam LayoutT2   Row/column-major tag for the right-hand-side matrix
+ */
+template<typename NumericT, typename LayoutT1, typename LayoutT2>
+struct matrix_solve
+{
+ static std::string program_name()
+ {
+ return viennacl::ocl::type_to_string<NumericT>::apply() + "_matrix_solve_" + detail::type_to_string(LayoutT1()) + detail::type_to_string(LayoutT2());
+ }
+
+ static void init(viennacl::ocl::context & ctx)
+ {
+ static std::map<cl_context, bool> init_done;
+ if (!init_done[ctx.handle().get()])
+ {
+ viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+ std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+ bool matrix_row_major = viennacl::is_row_major<LayoutT1>::value;
+ bool rhs_row_major = viennacl::is_row_major<LayoutT2>::value;
+
+ std::string source;
+ source.reserve(8192);
+
+ viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+ // only generate for floating points (forces error for integers)
+ if (numeric_string == "float" || numeric_string == "double")
+ {
+ generate_matrix_solve_blas3(source, numeric_string, matrix_row_major, rhs_row_major,
+ false, false);
+ generate_matrix_solve_blas3(source, numeric_string, matrix_row_major, rhs_row_major,
+ false, true);
+ generate_matrix_solve_blas3(source, numeric_string, matrix_row_major, rhs_row_major,
+ true, false);
+ generate_matrix_solve_blas3(source, numeric_string, matrix_row_major, rhs_row_major,
+ true, true);
+ }
+
+ std::string prog_name = program_name();
+ #ifdef VIENNACL_BUILD_INFO
+ std::cout << "Creating program " << prog_name << std::endl;
+ #endif
+ ctx.add_program(source, prog_name);
+ init_done[ctx.handle().get()] = true;
+ } //if
+ } //init
+};
+
+} // namespace kernels
+} // namespace opencl
+} // namespace linalg
+} // namespace viennacl
+#endif
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/nmf.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/nmf.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/nmf.hpp
new file mode 100644
index 0000000..46cb419
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/nmf.hpp
@@ -0,0 +1,99 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_NMF_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_NMF_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/nmf.hpp
+ * @brief OpenCL kernel file for nonnegative matrix factorization */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+template<typename StringT>
+void generate_nmf_el_wise_mul_div(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void el_wise_mul_div( \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * matrix1, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * matrix2, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * matrix3, \n");
+ source.append(" unsigned int size) \n");
+ source.append("{ \n");
+ source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n");
+ source.append(" { \n");
+ source.append(" "); source.append(numeric_string); source.append(" val = matrix1[i] * matrix2[i]; \n");
+ source.append(" "); source.append(numeric_string); source.append(" divisor = matrix3[i]; \n");
+ source.append(" matrix1[i] = (divisor > ("); source.append(numeric_string); source.append(")0.00001) ? (val / divisor) : ("); source.append(numeric_string); source.append(")0; \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
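With a float context the generated kernel reads, modulo whitespace, as follows; it is the element-wise multiply-then-divide step (with a small-divisor guard at 1e-5) used by multiplicative NMF update rules:

  __kernel void el_wise_mul_div(
    __global float * matrix1,
    __global const float * matrix2,
    __global const float * matrix3,
    unsigned int size)
  {
    for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))
    {
      float val = matrix1[i] * matrix2[i];
      float divisor = matrix3[i];
      // in-place update: matrix1 <- matrix1 * matrix2 / matrix3, guarding against tiny divisors
      matrix1[i] = (divisor > (float)0.00001) ? (val / divisor) : (float)0;
    }
  }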
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for nonnegative matrix factorization of dense matrices. */
+template<typename NumericT>
+struct nmf
+{
+ static std::string program_name()
+ {
+ return viennacl::ocl::type_to_string<NumericT>::apply() + "_nmf";
+ }
+
+ static void init(viennacl::ocl::context & ctx)
+ {
+ static std::map<cl_context, bool> init_done;
+ if (!init_done[ctx.handle().get()])
+ {
+ viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+ std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+ std::string source;
+ source.reserve(8192);
+
+ viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+ // only generate for floating points (forces error for integers)
+ if (numeric_string == "float" || numeric_string == "double")
+ {
+ generate_nmf_el_wise_mul_div(source, numeric_string);
+ }
+
+ std::string prog_name = program_name();
+ #ifdef VIENNACL_BUILD_INFO
+ std::cout << "Creating program " << prog_name << std::endl;
+ #endif
+ ctx.add_program(source, prog_name);
+ init_done[ctx.handle().get()] = true;
+ } //if
+ } //init
+};
+
+} // namespace kernels
+} // namespace opencl
+} // namespace linalg
+} // namespace viennacl
+#endif
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/sparse_matrix_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/sparse_matrix_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/sparse_matrix_operations.hpp
new file mode 100644
index 0000000..a8d1557
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/sparse_matrix_operations.hpp
@@ -0,0 +1,1244 @@
+#ifndef VIENNACL_LINALG_OPENCL_SPARSE_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_SPARSE_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/opencl/sparse_matrix_operations.hpp
+ @brief Implementations of operations using sparse matrices and OpenCL
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/host_based/common.hpp"
+#include "viennacl/linalg/opencl/kernels/compressed_matrix.hpp"
+#include "viennacl/linalg/opencl/kernels/coordinate_matrix.hpp"
+#include "viennacl/linalg/opencl/kernels/ell_matrix.hpp"
+#include "viennacl/linalg/opencl/kernels/sliced_ell_matrix.hpp"
+#include "viennacl/linalg/opencl/kernels/hyb_matrix.hpp"
+#include "viennacl/linalg/opencl/kernels/compressed_compressed_matrix.hpp"
+#include "viennacl/linalg/opencl/common.hpp"
+#include "viennacl/linalg/opencl/vector_operations.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+
+//
+// Compressed matrix
+//
+
+namespace detail
+{
+ template<typename NumericT, unsigned int AlignmentV>
+ void row_info(compressed_matrix<NumericT, AlignmentV> const & A,
+ vector_base<NumericT> & x,
+ viennacl::linalg::detail::row_info_types info_selector)
+ {
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::init(ctx);
+ viennacl::ocl::kernel & row_info_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::program_name(), "row_info_extractor");
+
+ viennacl::ocl::enqueue(row_info_kernel(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
+ viennacl::traits::opencl_handle(x),
+ cl_uint(A.size1()),
+ cl_uint(info_selector)
+ )
+ );
+ }
+}
+
+/** @brief Carries out matrix-vector multiplication with a compressed_matrix
+*
+* Implementation of the convenience expression y = alpha * prod(A, x) + beta * y;
+*
+* @param A The matrix
+* @param x The vector
+* @param alpha Scaling factor for the matrix-vector product
+* @param y The result vector
+* @param beta Scaling factor for the previous contents of y
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & A,
+ const viennacl::vector_base<NumericT> & x,
+ NumericT alpha,
+ viennacl::vector_base<NumericT> & y,
+ NumericT beta)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::init(ctx);
+ bool use_nvidia_specific = AlignmentV == 1 && ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id && (double(A.nnz()) / double(A.size1()) > 12.0);
+  bool with_alpha_beta = (alpha < NumericT(1) || alpha > NumericT(1)) || (beta < 0 || beta > 0); // i.e. alpha != 1 or beta != 0, phrased without exact floating-point comparisons
+
+
+ std::stringstream ss;
+ ss << "vec_mul";
+ unsigned int alignment = AlignmentV; //prevent unreachable code warnings below
+ if (use_nvidia_specific)
+ ss << "_nvidia";
+ else
+ {
+ if (alignment == 4)
+ ss << "4";
+ if (alignment == 8)
+ ss << "8";
+ }
+
+ if (with_alpha_beta)
+ ss << "_alpha_beta";
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::program_name(), ss.str());
+
+ viennacl::ocl::packed_cl_uint layout_x;
+ layout_x.start = cl_uint(viennacl::traits::start(x));
+ layout_x.stride = cl_uint(viennacl::traits::stride(x));
+ layout_x.size = cl_uint(viennacl::traits::size(x));
+ layout_x.internal_size = cl_uint(viennacl::traits::internal_size(x));
+
+ viennacl::ocl::packed_cl_uint layout_y;
+ layout_y.start = cl_uint(viennacl::traits::start(y));
+ layout_y.stride = cl_uint(viennacl::traits::stride(y));
+ layout_y.size = cl_uint(viennacl::traits::size(y));
+ layout_y.internal_size = cl_uint(viennacl::traits::internal_size(y));
+
+ if (alignment == 4 || alignment == 8)
+ {
+ if (with_alpha_beta)
+ viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
+ x, layout_x,
+ alpha,
+ y, layout_y,
+ beta
+ ));
+ else
+ viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
+ x, layout_x,
+ y, layout_y
+ ));
+ }
+ else
+ {
+ if (ctx.current_device().max_work_group_size() >= 256)
+ k.local_work_size(0, 256);
+
+ if (use_nvidia_specific)
+ {
+ k.global_work_size(0, 512 * k.local_work_size(0));
+
+ if (with_alpha_beta)
+ viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
+ x, layout_x,
+ alpha,
+ y, layout_y,
+ beta
+ ));
+ else
+ viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
+ x, layout_x,
+ y, layout_y
+ ));
+ }
+ else // use CSR adaptive:
+ {
+ k.global_work_size(0, A.blocks1() * k.local_work_size(0));
+
+ if (with_alpha_beta)
+ viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
+ x, layout_x,
+ alpha,
+ y, layout_y,
+ beta
+ ));
+ else
+ viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
+ x, layout_x,
+ y, layout_y
+ ));
+ }
+ }
+}
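+
+// Illustrative usage sketch (not part of the upstream ViennaCL header; the function name is
+// an assumption made for illustration). In user code this kernel is normally reached via
+// y = viennacl::linalg::prod(A, x); here the backend overload above is called directly.
+template<typename NumericT>
+void example_csr_matvec(viennacl::compressed_matrix<NumericT> const & A,
+                        viennacl::vector_base<NumericT> const & x,
+                        viennacl::vector_base<NumericT> & y)
+{
+  // y = 1 * (A * x) + 0 * y, i.e. a plain y = A * x without the alpha/beta kernel variant
+  prod_impl(A, x, NumericT(1), y, NumericT(0));
+}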
+
+
+/** @brief Carries out sparse-matrix-dense-matrix multiplication, with the first (sparse) factor being a compressed_matrix
+*
+* Implementation of the convenience expression y = prod(sp_A, d_A);
+*
+* @param sp_A The sparse matrix
+* @param d_A The dense matrix
+* @param y The result matrix
+*/
+template< typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_A,
+ const viennacl::matrix_base<NumericT> & d_A,
+ viennacl::matrix_base<NumericT> & y) {
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(sp_A).context());
+ viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::init(ctx);
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::program_name(),
+ detail::sparse_dense_matmult_kernel_name(false, d_A.row_major(), y.row_major()));
+
+ viennacl::ocl::enqueue(k(sp_A.handle1().opencl_handle(), sp_A.handle2().opencl_handle(), sp_A.handle().opencl_handle(),
+ viennacl::traits::opencl_handle(d_A),
+ cl_uint(viennacl::traits::start1(d_A)), cl_uint(viennacl::traits::start2(d_A)),
+ cl_uint(viennacl::traits::stride1(d_A)), cl_uint(viennacl::traits::stride2(d_A)),
+ cl_uint(viennacl::traits::size1(d_A)), cl_uint(viennacl::traits::size2(d_A)),
+ cl_uint(viennacl::traits::internal_size1(d_A)), cl_uint(viennacl::traits::internal_size2(d_A)),
+ viennacl::traits::opencl_handle(y),
+ cl_uint(viennacl::traits::start1(y)), cl_uint(viennacl::traits::start2(y)),
+ cl_uint(viennacl::traits::stride1(y)), cl_uint(viennacl::traits::stride2(y)),
+ cl_uint(viennacl::traits::size1(y)), cl_uint(viennacl::traits::size2(y)),
+ cl_uint(viennacl::traits::internal_size1(y)), cl_uint(viennacl::traits::internal_size2(y)) ));
+}
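+
+// Illustrative usage sketch (not part of the upstream header; names are assumptions):
+// the sparse-times-dense product is usually written as y = viennacl::linalg::prod(sp_A, d_A);
+// the dispatcher then forwards to the overload above. y is assumed to be sized
+// sp_A.size1() x d_A.size2() beforehand.
+template<typename NumericT>
+void example_csr_times_dense(viennacl::compressed_matrix<NumericT> const & sp_A,
+                             viennacl::matrix_base<NumericT> const & d_A,
+                             viennacl::matrix_base<NumericT> & y)
+{
+  prod_impl(sp_A, d_A, y);  // y = sp_A * d_A
+}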
+
+/** @brief Carries out sparse-matrix-dense-matrix multiplication, with the first (sparse) factor being a compressed_matrix
+* and the second (dense) factor given in transposed form
+*
+* Implementation of the convenience expression y = prod(sp_A, trans(B)); with B dense
+*
+* @param sp_A The sparse matrix
+* @param d_A The transposed dense matrix proxy trans(B)
+* @param y The result matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::compressed_matrix<NumericT, AlignmentV> const & sp_A,
+ viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
+ const viennacl::matrix_base<NumericT>,
+ viennacl::op_trans > const & d_A,
+ viennacl::matrix_base<NumericT> & y) {
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(sp_A).context());
+ viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::init(ctx);
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::program_name(),
+ detail::sparse_dense_matmult_kernel_name(true, d_A.lhs().row_major(), y.row_major()));
+
+ viennacl::ocl::enqueue(k(sp_A.handle1().opencl_handle(), sp_A.handle2().opencl_handle(), sp_A.handle().opencl_handle(),
+ viennacl::traits::opencl_handle(d_A.lhs()),
+ cl_uint(viennacl::traits::start1(d_A.lhs())), cl_uint(viennacl::traits::start2(d_A.lhs())),
+ cl_uint(viennacl::traits::stride1(d_A.lhs())), cl_uint(viennacl::traits::stride2(d_A.lhs())),
+ cl_uint(viennacl::traits::size1(d_A.lhs())), cl_uint(viennacl::traits::size2(d_A.lhs())),
+ cl_uint(viennacl::traits::internal_size1(d_A.lhs())), cl_uint(viennacl::traits::internal_size2(d_A.lhs())),
+ viennacl::traits::opencl_handle(y),
+ cl_uint(viennacl::traits::start1(y)), cl_uint(viennacl::traits::start2(y)),
+ cl_uint(viennacl::traits::stride1(y)), cl_uint(viennacl::traits::stride2(y)),
+ cl_uint(viennacl::traits::size1(y)), cl_uint(viennacl::traits::size2(y)),
+ cl_uint(viennacl::traits::internal_size1(y)), cl_uint(viennacl::traits::internal_size2(y)) ) );
+}
+
+/** @brief Carries out sparse_matrix-sparse_matrix multiplication for CSR matrices
+*
+* Implementation of the convenience expression C = prod(A, B);
+* Based on computing C(i, :) = A(i, :) * B via merging the respective rows of B
+*
+* @param A Left factor
+* @param B Right factor
+* @param C Result matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
+ viennacl::compressed_matrix<NumericT, AlignmentV> const & B,
+ viennacl::compressed_matrix<NumericT, AlignmentV> & C)
+{
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::init(ctx);
+
+ /*
+ * Stage 1: Analyze sparsity pattern in order to properly allocate temporary arrays
+ *
+ * - Upper bound for the row lengths in C
+ */
+ viennacl::vector<unsigned int> upper_bound_nonzeros_per_row_A(256, ctx); // upper bound for the nonzeros per row encountered for each work group
+
+ viennacl::ocl::kernel & k1 = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::program_name(), "spgemm_stage1");
+ viennacl::ocl::enqueue(k1(A.handle1().opencl_handle(), A.handle2().opencl_handle(), cl_uint(A.size1()),
+ viennacl::traits::opencl_handle(upper_bound_nonzeros_per_row_A)
+ ) );
+
+ upper_bound_nonzeros_per_row_A.switch_memory_context(viennacl::context(MAIN_MEMORY));
+ unsigned int * upper_bound_nonzeros_per_row_A_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(upper_bound_nonzeros_per_row_A.handle());
+
+ unsigned int max_nnz_per_row_A = 0;
+ for (std::size_t i=0; i<upper_bound_nonzeros_per_row_A.size(); ++i)
+ max_nnz_per_row_A = std::max(max_nnz_per_row_A, upper_bound_nonzeros_per_row_A_ptr[i]);
+
+ if (max_nnz_per_row_A > 32)
+ {
+ // determine augmented size:
+ unsigned int max_entries_in_G = 32;
+ if (max_nnz_per_row_A <= 256)
+ max_entries_in_G = 16;
+ if (max_nnz_per_row_A <= 64)
+ max_entries_in_G = 8;
+
+ viennacl::vector<unsigned int> exclusive_scan_helper(A.size1() + 1, viennacl::traits::context(A));
+ viennacl::ocl::kernel & k_decompose_1 = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::program_name(), "spgemm_decompose_1");
+ viennacl::ocl::enqueue(k_decompose_1(A.handle1().opencl_handle(), cl_uint(A.size1()),
+ cl_uint(max_entries_in_G),
+ viennacl::traits::opencl_handle(exclusive_scan_helper)
+ ) );
+
+ // exclusive scan of helper array to find new size:
+ viennacl::linalg::exclusive_scan(exclusive_scan_helper);
+ unsigned int augmented_size = exclusive_scan_helper[A.size1()];
+
+ // split A = A2 * G1
+ viennacl::compressed_matrix<NumericT, AlignmentV> A2(A.size1(), augmented_size, augmented_size, viennacl::traits::context(A));
+ viennacl::compressed_matrix<NumericT, AlignmentV> G1(augmented_size, A.size2(), A.nnz(), viennacl::traits::context(A));
+
+ // fill A2:
+ viennacl::ocl::kernel & k_fill_A2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::program_name(), "spgemm_A2");
+ viennacl::ocl::enqueue(k_fill_A2(A2.handle1().opencl_handle(), A2.handle2().opencl_handle(), A2.handle().opencl_handle(), cl_uint(A2.size1()),
+ viennacl::traits::opencl_handle(exclusive_scan_helper)
+ ) );
+
+ // fill G1:
+ viennacl::ocl::kernel & k_fill_G1 = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::program_name(), "spgemm_G1");
+ viennacl::ocl::enqueue(k_fill_G1(G1.handle1().opencl_handle(), G1.handle2().opencl_handle(), G1.handle().opencl_handle(), cl_uint(G1.size1()),
+ A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.size1()), cl_uint(A.nnz()),
+ cl_uint(max_entries_in_G),
+ viennacl::traits::opencl_handle(exclusive_scan_helper)
+ ) );
+
+ // compute tmp = G1 * B;
+ // C = A2 * tmp;
+ viennacl::compressed_matrix<NumericT, AlignmentV> tmp(G1.size1(), B.size2(), 0, viennacl::traits::context(A));
+ prod_impl(G1, B, tmp); // this runs a standard RMerge without decomposition of G1
+ prod_impl(A2, tmp, C); // this may split A2 again
+ return;
+ }
+
+
+ /*
+ * Stage 2: Determine sparsity pattern of C
+ */
+ C.resize(A.size1(), B.size2(), false);
+
+ viennacl::ocl::kernel & k2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::program_name(), "spgemm_stage2");
+ k2.local_work_size(0, 32); // run with one warp/wavefront
+ k2.global_work_size(0, 256*256*32); // make sure enough warps/wavefronts are in flight
+ viennacl::ocl::enqueue(k2(A.handle1().opencl_handle(), A.handle2().opencl_handle(), cl_uint(A.size1()),
+ B.handle1().opencl_handle(), B.handle2().opencl_handle(), cl_uint(B.size2()),
+ C.handle1().opencl_handle()
+ ) );
+
+ // exclusive scan on host to obtain row start indices:
+ viennacl::backend::typesafe_host_array<unsigned int> row_buffer(C.handle1(), C.size1() + 1);
+ viennacl::backend::memory_read(C.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
+ unsigned int current_offset = 0;
+ for (std::size_t i=0; i<C.size1(); ++i)
+ {
+ unsigned int tmp = row_buffer[i];
+ row_buffer.set(i, current_offset);
+ current_offset += tmp;
+ }
+ row_buffer.set(C.size1(), current_offset);
+ viennacl::backend::memory_write(C.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
+
+
+ /*
+ * Stage 3: Compute entries in C
+ */
+
+ C.reserve(current_offset, false);
+
+ viennacl::ocl::kernel & k3 = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::program_name(), "spgemm_stage3");
+ k3.local_work_size(0, 32); // run with one warp/wavefront
+ k3.global_work_size(0, 256*256*32); // make sure enough warps/wavefronts are in flight
+ viennacl::ocl::enqueue(k3(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.size1()),
+ B.handle1().opencl_handle(), B.handle2().opencl_handle(), B.handle().opencl_handle(), cl_uint(B.size2()),
+ C.handle1().opencl_handle(), C.handle2().opencl_handle(), C.handle().opencl_handle()
+ ) );
+
+}
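+
+// Illustrative usage sketch (not part of the upstream header; names are assumptions):
+// sparse-sparse products are usually written as C = viennacl::linalg::prod(A, B). The staged
+// routine above sizes C itself, so C only needs to live in the same context as A and B.
+template<typename NumericT>
+void example_csr_spgemm(viennacl::compressed_matrix<NumericT> const & A,
+                        viennacl::compressed_matrix<NumericT> const & B,
+                        viennacl::compressed_matrix<NumericT> & C)
+{
+  prod_impl(A, B, C);  // stages 1-3 above: analyze rows, set up the pattern, compute C = A * B
+}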
+
+// triangular solvers
+
+/** @brief Inplace solution of a lower triangular compressed_matrix with unit diagonal. Typically used for LU substitutions
+*
+* @param L The matrix
+* @param x The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename NumericT, unsigned int MAT_AlignmentV>
+void inplace_solve(compressed_matrix<NumericT, MAT_AlignmentV> const & L,
+ vector_base<NumericT> & x,
+ viennacl::linalg::unit_lower_tag)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(L).context());
+ viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::init(ctx);
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::program_name(), "unit_lu_forward");
+
+ k.local_work_size(0, 128);
+ k.global_work_size(0, k.local_work_size());
+ viennacl::ocl::enqueue(k(L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(),
+ viennacl::traits::opencl_handle(x),
+ cl_uint(L.size1())
+ )
+ );
+}
+
+/** @brief Inplace solution of a lower triangular compressed_matrix. Typically used for LU substitutions
+*
+* @param L The matrix
+* @param x The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void inplace_solve(compressed_matrix<NumericT, AlignmentV> const & L,
+ vector_base<NumericT> & x,
+ viennacl::linalg::lower_tag)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(L).context());
+ viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::init(ctx);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::program_name(), "lu_forward");
+
+ k.local_work_size(0, 128);
+ k.global_work_size(0, k.local_work_size());
+ viennacl::ocl::enqueue(k(L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(),
+ viennacl::traits::opencl_handle(x),
+ cl_uint(L.size1())
+ )
+ );
+}
+
+
+/** @brief Inplace solution of an upper triangular compressed_matrix with unit diagonal. Typically used for LU substitutions
+*
+* @param U The matrix
+* @param x The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void inplace_solve(compressed_matrix<NumericT, AlignmentV> const & U,
+ vector_base<NumericT> & x,
+ viennacl::linalg::unit_upper_tag)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(U).context());
+ viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::init(ctx);
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::program_name(), "unit_lu_backward");
+
+ k.local_work_size(0, 128);
+ k.global_work_size(0, k.local_work_size());
+ viennacl::ocl::enqueue(k(U.handle1().opencl_handle(), U.handle2().opencl_handle(), U.handle().opencl_handle(),
+ viennacl::traits::opencl_handle(x),
+ cl_uint(U.size1())
+ )
+ );
+}
+
+/** @brief Inplace solution of an upper triangular compressed_matrix. Typically used for LU substitutions
+*
+* @param U The matrix
+* @param x The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void inplace_solve(compressed_matrix<NumericT, AlignmentV> const & U,
+ vector_base<NumericT> & x,
+ viennacl::linalg::upper_tag)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(U).context());
+ viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::init(ctx);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::program_name(), "lu_backward");
+
+ k.local_work_size(0, 128);
+ k.global_work_size(0, k.local_work_size());
+ viennacl::ocl::enqueue(k(U.handle1().opencl_handle(), U.handle2().opencl_handle(), U.handle().opencl_handle(),
+ viennacl::traits::opencl_handle(x),
+ cl_uint(U.size1())
+ )
+ );
+}
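+
+// Illustrative usage sketch (not part of the upstream header; names are assumptions):
+// forward substitution with a CSR factor, e.g. as used when applying ILU-type preconditioners.
+// User code typically calls viennacl::linalg::inplace_solve(L, x, tag); the tag selects one of
+// the four kernels above.
+template<typename NumericT>
+void example_csr_forward_substitution(viennacl::compressed_matrix<NumericT> const & L,
+                                      viennacl::vector_base<NumericT> & x)
+{
+  // solves L * x_new = x in place, using the full (non-unit) diagonal of L
+  inplace_solve(L, x, viennacl::linalg::lower_tag());
+}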
+
+
+
+
+
+// transposed triangular solvers
+
+namespace detail
+{
+ //
+ // block solves
+ //
+ template<typename NumericT, unsigned int AlignmentV>
+ void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
+ const compressed_matrix<NumericT, AlignmentV>,
+ op_trans> & L,
+ viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
+ vector_base<NumericT> const & /* L_diagonal */, //ignored
+ vector_base<NumericT> & x,
+ viennacl::linalg::unit_lower_tag)
+ {
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(L.lhs()).context());
+ viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::init(ctx);
+ viennacl::ocl::kernel & block_solve_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::program_name(), "block_trans_unit_lu_forward");
+ block_solve_kernel.global_work_size(0, num_blocks * block_solve_kernel.local_work_size(0));
+
+ viennacl::ocl::enqueue(block_solve_kernel(L.lhs().handle1().opencl_handle(),
+ L.lhs().handle2().opencl_handle(),
+ L.lhs().handle().opencl_handle(),
+ block_indices.opencl_handle(),
+ x,
+ static_cast<cl_uint>(x.size())));
+ }
+
+
+ template<typename NumericT, unsigned int AlignmentV>
+ void block_inplace_solve(matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
+ const compressed_matrix<NumericT, AlignmentV>,
+ op_trans> const & U,
+ viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
+ vector_base<NumericT> const & U_diagonal,
+ vector_base<NumericT> & x,
+ viennacl::linalg::upper_tag)
+ {
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(U.lhs()).context());
+ viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::init(ctx);
+ viennacl::ocl::kernel & block_solve_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::program_name(), "block_trans_lu_backward");
+ block_solve_kernel.global_work_size(0, num_blocks * block_solve_kernel.local_work_size(0));
+
+ viennacl::ocl::enqueue(block_solve_kernel(U.lhs().handle1().opencl_handle(),
+ U.lhs().handle2().opencl_handle(),
+ U.lhs().handle().opencl_handle(),
+ U_diagonal,
+ block_indices.opencl_handle(),
+ x,
+ static_cast<cl_uint>(x.size())));
+ }
+
+
+}
+
+
+/** @brief Inplace solution of a lower triangular compressed_matrix with unit diagonal. Typically used for LU substitutions
+*
+* @param proxy_L The transposed matrix proxy
+* @param x The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void inplace_solve(matrix_expression< const compressed_matrix<NumericT, AlignmentV>,
+ const compressed_matrix<NumericT, AlignmentV>,
+ op_trans> const & proxy_L,
+ vector_base<NumericT> & x,
+ viennacl::linalg::unit_lower_tag)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(proxy_L.lhs()).context());
+ viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::init(ctx);
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::program_name(), "trans_unit_lu_forward");
+
+ k.local_work_size(0, 128);
+ k.global_work_size(0, k.local_work_size());
+ viennacl::ocl::enqueue(k(proxy_L.lhs().handle1().opencl_handle(), proxy_L.lhs().handle2().opencl_handle(), proxy_L.lhs().handle().opencl_handle(),
+ viennacl::traits::opencl_handle(x),
+ cl_uint(proxy_L.lhs().size1())
+ )
+ );
+}
+
+
+/** @brief Inplace solution of a lower triangular compressed_matrix. Typically used for LU substitutions
+*
+* @param proxy_L The transposed matrix proxy
+* @param x The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void inplace_solve(matrix_expression< const compressed_matrix<NumericT, AlignmentV>,
+ const compressed_matrix<NumericT, AlignmentV>,
+ op_trans> const & proxy_L,
+ vector_base<NumericT> & x,
+ viennacl::linalg::lower_tag)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(proxy_L.lhs()).context());
+ viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::init(ctx);
+
+ viennacl::vector<NumericT> diagonal(x.size());
+ detail::row_info(proxy_L.lhs(), diagonal, viennacl::linalg::detail::SPARSE_ROW_DIAGONAL);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::program_name(), "trans_lu_forward");
+
+ k.local_work_size(0, 128);
+ k.global_work_size(0, k.local_work_size());
+ viennacl::ocl::enqueue(k(proxy_L.lhs().handle1().opencl_handle(), proxy_L.lhs().handle2().opencl_handle(), proxy_L.lhs().handle().opencl_handle(),
+ viennacl::traits::opencl_handle(diagonal),
+ viennacl::traits::opencl_handle(x),
+ cl_uint(proxy_L.lhs().size1())
+ )
+ );
+}
+
+/** @brief Inplace solution of an upper triangular compressed_matrix with unit diagonal. Typically used for LU substitutions
+*
+* @param proxy_U The transposed matrix proxy
+* @param x The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void inplace_solve(matrix_expression< const compressed_matrix<NumericT, AlignmentV>,
+ const compressed_matrix<NumericT, AlignmentV>,
+ op_trans> const & proxy_U,
+ vector_base<NumericT> & x,
+ viennacl::linalg::unit_upper_tag)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(proxy_U.lhs()).context());
+ viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::init(ctx);
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::program_name(), "trans_unit_lu_backward");
+
+ k.local_work_size(0, 128);
+ k.global_work_size(0, k.local_work_size());
+ viennacl::ocl::enqueue(k(proxy_U.lhs().handle1().opencl_handle(), proxy_U.lhs().handle2().opencl_handle(), proxy_U.lhs().handle().opencl_handle(),
+ viennacl::traits::opencl_handle(x),
+ cl_uint(proxy_U.lhs().size1())
+ )
+ );
+}
+
+
+/** @brief Inplace solution of an upper triangular compressed_matrix. Typically used for LU substitutions
+*
+* @param proxy_U The transposed matrix proxy
+* @param x The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void inplace_solve(matrix_expression< const compressed_matrix<NumericT, AlignmentV>,
+ const compressed_matrix<NumericT, AlignmentV>,
+ op_trans> const & proxy_U,
+ vector_base<NumericT> & x,
+ viennacl::linalg::upper_tag)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(proxy_U.lhs()).context());
+ viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::init(ctx);
+
+ viennacl::vector<NumericT> diagonal(x.size());
+ detail::row_info(proxy_U.lhs(), diagonal, viennacl::linalg::detail::SPARSE_ROW_DIAGONAL);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix_solve<NumericT>::program_name(), "trans_lu_backward");
+
+ k.local_work_size(0, 128);
+ k.global_work_size(0, k.local_work_size());
+ viennacl::ocl::enqueue(k(proxy_U.lhs().handle1().opencl_handle(), proxy_U.lhs().handle2().opencl_handle(), proxy_U.lhs().handle().opencl_handle(),
+ viennacl::traits::opencl_handle(diagonal),
+ viennacl::traits::opencl_handle(x),
+ cl_uint(proxy_U.lhs().size1())
+ )
+ );
+}
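+
+// Illustrative usage sketch (not part of the upstream header; names are assumptions):
+// the transposed solvers above take a matrix_expression proxy, which user code usually obtains
+// as viennacl::trans(L); here the proxy is simply passed through.
+template<typename NumericT>
+void example_csr_transposed_forward_substitution(
+    viennacl::matrix_expression<const viennacl::compressed_matrix<NumericT>,
+                                const viennacl::compressed_matrix<NumericT>,
+                                viennacl::op_trans> const & proxy_L,
+    viennacl::vector_base<NumericT> & x)
+{
+  // solves trans(L) * x_new = x in place, assuming a unit diagonal
+  inplace_solve(proxy_L, x, viennacl::linalg::unit_lower_tag());
+}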
+
+
+//
+// Compressed Compressed matrix
+//
+
+/** @brief Carries out matrix-vector multiplication with a compressed_compressed_matrix
+*
+* Implementation of the convenience expression y = alpha * prod(A, x) + beta * y;
+*
+* @param A The matrix
+* @param x The vector
+* @param alpha Scaling factor for the matrix-vector product
+* @param y The result vector
+* @param beta Scaling factor for the previous contents of y
+*/
+template<typename NumericT>
+void prod_impl(viennacl::compressed_compressed_matrix<NumericT> const & A,
+ viennacl::vector_base<NumericT> const & x,
+ NumericT alpha,
+ viennacl::vector_base<NumericT> & y,
+ NumericT beta)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::compressed_compressed_matrix<NumericT>::init(ctx);
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_compressed_matrix<NumericT>::program_name(), "vec_mul");
+
+ if (beta < 0 || beta > 0) // multiply by beta
+ viennacl::linalg::opencl::av(y, y, beta, 1, false, false);
+ else
+ y.clear();
+
+ viennacl::ocl::packed_cl_uint layout_x;
+ layout_x.start = cl_uint(viennacl::traits::start(x));
+ layout_x.stride = cl_uint(viennacl::traits::stride(x));
+ layout_x.size = cl_uint(viennacl::traits::size(x));
+ layout_x.internal_size = cl_uint(viennacl::traits::internal_size(x));
+
+ viennacl::ocl::packed_cl_uint layout_y;
+ layout_y.start = cl_uint(viennacl::traits::start(y));
+ layout_y.stride = cl_uint(viennacl::traits::stride(y));
+ layout_y.size = cl_uint(viennacl::traits::size(y));
+ layout_y.internal_size = cl_uint(viennacl::traits::internal_size(y));
+
+ viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle3().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.nnz1()),
+ x, layout_x,
+ alpha,
+ y, layout_y,
+ beta
+ ));
+}
+
+
+//
+// Coordinate matrix
+//
+
+namespace detail
+{
+ template<typename NumericT, unsigned int AlignmentV>
+ void row_info(coordinate_matrix<NumericT, AlignmentV> const & A,
+ vector_base<NumericT> & x,
+ viennacl::linalg::detail::row_info_types info_selector)
+ {
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::coordinate_matrix<NumericT>::init(ctx);
+ viennacl::ocl::kernel & row_info_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::coordinate_matrix<NumericT>::program_name(), "row_info_extractor");
+ unsigned int thread_num = 128; //k.local_work_size(0);
+
+ row_info_kernel.local_work_size(0, thread_num);
+
+ row_info_kernel.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases
+ viennacl::ocl::enqueue(row_info_kernel(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
+ viennacl::traits::opencl_handle(x),
+ cl_uint(info_selector),
+ viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
+ viennacl::ocl::local_mem(sizeof(NumericT)*thread_num)) );
+ }
+}
+
+/** @brief Carries out matrix-vector multiplication with a coordinate_matrix
+*
+* Implementation of the convenience expression y = alpha * prod(A, x) + beta * y;
+*
+* @param A The matrix
+* @param x The vector
+* @param alpha Scaling factor for the matrix-vector product
+* @param y The result vector
+* @param beta Scaling factor for the previous contents of y
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::coordinate_matrix<NumericT, AlignmentV> const & A,
+ viennacl::vector_base<NumericT> const & x,
+ NumericT alpha,
+ viennacl::vector_base<NumericT> & y,
+ NumericT beta)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::coordinate_matrix<NumericT>::init(ctx);
+
+ if (beta < 0 || beta > 0) // multiply by beta
+ viennacl::linalg::opencl::av(y, y, beta, 1, false, false);
+ else
+ y.clear();
+
+ viennacl::ocl::packed_cl_uint layout_x;
+ layout_x.start = cl_uint(viennacl::traits::start(x));
+ layout_x.stride = cl_uint(viennacl::traits::stride(x));
+ layout_x.size = cl_uint(viennacl::traits::size(x));
+ layout_x.internal_size = cl_uint(viennacl::traits::internal_size(x));
+
+ viennacl::ocl::packed_cl_uint layout_y;
+ layout_y.start = cl_uint(viennacl::traits::start(y));
+ layout_y.stride = cl_uint(viennacl::traits::stride(y));
+ layout_y.size = cl_uint(viennacl::traits::size(y));
+ layout_y.internal_size = cl_uint(viennacl::traits::internal_size(y));
+
+ //std::cout << "prod(coordinate_matrix" << AlignmentV << ", vector) called with internal_nnz=" << A.internal_nnz() << std::endl;
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::coordinate_matrix<NumericT>::program_name(), "vec_mul");
+ unsigned int thread_num = 128; //k.local_work_size(0);
+
+ k.local_work_size(0, thread_num);
+
+ k.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases
+ //k.global_work_size(0, thread_num); //Only one work group
+ viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
+ viennacl::traits::opencl_handle(x),
+ layout_x,
+ alpha,
+ viennacl::traits::opencl_handle(y),
+ layout_y,
+ beta,
+ viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
+ viennacl::ocl::local_mem(sizeof(NumericT)*thread_num)) );
+
+}
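+
+// Illustrative usage sketch (not part of the upstream header; names are assumptions):
+// COO matrix-vector product. Note that the wrapper above scales y by beta (or clears it)
+// before launching the vec_mul kernel.
+template<typename NumericT>
+void example_coo_matvec(viennacl::coordinate_matrix<NumericT> const & A,
+                        viennacl::vector_base<NumericT> const & x,
+                        viennacl::vector_base<NumericT> & y)
+{
+  prod_impl(A, x, NumericT(1), y, NumericT(0));  // y = A * x
+}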
+
+
+/** @brief Carries out sparse-matrix-dense-matrix multiplication, where the sparse matrix is a coordinate_matrix
+*
+* Implementation of the convenience expression y = prod(A, B); with A being sparse (COO) and B being dense
+*
+* @param A The sparse matrix (COO format)
+* @param d_A The dense matrix
+* @param y The result matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::coordinate_matrix<NumericT, AlignmentV> const & A,
+ viennacl::matrix_base<NumericT> const & d_A,
+ viennacl::matrix_base<NumericT> & y)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::coordinate_matrix<NumericT>::init(ctx);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::coordinate_matrix<NumericT>::program_name(),
+ detail::sparse_dense_matmult_kernel_name(false, d_A.row_major(), y.row_major()));
+
+ y.clear();
+
+ unsigned int thread_num = 128; //k.local_work_size(0);
+ k.local_work_size(0, thread_num);
+ k.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases
+
+ viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
+ viennacl::traits::opencl_handle(d_A),
+ cl_uint(viennacl::traits::start1(d_A)), cl_uint(viennacl::traits::start2(d_A)),
+ cl_uint(viennacl::traits::stride1(d_A)), cl_uint(viennacl::traits::stride2(d_A)),
+ cl_uint(viennacl::traits::size1(d_A)), cl_uint(viennacl::traits::size2(d_A)),
+ cl_uint(viennacl::traits::internal_size1(d_A)), cl_uint(viennacl::traits::internal_size2(d_A)),
+ viennacl::traits::opencl_handle(y),
+ cl_uint(viennacl::traits::start1(y)), cl_uint(viennacl::traits::start2(y)),
+ cl_uint(viennacl::traits::stride1(y)), cl_uint(viennacl::traits::stride2(y)),
+ cl_uint(viennacl::traits::size1(y)), cl_uint(viennacl::traits::size2(y)),
+ cl_uint(viennacl::traits::internal_size1(y)), cl_uint(viennacl::traits::internal_size2(y)),
+ viennacl::ocl::local_mem(sizeof(cl_uint)*k.local_work_size(0)),
+ viennacl::ocl::local_mem(sizeof(NumericT)*k.local_work_size(0))) );
+
+}
+
+/** @brief Carries out sparse-matrix-dense-matrix multiplication, where the sparse matrix is a coordinate_matrix
+*
+* Implementation of the convenience expression y = prod(A, trans(B)); with A being sparse (COO) and B being dense
+*
+* @param A The sparse matrix (COO format)
+* @param d_A The transposed dense matrix
+* @param y The result matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::coordinate_matrix<NumericT, AlignmentV> const & A,
+ viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
+ const viennacl::matrix_base<NumericT>,
+ viennacl::op_trans > const & d_A,
+ viennacl::matrix_base<NumericT> & y)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::coordinate_matrix<NumericT>::init(ctx);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::coordinate_matrix<NumericT>::program_name(),
+ detail::sparse_dense_matmult_kernel_name(true, d_A.lhs().row_major(), y.row_major()));
+
+ y.clear();
+
+ unsigned int thread_num = 128; //k.local_work_size(0);
+ k.local_work_size(0, thread_num);
+ k.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases
+
+ viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
+ viennacl::traits::opencl_handle(d_A),
+ cl_uint(viennacl::traits::start1(d_A.lhs())), cl_uint(viennacl::traits::start2(d_A.lhs())),
+ cl_uint(viennacl::traits::stride1(d_A.lhs())), cl_uint(viennacl::traits::stride2(d_A.lhs())),
+ cl_uint(viennacl::traits::size1(d_A.lhs())), cl_uint(viennacl::traits::size2(d_A.lhs())),
+ cl_uint(viennacl::traits::internal_size1(d_A.lhs())), cl_uint(viennacl::traits::internal_size2(d_A.lhs())),
+ viennacl::traits::opencl_handle(y),
+ cl_uint(viennacl::traits::start1(y)), cl_uint(viennacl::traits::start2(y)),
+ cl_uint(viennacl::traits::stride1(y)), cl_uint(viennacl::traits::stride2(y)),
+ cl_uint(viennacl::traits::size1(y)), cl_uint(viennacl::traits::size2(y)),
+ cl_uint(viennacl::traits::internal_size1(y)), cl_uint(viennacl::traits::internal_size2(y)),
+ viennacl::ocl::local_mem(sizeof(cl_uint)*k.local_work_size(0)),
+ viennacl::ocl::local_mem(sizeof(NumericT)*k.local_work_size(0))) );
+
+}
+
+
+//
+// ELL Matrix
+//
+
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::ell_matrix<NumericT, AlignmentV> const & A,
+ viennacl::vector_base<NumericT> const & x,
+ NumericT alpha,
+ viennacl::vector_base<NumericT> & y,
+ NumericT beta)
+{
+ assert(A.size1() == y.size());
+ assert(A.size2() == x.size());
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::ell_matrix<NumericT>::init(ctx);
+
+ bool with_alpha_beta = (alpha < NumericT(1) || alpha > NumericT(1)) || (beta < 0 || beta > 0);
+
+ viennacl::ocl::packed_cl_uint layout_x;
+ layout_x.start = cl_uint(viennacl::traits::start(x));
+ layout_x.stride = cl_uint(viennacl::traits::stride(x));
+ layout_x.size = cl_uint(viennacl::traits::size(x));
+ layout_x.internal_size = cl_uint(viennacl::traits::internal_size(x));
+
+ viennacl::ocl::packed_cl_uint layout_y;
+ layout_y.start = cl_uint(viennacl::traits::start(y));
+ layout_y.stride = cl_uint(viennacl::traits::stride(y));
+ layout_y.size = cl_uint(viennacl::traits::size(y));
+ layout_y.internal_size = cl_uint(viennacl::traits::internal_size(y));
+
+  std::stringstream ss;
+  ss << "vec_mul_" << 1;//(AlignmentV != 1?4:1);  (ss is not used below; the kernel name is chosen directly from with_alpha_beta)
+ viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::ell_matrix<NumericT>::program_name(), with_alpha_beta ? "vec_mul_alpha_beta" : "vec_mul");
+
+ unsigned int thread_num = 128;
+ unsigned int group_num = 256;
+
+ k.local_work_size(0, thread_num);
+ k.global_work_size(0, thread_num * group_num);
+
+ if (with_alpha_beta)
+ viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
+ A.handle().opencl_handle(),
+ viennacl::traits::opencl_handle(x),
+ layout_x,
+ alpha,
+ viennacl::traits::opencl_handle(y),
+ layout_y,
+ beta,
+ cl_uint(A.size1()),
+ cl_uint(A.size2()),
+ cl_uint(A.internal_size1()),
+ cl_uint(A.maxnnz()),
+ cl_uint(A.internal_maxnnz())
+ )
+ );
+ else
+ viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
+ A.handle().opencl_handle(),
+ viennacl::traits::opencl_handle(x),
+ layout_x,
+ viennacl::traits::opencl_handle(y),
+ layout_y,
+ cl_uint(A.size1()),
+ cl_uint(A.size2()),
+ cl_uint(A.internal_size1()),
+ cl_uint(A.maxnnz()),
+ cl_uint(A.internal_maxnnz())
+ )
+ );
+
+
+}
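+
+// Illustrative usage sketch (not part of the upstream header; names are assumptions):
+// ELL matrix-vector product. The overload above asserts A.size1() == y.size() and
+// A.size2() == x.size(), so y must already be sized correctly (the high-level
+// viennacl::linalg::prod() path normally takes care of this).
+template<typename NumericT>
+void example_ell_matvec(viennacl::ell_matrix<NumericT> const & A,
+                        viennacl::vector_base<NumericT> const & x,
+                        viennacl::vector_base<NumericT> & y)
+{
+  prod_impl(A, x, NumericT(1), y, NumericT(0));  // y = A * x
+}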
+
+/** @brief Carries out sparse-matrix (ELL) - dense-matrix multiplication
+*
+* Implementation of the convenience expression y = prod(sp_A, d_A);
+* with sp_A in ELL format
+*
+* @param sp_A The sparse matrix (ELL)
+* @param d_A The dense matrix
+* @param y The result matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::ell_matrix<NumericT, AlignmentV> const & sp_A,
+ viennacl::matrix_base<NumericT> const & d_A,
+ viennacl::matrix_base<NumericT> & y) {
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(sp_A).context());
+ viennacl::linalg::opencl::kernels::ell_matrix<NumericT>::init(ctx);
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::ell_matrix<NumericT>::program_name(),
+ detail::sparse_dense_matmult_kernel_name(false, d_A.row_major(), y.row_major()));
+
+ //unsigned int thread_num = 128;
+ //unsigned int group_num = 256;
+ //
+ //k.local_work_size(0, thread_num);
+ //k.global_work_size(0, thread_num * group_num);
+
+ viennacl::ocl::enqueue(k(sp_A.handle2().opencl_handle(), sp_A.handle().opencl_handle(),
+ cl_uint(sp_A.size1()),
+ cl_uint(sp_A.size2()),
+ cl_uint(sp_A.internal_size1()),
+ cl_uint(sp_A.maxnnz()),
+ cl_uint(sp_A.internal_maxnnz()),
+ viennacl::traits::opencl_handle(d_A),
+ cl_uint(viennacl::traits::start1(d_A)), cl_uint(viennacl::traits::start2(d_A)),
+ cl_uint(viennacl::traits::stride1(d_A)), cl_uint(viennacl::traits::stride2(d_A)),
+ cl_uint(viennacl::traits::size1(d_A)), cl_uint(viennacl::traits::size2(d_A)),
+ cl_uint(viennacl::traits::internal_size1(d_A)), cl_uint(viennacl::traits::internal_size2(d_A)),
+ viennacl::traits::opencl_handle(y),
+ cl_uint(viennacl::traits::start1(y)), cl_uint(viennacl::traits::start2(y)),
+ cl_uint(viennacl::traits::stride1(y)), cl_uint(viennacl::traits::stride2(y)),
+ cl_uint(viennacl::traits::size1(y)), cl_uint(viennacl::traits::size2(y)),
+ cl_uint(viennacl::traits::internal_size1(y)), cl_uint(viennacl::traits::internal_size2(y))
+ )
+ );
+}
+
+/** @brief Carries out sparse-matrix (ELL) - transposed dense-matrix multiplication
+*
+* Implementation of the convenience expression y = prod(sp_A, trans(d_A));
+* with sp_A in ELL format
+*
+* @param sp_A The sparse matrix (ELL)
+* @param d_A The dense transposed matrix
+* @param y The result matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::ell_matrix<NumericT, AlignmentV> const & sp_A,
+ viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
+ const viennacl::matrix_base<NumericT>,
+ viennacl::op_trans > const & d_A,
+ viennacl::matrix_base<NumericT> & y) {
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(sp_A).context());
+ viennacl::linalg::opencl::kernels::ell_matrix<NumericT>::init(ctx);
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::ell_matrix<NumericT>::program_name(),
+ detail::sparse_dense_matmult_kernel_name(true, d_A.lhs().row_major(), y.row_major()));
+
+ //unsigned int thread_num = 128;
+ //unsigned int group_num = 256;
+ //
+ //k.local_work_size(0, thread_num);
+ //k.global_work_size(0, thread_num * group_num);
+
+ viennacl::ocl::enqueue(k(sp_A.handle2().opencl_handle(), sp_A.handle().opencl_handle(),
+ cl_uint(sp_A.size1()),
+ cl_uint(sp_A.size2()),
+ cl_uint(sp_A.internal_size1()),
+ cl_uint(sp_A.maxnnz()),
+ cl_uint(sp_A.internal_maxnnz()),
+ viennacl::traits::opencl_handle(d_A.lhs()),
+ cl_uint(viennacl::traits::start1(d_A.lhs())), cl_uint(viennacl::traits::start2(d_A.lhs())),
+ cl_uint(viennacl::traits::stride1(d_A.lhs())), cl_uint(viennacl::traits::stride2(d_A.lhs())),
+ cl_uint(viennacl::traits::size1(d_A.lhs())), cl_uint(viennacl::traits::size2(d_A.lhs())),
+ cl_uint(viennacl::traits::internal_size1(d_A.lhs())), cl_uint(viennacl::traits::internal_size2(d_A.lhs())),
+ viennacl::traits::opencl_handle(y),
+ cl_uint(viennacl::traits::start1(y)), cl_uint(viennacl::traits::start2(y)),
+ cl_uint(viennacl::traits::stride1(y)), cl_uint(viennacl::traits::stride2(y)),
+ cl_uint(viennacl::traits::size1(y)), cl_uint(viennacl::traits::size2(y)),
+ cl_uint(viennacl::traits::internal_size1(y)), cl_uint(viennacl::traits::internal_size2(y))
+ )
+ );
+}
+
+//
+// SELL-C-\sigma Matrix
+//
+
+template<typename ScalarT, typename IndexT>
+void prod_impl(viennacl::sliced_ell_matrix<ScalarT, IndexT> const & A,
+ viennacl::vector_base<ScalarT> const & x,
+ ScalarT alpha,
+ viennacl::vector_base<ScalarT> & y,
+ ScalarT beta)
+{
+ assert(A.size1() == y.size());
+ assert(A.size2() == x.size());
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::sliced_ell_matrix<ScalarT, unsigned int>::init(ctx);
+
+ bool with_alpha_beta = (alpha < ScalarT(1) || alpha > ScalarT(1)) || (beta < 0 || beta > 0);
+
+ viennacl::ocl::packed_cl_uint layout_x;
+ layout_x.start = cl_uint(viennacl::traits::start(x));
+ layout_x.stride = cl_uint(viennacl::traits::stride(x));
+ layout_x.size = cl_uint(viennacl::traits::size(x));
+ layout_x.internal_size = cl_uint(viennacl::traits::internal_size(x));
+
+ viennacl::ocl::packed_cl_uint layout_y;
+ layout_y.start = cl_uint(viennacl::traits::start(y));
+ layout_y.stride = cl_uint(viennacl::traits::stride(y));
+ layout_y.size = cl_uint(viennacl::traits::size(y));
+ layout_y.internal_size = cl_uint(viennacl::traits::internal_size(y));
+
+ std::stringstream ss;
+ ss << "vec_mul_" << 1;//(AlignmentV != 1?4:1);
+ viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::sliced_ell_matrix<ScalarT, IndexT>::program_name(), with_alpha_beta ? "vec_mul_alpha_beta" : "vec_mul");
+
+ vcl_size_t thread_num = std::max(A.rows_per_block(), static_cast<vcl_size_t>(128));
+ unsigned int group_num = 256;
+
+ if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
+ thread_num = 256;
+
+ k.local_work_size(0, thread_num);
+ k.global_work_size(0, thread_num * group_num);
+
+ if (with_alpha_beta)
+ viennacl::ocl::enqueue(k(A.handle1().opencl_handle(),
+ A.handle2().opencl_handle(),
+ A.handle3().opencl_handle(),
+ A.handle().opencl_handle(),
+ viennacl::traits::opencl_handle(x),
+ layout_x,
+ alpha,
+ viennacl::traits::opencl_handle(y),
+ layout_y,
+ beta,
+ cl_uint(A.rows_per_block()))
+ );
+ else
+ viennacl::ocl::enqueue(k(A.handle1().opencl_handle(),
+ A.handle2().opencl_handle(),
+ A.handle3().opencl_handle(),
+ A.handle().opencl_handle(),
+ viennacl::traits::opencl_handle(x),
+ layout_x,
+ viennacl::traits::opencl_handle(y),
+ layout_y,
+ cl_uint(A.rows_per_block()))
+ );
+}
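+
+// Illustrative usage sketch (not part of the upstream header; names are assumptions):
+// SELL-C-sigma matrix-vector product. The kernel launch above derives its work-group size
+// from A.rows_per_block() (at least 128, fixed to 256 on NVIDIA devices), so no manual
+// work-size tuning is needed by the caller.
+template<typename ScalarT>
+void example_sliced_ell_matvec(viennacl::sliced_ell_matrix<ScalarT> const & A,
+                               viennacl::vector_base<ScalarT> const & x,
+                               viennacl::vector_base<ScalarT> & y)
+{
+  prod_impl(A, x, ScalarT(1), y, ScalarT(0));  // y = A * x
+}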
+
+
+//
+// Hybrid Matrix
+//
+
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::hyb_matrix<NumericT, AlignmentV> const & A,
+ viennacl::vector_base<NumericT> const & x,
+ NumericT alpha,
+ viennacl::vector_base<NumericT> & y,
+ NumericT beta)
+{
+ assert(A.size1() == y.size());
+ assert(A.size2() == x.size());
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::hyb_matrix<NumericT>::init(ctx);
+
+ bool with_alpha_beta = (alpha < NumericT(1) || alpha > NumericT(1)) || (beta < 0 || beta > 0);
+
+ viennacl::ocl::packed_cl_uint layout_x;
+ layout_x.start = cl_uint(viennacl::traits::start(x));
+ layout_x.stride = cl_uint(viennacl::traits::stride(x));
+ layout_x.size = cl_uint(viennacl::traits::size(x));
+ layout_x.internal_size = cl_uint(viennacl::traits::internal_size(x));
+
+ viennacl::ocl::packed_cl_uint layout_y;
+ layout_y.start = cl_uint(viennacl::traits::start(y));
+ layout_y.stride = cl_uint(viennacl::traits::stride(y));
+ layout_y.size = cl_uint(viennacl::traits::size(y));
+ layout_y.internal_size = cl_uint(viennacl::traits::internal_size(y));
+
+ viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::hyb_matrix<NumericT>::program_name(), with_alpha_beta ? "vec_mul_alpha_beta" : "vec_mul");
+
+ if (with_alpha_beta)
+ viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
+ A.handle().opencl_handle(),
+ A.handle3().opencl_handle(),
+ A.handle4().opencl_handle(),
+ A.handle5().opencl_handle(),
+ viennacl::traits::opencl_handle(x),
+ layout_x,
+ alpha,
+ viennacl::traits::opencl_handle(y),
+ layout_y,
+ beta,
+ cl_uint(A.size1()),
+ cl_uint(A.internal_size1()),
+ cl_uint(A.ell_nnz()),
+ cl_uint(A.internal_ellnnz())
+ )
+ );
+ else
+ viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
+ A.handle().opencl_handle(),
+ A.handle3().opencl_handle(),
+ A.handle4().opencl_handle(),
+ A.handle5().opencl_handle(),
+ viennacl::traits::opencl_handle(x),
+ layout_x,
+ viennacl::traits::opencl_handle(y),
+ layout_y,
+ cl_uint(A.size1()),
+ cl_uint(A.internal_size1()),
+ cl_uint(A.ell_nnz()),
+ cl_uint(A.internal_ellnnz())
+ )
+ );
+}
+
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::hyb_matrix<NumericT, AlignmentV> const & A,
+ viennacl::matrix_base<NumericT> const & d_A,
+ viennacl::matrix_base<NumericT> & y)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::hyb_matrix<NumericT>::init(ctx);
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::hyb_matrix<NumericT>::program_name(),
+ detail::sparse_dense_matmult_kernel_name(false, d_A.row_major(), y.row_major()));
+
+ viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
+ A.handle().opencl_handle(),
+ A.handle3().opencl_handle(),
+ A.handle4().opencl_handle(),
+ A.handle5().opencl_handle(),
+ cl_uint(A.size1()),
+ cl_uint(A.internal_size1()),
+ cl_uint(A.ell_nnz()),
+ cl_uint(A.internal_ellnnz()),
+ viennacl::traits::opencl_handle(d_A),
+ cl_uint(viennacl::traits::start1(d_A)), cl_uint(viennacl::traits::start2(d_A)),
+ cl_uint(viennacl::traits::stride1(d_A)), cl_uint(viennacl::traits::stride2(d_A)),
+ cl_uint(viennacl::traits::size1(d_A)), cl_uint(viennacl::traits::size2(d_A)),
+ cl_uint(viennacl::traits::internal_size1(d_A)), cl_uint(viennacl::traits::internal_size2(d_A)),
+ viennacl::traits::opencl_handle(y),
+ cl_uint(viennacl::traits::start1(y)), cl_uint(viennacl::traits::start2(y)),
+ cl_uint(viennacl::traits::stride1(y)), cl_uint(viennacl::traits::stride2(y)),
+ cl_uint(viennacl::traits::size1(y)), cl_uint(viennacl::traits::size2(y)),
+ cl_uint(viennacl::traits::internal_size1(y)), cl_uint(viennacl::traits::internal_size2(y))
+ )
+ );
+}
+
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::hyb_matrix<NumericT, AlignmentV> const & A,
+ viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
+ const viennacl::matrix_base<NumericT>,
+ viennacl::op_trans > const & d_A,
+ viennacl::matrix_base<NumericT> & y)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::hyb_matrix<NumericT>::init(ctx);
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::hyb_matrix<NumericT>::program_name(),
+ detail::sparse_dense_matmult_kernel_name(true, d_A.lhs().row_major(), y.row_major()));
+
+ viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
+ A.handle().opencl_handle(),
+ A.handle3().opencl_handle(),
+ A.handle4().opencl_handle(),
+ A.handle5().opencl_handle(),
+ cl_uint(A.size1()),
+ cl_uint(A.internal_size1()),
+ cl_uint(A.ell_nnz()),
+ cl_uint(A.internal_ellnnz()),
+ viennacl::traits::opencl_handle(d_A.lhs()),
+ cl_uint(viennacl::traits::start1(d_A.lhs())), cl_uint(viennacl::traits::start2(d_A.lhs())),
+ cl_uint(viennacl::traits::stride1(d_A.lhs())), cl_uint(viennacl::traits::stride2(d_A.lhs())),
+ cl_uint(viennacl::traits::size1(d_A.lhs())), cl_uint(viennacl::traits::size2(d_A.lhs())),
+ cl_uint(viennacl::traits::internal_size1(d_A.lhs())), cl_uint(viennacl::traits::internal_size2(d_A.lhs())),
+ viennacl::traits::opencl_handle(y),
+ cl_uint(viennacl::traits::start1(y)), cl_uint(viennacl::traits::start2(y)),
+ cl_uint(viennacl::traits::stride1(y)), cl_uint(viennacl::traits::stride2(y)),
+ cl_uint(viennacl::traits::size1(y)), cl_uint(viennacl::traits::size2(y)),
+ cl_uint(viennacl::traits::internal_size1(y)), cl_uint(viennacl::traits::internal_size2(y))
+ )
+ );
+}
+
+
+} // namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/vandermonde_matrix_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/vandermonde_matrix_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/vandermonde_matrix_operations.hpp
new file mode 100644
index 0000000..6a25d81
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/vandermonde_matrix_operations.hpp
@@ -0,0 +1,68 @@
+#ifndef VIENNACL_LINALG_OPENCL_VANDERMONDE_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_VANDERMONDE_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/opencl/vandermonde_matrix_operations.hpp
+ @brief Implementations of operations using vandermonde_matrix
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/ocl/backend.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/fft.hpp"
+#include "viennacl/linalg/opencl/kernels/fft.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+
+/** @brief Carries out matrix-vector multiplication with a vandermonde_matrix
+*
+* Implementation of the convenience expression y = prod(A, x);
+*
+* @param A The Vandermonde matrix
+* @param x The vector
+* @param y The result vector
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::vandermonde_matrix<NumericT, AlignmentV> const & A,
+ viennacl::vector_base<NumericT> const & x,
+ viennacl::vector_base<NumericT> & y)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+ viennacl::ocl::kernel & kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "vandermonde_prod");
+ viennacl::ocl::enqueue(kernel(viennacl::traits::opencl_handle(A),
+ viennacl::traits::opencl_handle(x),
+ viennacl::traits::opencl_handle(y),
+ static_cast<cl_uint>(A.size1())));
+}
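+
+// Illustrative usage sketch (not part of the upstream header; names are assumptions):
+// the Vandermonde product above reuses the FFT kernel program; user code again goes through
+// y = viennacl::linalg::prod(A, x).
+template<typename NumericT>
+void example_vandermonde_matvec(viennacl::vandermonde_matrix<NumericT> const & A,
+                                viennacl::vector_base<NumericT> const & x,
+                                viennacl::vector_base<NumericT> & y)
+{
+  prod_impl(A, x, y);  // y = A * x via the "vandermonde_prod" kernel
+}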
+
+} //namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
[02/51] [partial] mahout git commit: (nojira) add native-viennaCL
module to codebase. closes apache/mahout#241
Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/vector_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/vector_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/vector_operations.hpp
new file mode 100644
index 0000000..cd04482
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/vector_operations.hpp
@@ -0,0 +1,1263 @@
+#ifndef VIENNACL_LINALG_OPENCL_VECTOR_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_VECTOR_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/opencl/vector_operations.hpp
+ @brief Implementations of vector operations using OpenCL
+*/
+
+#include <cmath>
+
+#include "viennacl/forwards.h"
+#include "viennacl/detail/vector_def.hpp"
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/opencl/common.hpp"
+#include "viennacl/linalg/opencl/kernels/vector.hpp"
+#include "viennacl/linalg/opencl/kernels/vector_element.hpp"
+#include "viennacl/linalg/opencl/kernels/scan.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+
+//
+// Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here!
+//
+template<typename DestNumericT, typename SrcNumericT>
+void convert(vector_base<DestNumericT> & dest, vector_base<SrcNumericT> const & src)
+{
+ assert(viennacl::traits::opencl_handle(dest).context() == viennacl::traits::opencl_handle(src).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+ std::string kernel_name("convert_");
+ kernel_name += viennacl::ocl::type_to_string<DestNumericT>::apply();
+ kernel_name += "_";
+ kernel_name += viennacl::ocl::type_to_string<SrcNumericT>::apply();
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(dest).context());
+ viennacl::linalg::opencl::kernels::vector_convert::init(ctx);
+ viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_convert::program_name(), kernel_name);
+
+ viennacl::ocl::enqueue(k( dest, cl_uint(dest.start()), cl_uint(dest.stride()), cl_uint(dest.size()),
+ src, cl_uint( src.start()), cl_uint( src.stride())
+ ) );
+
+}
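+
+// Illustrative usage sketch (not part of the upstream header; names are assumptions):
+// element-wise conversion between vectors of different scalar types, e.g. demoting a
+// double-precision vector to single precision. Both vectors must reside in the same
+// OpenCL context (see the assert above).
+template<typename DestT, typename SrcT>
+void example_vector_convert(vector_base<DestT> & dest, vector_base<SrcT> const & src)
+{
+  convert(dest, src);  // launches the generated "convert_<dest>_<src>" kernel
+}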
+
+template <typename T, typename ScalarType1>
+void av(vector_base<T> & vec1,
+ vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
+{
+ assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+ viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
+
+ cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(),
+ (viennacl::is_cpu_scalar<ScalarType1>::value ? "av_cpu" : "av_gpu"));
+ k.global_work_size(0, std::min<vcl_size_t>(128 * k.local_work_size(),
+ viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size(vec1), k.local_work_size()) ) );
+
+ viennacl::ocl::packed_cl_uint size_vec1;
+ size_vec1.start = cl_uint(viennacl::traits::start(vec1));
+ size_vec1.stride = cl_uint(viennacl::traits::stride(vec1));
+ size_vec1.size = cl_uint(viennacl::traits::size(vec1));
+ size_vec1.internal_size = cl_uint(viennacl::traits::internal_size(vec1));
+
+ viennacl::ocl::packed_cl_uint size_vec2;
+ size_vec2.start = cl_uint(viennacl::traits::start(vec2));
+ size_vec2.stride = cl_uint(viennacl::traits::stride(vec2));
+ size_vec2.size = cl_uint(viennacl::traits::size(vec2));
+ size_vec2.internal_size = cl_uint(viennacl::traits::internal_size(vec2));
+
+
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
+ size_vec1,
+
+ viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<T>(alpha)),
+ options_alpha,
+ viennacl::traits::opencl_handle(vec2),
+ size_vec2 )
+ );
+}
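+// Illustrative note: av() realizes v1 = alpha * v2; the packed options encode the length of
+// alpha as well as whether it enters reciprocally (v1 = v2 / alpha) or sign-flipped.
+// A host-side expression that is expected to be lowered to this kernel is, for instance:
+//
+//   viennacl::vector<float> x(1000), y(1000);
+//   float alpha = 2.0f;
+//   x = alpha * y;   // frontend dispatch is expected to end up in av(x, y, alpha, 1, false, false)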
+
+
+template <typename T, typename ScalarType1, typename ScalarType2>
+void avbv(vector_base<T> & vec1,
+ vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+ vector_base<T> const & vec3, ScalarType2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
+{
+ assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+ assert(viennacl::traits::opencl_handle(vec2).context() == viennacl::traits::opencl_handle(vec3).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+ viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
+
+ std::string kernel_name;
+ if (viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value)
+ kernel_name = "avbv_cpu_cpu";
+ else if (viennacl::is_cpu_scalar<ScalarType1>::value && !viennacl::is_cpu_scalar<ScalarType2>::value)
+ kernel_name = "avbv_cpu_gpu";
+ else if (!viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value)
+ kernel_name = "avbv_gpu_cpu";
+ else
+ kernel_name = "avbv_gpu_gpu";
+
+ cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+ cl_uint options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), kernel_name);
+ k.global_work_size(0, std::min<vcl_size_t>(128 * k.local_work_size(),
+ viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size(vec1), k.local_work_size()) ) );
+
+ viennacl::ocl::packed_cl_uint size_vec1;
+ size_vec1.start = cl_uint(viennacl::traits::start(vec1));
+ size_vec1.stride = cl_uint(viennacl::traits::stride(vec1));
+ size_vec1.size = cl_uint(viennacl::traits::size(vec1));
+ size_vec1.internal_size = cl_uint(viennacl::traits::internal_size(vec1));
+
+ viennacl::ocl::packed_cl_uint size_vec2;
+ size_vec2.start = cl_uint(viennacl::traits::start(vec2));
+ size_vec2.stride = cl_uint(viennacl::traits::stride(vec2));
+ size_vec2.size = cl_uint(viennacl::traits::size(vec2));
+ size_vec2.internal_size = cl_uint(viennacl::traits::internal_size(vec2));
+
+ viennacl::ocl::packed_cl_uint size_vec3;
+ size_vec3.start = cl_uint(viennacl::traits::start(vec3));
+ size_vec3.stride = cl_uint(viennacl::traits::stride(vec3));
+ size_vec3.size = cl_uint(viennacl::traits::size(vec3));
+ size_vec3.internal_size = cl_uint(viennacl::traits::internal_size(vec3));
+
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
+ size_vec1,
+
+ viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<T>(alpha)),
+ options_alpha,
+ viennacl::traits::opencl_handle(vec2),
+ size_vec2,
+
+ viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<T>(beta)),
+ options_beta,
+ viennacl::traits::opencl_handle(vec3),
+ size_vec3 )
+ );
+}
+
+
+template <typename T, typename ScalarType1, typename ScalarType2>
+void avbv_v(vector_base<T> & vec1,
+ vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+ vector_base<T> const & vec3, ScalarType2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
+{
+ assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+ assert(viennacl::traits::opencl_handle(vec2).context() == viennacl::traits::opencl_handle(vec3).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+ viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
+
+ std::string kernel_name;
+ if (viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value)
+ kernel_name = "avbv_v_cpu_cpu";
+ else if (viennacl::is_cpu_scalar<ScalarType1>::value && !viennacl::is_cpu_scalar<ScalarType2>::value)
+ kernel_name = "avbv_v_cpu_gpu";
+ else if (!viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value)
+ kernel_name = "avbv_v_gpu_cpu";
+ else
+ kernel_name = "avbv_v_gpu_gpu";
+
+ cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+ cl_uint options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), kernel_name);
+ k.global_work_size(0, std::min<vcl_size_t>(128 * k.local_work_size(),
+ viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size(vec1), k.local_work_size()) ) );
+
+ viennacl::ocl::packed_cl_uint size_vec1;
+ size_vec1.start = cl_uint(viennacl::traits::start(vec1));
+ size_vec1.stride = cl_uint(viennacl::traits::stride(vec1));
+ size_vec1.size = cl_uint(viennacl::traits::size(vec1));
+ size_vec1.internal_size = cl_uint(viennacl::traits::internal_size(vec1));
+
+ viennacl::ocl::packed_cl_uint size_vec2;
+ size_vec2.start = cl_uint(viennacl::traits::start(vec2));
+ size_vec2.stride = cl_uint(viennacl::traits::stride(vec2));
+ size_vec2.size = cl_uint(viennacl::traits::size(vec2));
+ size_vec2.internal_size = cl_uint(viennacl::traits::internal_size(vec2));
+
+ viennacl::ocl::packed_cl_uint size_vec3;
+ size_vec3.start = cl_uint(viennacl::traits::start(vec3));
+ size_vec3.stride = cl_uint(viennacl::traits::stride(vec3));
+ size_vec3.size = cl_uint(viennacl::traits::size(vec3));
+ size_vec3.internal_size = cl_uint(viennacl::traits::internal_size(vec3));
+
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
+ size_vec1,
+
+ viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<T>(alpha)),
+ options_alpha,
+ viennacl::traits::opencl_handle(vec2),
+ size_vec2,
+
+ viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<T>(beta)),
+ options_beta,
+ viennacl::traits::opencl_handle(vec3),
+ size_vec3 )
+ );
+}
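+// Illustrative note: avbv() computes v1 = alpha * v2 + beta * v3, while avbv_v() computes the
+// accumulating variant v1 += alpha * v2 + beta * v3. Expressions such as
+//
+//   viennacl::vector<float> x(1000), y(1000), z(1000);
+//   x  = 2.0f * y - z;        // avbv:   alpha = 2, beta = 1 with flip_sign_beta set
+//   x += y / 4.0f + z;        // avbv_v: reciprocal_alpha set, beta = 1
+//
+// are expected to be lowered to these kernels by the expression-template frontend.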
+
+
+/** @brief Assign a constant value to a vector (-range/-slice)
+*
+* @param vec1 The vector to which the value should be assigned
+* @param alpha The value to be assigned
+* @param up_to_internal_size Specifies whether alpha should also be written to padded memory (mostly used for clearing the whole buffer).
+*/
+template <typename T>
+void vector_assign(vector_base<T> & vec1, const T & alpha, bool up_to_internal_size = false)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+ viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "assign_cpu");
+ k.global_work_size(0, std::min<vcl_size_t>(128 * k.local_work_size(),
+ viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size(vec1), k.local_work_size()) ) );
+
+ cl_uint size = up_to_internal_size ? cl_uint(vec1.internal_size()) : cl_uint(viennacl::traits::size(vec1));
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
+ cl_uint(viennacl::traits::start(vec1)),
+ cl_uint(viennacl::traits::stride(vec1)),
+ size,
+ cl_uint(vec1.internal_size()), //Note: Do NOT use traits::internal_size() here, because vector proxies don't require padding.
+ viennacl::traits::opencl_handle(T(alpha)) )
+ );
+}
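+// Usage sketch (illustrative; assumes a valid OpenCL context for the vector):
+//
+//   viennacl::vector<float> v(1000);
+//   viennacl::linalg::opencl::vector_assign(v, 1.0f);        // v[i] = 1 for all i
+//   viennacl::linalg::opencl::vector_assign(v, 0.0f, true);  // additionally clears the padded entries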
+
+
+/** @brief Swaps the contents of two vectors, data is copied
+*
+* @param vec1 The first vector (or -range, or -slice)
+* @param vec2 The second vector (or -range, or -slice)
+*/
+template <typename T>
+void vector_swap(vector_base<T> & vec1, vector_base<T> & vec2)
+{
+ assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+ viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "swap");
+
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
+ cl_uint(viennacl::traits::start(vec1)),
+ cl_uint(viennacl::traits::stride(vec1)),
+ cl_uint(viennacl::traits::size(vec1)),
+ viennacl::traits::opencl_handle(vec2),
+ cl_uint(viennacl::traits::start(vec2)),
+ cl_uint(viennacl::traits::stride(vec2)),
+ cl_uint(viennacl::traits::size(vec2)))
+ );
+}
+
+///////////////////////// Binary Elementwise operations /////////////
+
+/** @brief Implementation of the element-wise operations v1 = v2 .* v3, v1 = v2 ./ v3, and v1 = v2 .^ v3 (using MATLAB syntax)
+*
+* @param vec1 The result vector (or -range, or -slice)
+* @param proxy The proxy object holding v2, v3 and the operation
+*/
+template <typename T, typename OP>
+void element_op(vector_base<T> & vec1,
+ vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> > const & proxy)
+{
+ assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(proxy.lhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+ assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(proxy.rhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+ viennacl::linalg::opencl::kernels::vector_element<T>::init(ctx);
+
+ std::string kernel_name = "element_pow";
+ cl_uint op_type = 2; //0: product, 1: division, 2: power
+ if (viennacl::is_division<OP>::value)
+ {
+ op_type = 1;
+ kernel_name = "element_div";
+ }
+ else if (viennacl::is_product<OP>::value)
+ {
+ op_type = 0;
+ kernel_name = "element_prod";
+ }
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_element<T>::program_name(), kernel_name);
+
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
+ cl_uint(viennacl::traits::start(vec1)),
+ cl_uint(viennacl::traits::stride(vec1)),
+ cl_uint(viennacl::traits::size(vec1)),
+
+ viennacl::traits::opencl_handle(proxy.lhs()),
+ cl_uint(viennacl::traits::start(proxy.lhs())),
+ cl_uint(viennacl::traits::stride(proxy.lhs())),
+
+ viennacl::traits::opencl_handle(proxy.rhs()),
+ cl_uint(viennacl::traits::start(proxy.rhs())),
+ cl_uint(viennacl::traits::stride(proxy.rhs())),
+
+ op_type)
+ );
+}
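+// Usage sketch (illustrative): the generic frontends element_prod(), element_div() and
+// element_pow() build the vector_expression consumed above, e.g.
+//
+//   viennacl::vector<float> a(64), b(64), c(64);
+//   c = viennacl::linalg::element_prod(a, b);   // c[i] = a[i] * b[i]
+//   c = viennacl::linalg::element_div(a, b);    // c[i] = a[i] / b[i]
+//   c = viennacl::linalg::element_pow(a, b);    // c[i] = pow(a[i], b[i])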
+
+///////////////////////// Unary Elementwise operations /////////////
+
+/** @brief Implementation of unary element-wise operations v1 = OP(v2)
+*
+* @param vec1 The result vector (or -range, or -slice)
+* @param proxy The proxy object holding v2 and the operation
+*/
+template <typename T, typename OP>
+void element_op(vector_base<T> & vec1,
+ vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<OP> > const & proxy)
+{
+ assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(proxy.lhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+ assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(proxy.rhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+ viennacl::linalg::opencl::kernels::vector_element<T>::init(ctx);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_element<T>::program_name(), detail::op_to_string(OP()) + "_assign");
+
+ viennacl::ocl::packed_cl_uint size_vec1;
+ size_vec1.start = cl_uint(viennacl::traits::start(vec1));
+ size_vec1.stride = cl_uint(viennacl::traits::stride(vec1));
+ size_vec1.size = cl_uint(viennacl::traits::size(vec1));
+ size_vec1.internal_size = cl_uint(viennacl::traits::internal_size(vec1));
+
+ viennacl::ocl::packed_cl_uint size_vec2;
+ size_vec2.start = cl_uint(viennacl::traits::start(proxy.lhs()));
+ size_vec2.stride = cl_uint(viennacl::traits::stride(proxy.lhs()));
+ size_vec2.size = cl_uint(viennacl::traits::size(proxy.lhs()));
+ size_vec2.internal_size = cl_uint(viennacl::traits::internal_size(proxy.lhs()));
+
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
+ size_vec1,
+ viennacl::traits::opencl_handle(proxy.lhs()),
+ size_vec2)
+ );
+}
+
+///////////////////////// Norms and inner product ///////////////////
+
+/** @brief Computes the partial inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
+*
+* @param vec1 The first vector
+* @param vec2 The second vector
+* @param partial_result The results of each group
+*/
+template <typename T>
+void inner_prod_impl(vector_base<T> const & vec1,
+ vector_base<T> const & vec2,
+ vector_base<T> & partial_result)
+{
+ assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+ assert(viennacl::traits::opencl_handle(vec2).context() == viennacl::traits::opencl_handle(partial_result).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+ viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
+
+ assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
+ && bool("Incompatible vector sizes in inner_prod_impl()!"));
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "inner_prod1");
+
+ assert( (k.global_work_size() / k.local_work_size() <= partial_result.size()) && bool("Size mismatch for partial reduction in inner_prod_impl()") );
+
+ viennacl::ocl::packed_cl_uint size_vec1;
+ size_vec1.start = cl_uint(viennacl::traits::start(vec1));
+ size_vec1.stride = cl_uint(viennacl::traits::stride(vec1));
+ size_vec1.size = cl_uint(viennacl::traits::size(vec1));
+ size_vec1.internal_size = cl_uint(viennacl::traits::internal_size(vec1));
+
+ viennacl::ocl::packed_cl_uint size_vec2;
+ size_vec2.start = cl_uint(viennacl::traits::start(vec2));
+ size_vec2.stride = cl_uint(viennacl::traits::stride(vec2));
+ size_vec2.size = cl_uint(viennacl::traits::size(vec2));
+ size_vec2.internal_size = cl_uint(viennacl::traits::internal_size(vec2));
+
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
+ size_vec1,
+ viennacl::traits::opencl_handle(vec2),
+ size_vec2,
+ viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * k.local_work_size()),
+ viennacl::traits::opencl_handle(partial_result)
+ )
+ );
+}
+
+
+//implementation of inner product:
+//namespace {
+/** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
+*
+* @param vec1 The first vector
+* @param vec2 The second vector
+* @param result The result scalar (on the gpu)
+*/
+template <typename T>
+void inner_prod_impl(vector_base<T> const & vec1,
+ vector_base<T> const & vec2,
+ scalar<T> & result)
+{
+ assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+ assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+
+ vcl_size_t work_groups = 128;
+ viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec1));
+ temp.resize(work_groups, ctx); // bring default-constructed vectors to the correct size:
+
+ // Step 1: Compute partial inner products for each work group:
+ inner_prod_impl(vec1, vec2, temp);
+
+ // Step 2: Sum partial results:
+ viennacl::ocl::kernel & ksum = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "sum");
+
+ ksum.global_work_size(0, ksum.local_work_size(0));
+ viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
+ cl_uint(viennacl::traits::start(temp)),
+ cl_uint(viennacl::traits::stride(temp)),
+ cl_uint(viennacl::traits::size(temp)),
+ cl_uint(1),
+ viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * ksum.local_work_size()),
+ viennacl::traits::opencl_handle(result) )
+ );
+}
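+// Usage sketch (illustrative): library users call the generic viennacl::linalg::inner_prod(),
+// which dispatches to the device-scalar or host-scalar implementations in this file:
+//
+//   viennacl::vector<float> x(4096), y(4096);
+//   viennacl::scalar<float> s_dev  = viennacl::linalg::inner_prod(x, y);  // result stays on the device
+//   float                   s_host = viennacl::linalg::inner_prod(x, y);  // result copied to the host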
+
+namespace detail
+{
+ template<typename NumericT>
+ viennacl::ocl::packed_cl_uint make_layout(vector_base<NumericT> const & vec)
+ {
+ viennacl::ocl::packed_cl_uint ret;
+ ret.start = cl_uint(viennacl::traits::start(vec));
+ ret.stride = cl_uint(viennacl::traits::stride(vec));
+ ret.size = cl_uint(viennacl::traits::size(vec));
+ ret.internal_size = cl_uint(viennacl::traits::internal_size(vec));
+ return ret;
+ }
+}
+
+/** @brief Computes multiple inner products where one argument is common to all inner products. <x, y1>, <x, y2>, ..., <x, yN>
+*
+* @param x The common vector
+* @param vec_tuple The tuple of vectors y1, y2, ..., yN
+* @param result The result vector
+*/
+template <typename NumericT>
+void inner_prod_impl(vector_base<NumericT> const & x,
+ vector_tuple<NumericT> const & vec_tuple,
+ vector_base<NumericT> & result)
+{
+ assert(viennacl::traits::opencl_handle(x).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(x).context());
+ viennacl::linalg::opencl::kernels::vector<NumericT>::init(ctx);
+ viennacl::linalg::opencl::kernels::vector_multi_inner_prod<NumericT>::init(ctx);
+
+ viennacl::ocl::packed_cl_uint layout_x = detail::make_layout(x);
+
+ viennacl::ocl::kernel & ksum = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_multi_inner_prod<NumericT>::program_name(), "sum_inner_prod");
+ viennacl::ocl::kernel & inner_prod_kernel_1 = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<NumericT>::program_name(), "inner_prod1");
+ viennacl::ocl::kernel & inner_prod_kernel_2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_multi_inner_prod<NumericT>::program_name(), "inner_prod2");
+ viennacl::ocl::kernel & inner_prod_kernel_3 = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_multi_inner_prod<NumericT>::program_name(), "inner_prod3");
+ viennacl::ocl::kernel & inner_prod_kernel_4 = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_multi_inner_prod<NumericT>::program_name(), "inner_prod4");
+ viennacl::ocl::kernel & inner_prod_kernel_8 = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_multi_inner_prod<NumericT>::program_name(), "inner_prod8");
+
+ vcl_size_t work_groups = inner_prod_kernel_8.global_work_size(0) / inner_prod_kernel_8.local_work_size(0);
+ viennacl::vector<NumericT> temp(8 * work_groups, viennacl::traits::context(x));
+
+ vcl_size_t current_index = 0;
+ while (current_index < vec_tuple.const_size())
+ {
+ switch (vec_tuple.const_size() - current_index)
+ {
+ case 7:
+ case 6:
+ case 5:
+ case 4:
+ {
+ vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index );
+ vector_base<NumericT> const & y1 = vec_tuple.const_at(current_index + 1);
+ vector_base<NumericT> const & y2 = vec_tuple.const_at(current_index + 2);
+ vector_base<NumericT> const & y3 = vec_tuple.const_at(current_index + 3);
+ viennacl::ocl::enqueue(inner_prod_kernel_4( viennacl::traits::opencl_handle(x), layout_x,
+ viennacl::traits::opencl_handle(y0), detail::make_layout(y0),
+ viennacl::traits::opencl_handle(y1), detail::make_layout(y1),
+ viennacl::traits::opencl_handle(y2), detail::make_layout(y2),
+ viennacl::traits::opencl_handle(y3), detail::make_layout(y3),
+ viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * 4 * inner_prod_kernel_4.local_work_size()),
+ viennacl::traits::opencl_handle(temp)
+ ) );
+
+ ksum.global_work_size(0, 4 * ksum.local_work_size(0));
+ viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
+ cl_uint(work_groups),
+ viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * 4 * ksum.local_work_size()),
+ viennacl::traits::opencl_handle(result),
+ cl_uint(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
+ cl_uint(viennacl::traits::stride(result))
+ )
+ );
+ }
+ current_index += 4;
+ break;
+
+ case 3:
+ {
+ vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index );
+ vector_base<NumericT> const & y1 = vec_tuple.const_at(current_index + 1);
+ vector_base<NumericT> const & y2 = vec_tuple.const_at(current_index + 2);
+ viennacl::ocl::enqueue(inner_prod_kernel_3( viennacl::traits::opencl_handle(x), layout_x,
+ viennacl::traits::opencl_handle(y0), detail::make_layout(y0),
+ viennacl::traits::opencl_handle(y1), detail::make_layout(y1),
+ viennacl::traits::opencl_handle(y2), detail::make_layout(y2),
+ viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * 3 * inner_prod_kernel_3.local_work_size()),
+ viennacl::traits::opencl_handle(temp)
+ ) );
+
+ ksum.global_work_size(0, 3 * ksum.local_work_size(0));
+ viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
+ cl_uint(work_groups),
+ viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * 3 * ksum.local_work_size()),
+ viennacl::traits::opencl_handle(result),
+ cl_uint(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
+ cl_uint(viennacl::traits::stride(result))
+ )
+ );
+ }
+ current_index += 3;
+ break;
+
+ case 2:
+ {
+ vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index );
+ vector_base<NumericT> const & y1 = vec_tuple.const_at(current_index + 1);
+ viennacl::ocl::enqueue(inner_prod_kernel_2( viennacl::traits::opencl_handle(x), layout_x,
+ viennacl::traits::opencl_handle(y0), detail::make_layout(y0),
+ viennacl::traits::opencl_handle(y1), detail::make_layout(y1),
+ viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * 2 * inner_prod_kernel_2.local_work_size()),
+ viennacl::traits::opencl_handle(temp)
+ ) );
+
+ ksum.global_work_size(0, 2 * ksum.local_work_size(0));
+ viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
+ cl_uint(work_groups),
+ viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * 2 * ksum.local_work_size()),
+ viennacl::traits::opencl_handle(result),
+ cl_uint(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
+ cl_uint(viennacl::traits::stride(result))
+ )
+ );
+ }
+ current_index += 2;
+ break;
+
+ case 1:
+ {
+ vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index );
+ viennacl::ocl::enqueue(inner_prod_kernel_1( viennacl::traits::opencl_handle(x), layout_x,
+ viennacl::traits::opencl_handle(y0), detail::make_layout(y0),
+ viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * 1 * inner_prod_kernel_1.local_work_size()),
+ viennacl::traits::opencl_handle(temp)
+ ) );
+
+ ksum.global_work_size(0, 1 * ksum.local_work_size(0));
+ viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
+ cl_uint(work_groups),
+ viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * 1 * ksum.local_work_size()),
+ viennacl::traits::opencl_handle(result),
+ cl_uint(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
+ cl_uint(viennacl::traits::stride(result))
+ )
+ );
+ }
+ current_index += 1;
+ break;
+
+ default: //8 or more vectors
+ {
+ vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index );
+ vector_base<NumericT> const & y1 = vec_tuple.const_at(current_index + 1);
+ vector_base<NumericT> const & y2 = vec_tuple.const_at(current_index + 2);
+ vector_base<NumericT> const & y3 = vec_tuple.const_at(current_index + 3);
+ vector_base<NumericT> const & y4 = vec_tuple.const_at(current_index + 4);
+ vector_base<NumericT> const & y5 = vec_tuple.const_at(current_index + 5);
+ vector_base<NumericT> const & y6 = vec_tuple.const_at(current_index + 6);
+ vector_base<NumericT> const & y7 = vec_tuple.const_at(current_index + 7);
+ viennacl::ocl::enqueue(inner_prod_kernel_8( viennacl::traits::opencl_handle(x), layout_x,
+ viennacl::traits::opencl_handle(y0), detail::make_layout(y0),
+ viennacl::traits::opencl_handle(y1), detail::make_layout(y1),
+ viennacl::traits::opencl_handle(y2), detail::make_layout(y2),
+ viennacl::traits::opencl_handle(y3), detail::make_layout(y3),
+ viennacl::traits::opencl_handle(y4), detail::make_layout(y4),
+ viennacl::traits::opencl_handle(y5), detail::make_layout(y5),
+ viennacl::traits::opencl_handle(y6), detail::make_layout(y6),
+ viennacl::traits::opencl_handle(y7), detail::make_layout(y7),
+ viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * 8 * inner_prod_kernel_8.local_work_size()),
+ viennacl::traits::opencl_handle(temp)
+ ) );
+
+ ksum.global_work_size(0, 8 * ksum.local_work_size(0));
+ viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
+ cl_uint(work_groups),
+ viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * 8 * ksum.local_work_size()),
+ viennacl::traits::opencl_handle(result),
+ cl_uint(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
+ cl_uint(viennacl::traits::stride(result))
+ )
+ );
+ }
+ current_index += 8;
+ break;
+ }
+ }
+
+}
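+// Usage sketch (illustrative): several inner products sharing the argument x are requested
+// through viennacl::tie(), which builds the vector_tuple consumed above:
+//
+//   viennacl::vector<float> x(1000), y0(1000), y1(1000), y2(1000);
+//   viennacl::vector<float> result(3);
+//   result = viennacl::linalg::inner_prod(x, viennacl::tie(y0, y1, y2));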
+
+
+
+//implementation of inner product:
+//namespace {
+/** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
+*
+* @param vec1 The first vector
+* @param vec2 The second vector
+* @param result The result scalar (on the gpu)
+*/
+template <typename T>
+void inner_prod_cpu(vector_base<T> const & vec1,
+ vector_base<T> const & vec2,
+ T & result)
+{
+ assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+
+ vcl_size_t work_groups = 128;
+ viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec1));
+ temp.resize(work_groups, ctx); // bring default-constructed vectors to the correct size:
+
+ // Step 1: Compute partial inner products for each work group:
+ inner_prod_impl(vec1, vec2, temp);
+
+ // Step 2: Sum partial results:
+
+ // Now copy partial results from GPU back to CPU and run reduction there:
+ std::vector<T> temp_cpu(work_groups);
+ viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
+
+ result = 0;
+ for (typename std::vector<T>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
+ result += *it;
+}
+
+
+//////////// Helper for norms
+
+/** @brief Computes the partial work group results for vector norms
+*
+* @param vec The vector
+* @param partial_result The result scalar
+* @param norm_id Norm selector. 0: norm_inf, 1: norm_1, 2: norm_2
+*/
+template <typename T>
+void norm_reduction_impl(vector_base<T> const & vec,
+ vector_base<T> & partial_result,
+ cl_uint norm_id)
+{
+ assert(viennacl::traits::opencl_handle(vec).context() == viennacl::traits::opencl_handle(partial_result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());
+ viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "norm");
+
+ assert( (k.global_work_size() / k.local_work_size() <= partial_result.size()) && bool("Size mismatch for partial reduction in norm_reduction_impl()") );
+
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec),
+ cl_uint(viennacl::traits::start(vec)),
+ cl_uint(viennacl::traits::stride(vec)),
+ cl_uint(viennacl::traits::size(vec)),
+ cl_uint(norm_id),
+ viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * k.local_work_size()),
+ viennacl::traits::opencl_handle(partial_result) )
+ );
+}
+
+
+//////////// Norm 1
+
+/** @brief Computes the l^1-norm of a vector
+*
+* @param vec The vector
+* @param result The result scalar
+*/
+template <typename T>
+void norm_1_impl(vector_base<T> const & vec,
+ scalar<T> & result)
+{
+ assert(viennacl::traits::opencl_handle(vec).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());
+
+ vcl_size_t work_groups = 128;
+ viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));
+
+ // Step 1: Compute the partial work group results
+ norm_reduction_impl(vec, temp, 1);
+
+ // Step 2: Reduce the partial results using OpenCL
+ viennacl::ocl::kernel & ksum = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "sum");
+
+ ksum.global_work_size(0, ksum.local_work_size(0));
+ viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
+ cl_uint(viennacl::traits::start(temp)),
+ cl_uint(viennacl::traits::stride(temp)),
+ cl_uint(viennacl::traits::size(temp)),
+ cl_uint(1),
+ viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * ksum.local_work_size()),
+ result)
+ );
+}
+
+/** @brief Computes the l^1-norm of a vector with final reduction on CPU
+*
+* @param vec The vector
+* @param result The result scalar
+*/
+template <typename T>
+void norm_1_cpu(vector_base<T> const & vec,
+ T & result)
+{
+ vcl_size_t work_groups = 128;
+ viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));
+
+ // Step 1: Compute the partial work group results
+ norm_reduction_impl(vec, temp, 1);
+
+ // Step 2: Now copy partial results from GPU back to CPU and run reduction there:
+ typedef std::vector<typename viennacl::result_of::cl_type<T>::type> CPUVectorType;
+
+ CPUVectorType temp_cpu(work_groups);
+ viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
+
+ result = 0;
+ for (typename CPUVectorType::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
+ result += static_cast<T>(*it);
+}
+
+
+
+//////// Norm 2
+
+
+/** @brief Computes the l^2-norm of a vector - implementation using OpenCL summation at second step
+*
+* @param vec The vector
+* @param result The result scalar
+*/
+template <typename T>
+void norm_2_impl(vector_base<T> const & vec,
+ scalar<T> & result)
+{
+ assert(viennacl::traits::opencl_handle(vec).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());
+
+ vcl_size_t work_groups = 128;
+ viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));
+
+ // Step 1: Compute the partial work group results
+ norm_reduction_impl(vec, temp, 2);
+
+ // Step 2: Reduction via OpenCL
+ viennacl::ocl::kernel & ksum = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "sum");
+
+ ksum.global_work_size(0, ksum.local_work_size(0));
+ viennacl::ocl::enqueue( ksum(viennacl::traits::opencl_handle(temp),
+ cl_uint(viennacl::traits::start(temp)),
+ cl_uint(viennacl::traits::stride(temp)),
+ cl_uint(viennacl::traits::size(temp)),
+ cl_uint(2),
+ viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * ksum.local_work_size()),
+ result)
+ );
+}
+
+/** @brief Computes the l^2-norm of a vector with final reduction on CPU
+*
+* @param vec The vector
+* @param result The result scalar
+*/
+template <typename T>
+void norm_2_cpu(vector_base<T> const & vec,
+ T & result)
+{
+ vcl_size_t work_groups = 128;
+ viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));
+
+ // Step 1: Compute the partial work group results
+ norm_reduction_impl(vec, temp, 2);
+
+ // Step 2: Now copy partial results from GPU back to CPU and run reduction there:
+ typedef std::vector<typename viennacl::result_of::cl_type<T>::type> CPUVectorType;
+
+ CPUVectorType temp_cpu(work_groups);
+ viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
+
+ result = 0;
+ for (typename CPUVectorType::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
+ result += static_cast<T>(*it);
+ result = std::sqrt(result);
+}
+
+
+
+////////// Norm inf
+
+/** @brief Computes the supremum-norm of a vector
+*
+* @param vec The vector
+* @param result The result scalar
+*/
+template <typename T>
+void norm_inf_impl(vector_base<T> const & vec,
+ scalar<T> & result)
+{
+ assert(viennacl::traits::opencl_handle(vec).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());
+
+ vcl_size_t work_groups = 128;
+ viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));
+
+ // Step 1: Compute the partial work group results
+ norm_reduction_impl(vec, temp, 0);
+
+ // Step 2: Parallel reduction of the partial results using OpenCL
+ viennacl::ocl::kernel & ksum = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "sum");
+
+ ksum.global_work_size(0, ksum.local_work_size(0));
+ viennacl::ocl::enqueue( ksum(viennacl::traits::opencl_handle(temp),
+ cl_uint(viennacl::traits::start(temp)),
+ cl_uint(viennacl::traits::stride(temp)),
+ cl_uint(viennacl::traits::size(temp)),
+ cl_uint(0),
+ viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * ksum.local_work_size()),
+ result)
+ );
+}
+
+/** @brief Computes the supremum-norm of a vector with final reduction on CPU
+*
+* @param vec The vector
+* @param result The result scalar
+*/
+template <typename T>
+void norm_inf_cpu(vector_base<T> const & vec,
+ T & result)
+{
+ vcl_size_t work_groups = 128;
+ viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));
+
+ // Step 1: Compute the partial work group results
+ norm_reduction_impl(vec, temp, 0);
+
+ // Step 2: Now copy partial results from GPU back to CPU and run reduction there:
+ typedef std::vector<typename viennacl::result_of::cl_type<T>::type> CPUVectorType;
+
+ CPUVectorType temp_cpu(work_groups);
+ viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
+
+ result = 0;
+ for (typename CPUVectorType::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
+ result = std::max(result, static_cast<T>(*it));
+}
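+// Usage sketch (illustrative): the norm implementations above are reached through the generic
+// frontends viennacl::linalg::norm_1(), norm_2() and norm_inf(); the *_impl variants are used
+// when the result is a device scalar, the *_cpu variants when it is a host value:
+//
+//   viennacl::vector<float> v(2048);
+//   float n1 = viennacl::linalg::norm_1(v);
+//   float n2 = viennacl::linalg::norm_2(v);
+//   viennacl::scalar<float> ni = viennacl::linalg::norm_inf(v);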
+
+
+/////////// index norm_inf
+
+//This function should return a CPU scalar, otherwise statements like
+// vcl_rhs[index_norm_inf(vcl_rhs)]
+// are ambiguous
+/** @brief Computes the index of the first entry that is equal to the supremum-norm in modulus.
+*
+* @param vec The vector
+* @return The result. Note that the result must be a CPU scalar (unsigned int), since gpu scalars are floating point types.
+*/
+template <typename T>
+cl_uint index_norm_inf(vector_base<T> const & vec)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());
+ viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
+
+ viennacl::ocl::handle<cl_mem> h = ctx.create_memory(CL_MEM_READ_WRITE, sizeof(cl_uint));
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "index_norm_inf");
+ //cl_uint size = static_cast<cl_uint>(vcl_vec.internal_size());
+
+ //TODO: Use multi-group kernel for large vector sizes
+
+ k.global_work_size(0, k.local_work_size());
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec),
+ cl_uint(viennacl::traits::start(vec)),
+ cl_uint(viennacl::traits::stride(vec)),
+ cl_uint(viennacl::traits::size(vec)),
+ viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * k.local_work_size()),
+ viennacl::ocl::local_mem(sizeof(cl_uint) * k.local_work_size()), h));
+
+ //read value:
+ cl_uint result;
+ cl_int err = clEnqueueReadBuffer(ctx.get_queue().handle().get(), h.get(), CL_TRUE, 0, sizeof(cl_uint), &result, 0, NULL, NULL);
+ VIENNACL_ERR_CHECK(err);
+ return result;
+}
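+// Usage sketch (illustrative):
+//
+//   viennacl::vector<float> v(1000);
+//   cl_uint idx = viennacl::linalg::index_norm_inf(v);  // index of the entry largest in modulus
+//   float   val = v[idx];                               // unambiguous, since idx is a host integer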
+
+
+////////// max
+
+/** @brief Computes the maximum value of a vector, where the result is stored in an OpenCL buffer.
+*
+* @param x The vector
+* @param result The result scalar
+*/
+template<typename NumericT>
+void max_impl(vector_base<NumericT> const & x,
+ scalar<NumericT> & result)
+{
+ assert(viennacl::traits::opencl_handle(x).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(x).context());
+ viennacl::linalg::opencl::kernels::vector<NumericT>::init(ctx);
+
+ vcl_size_t work_groups = 128;
+ viennacl::vector<NumericT> temp(work_groups, viennacl::traits::context(x));
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<NumericT>::program_name(), "max_kernel");
+
+ k.global_work_size(0, work_groups * k.local_work_size(0));
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(x),
+ cl_uint(viennacl::traits::start(x)),
+ cl_uint(viennacl::traits::stride(x)),
+ cl_uint(viennacl::traits::size(x)),
+ viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * k.local_work_size()),
+ viennacl::traits::opencl_handle(temp)
+ ));
+
+ k.global_work_size(0, k.local_work_size());
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(temp),
+ cl_uint(viennacl::traits::start(temp)),
+ cl_uint(viennacl::traits::stride(temp)),
+ cl_uint(viennacl::traits::size(temp)),
+ viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * k.local_work_size()),
+ viennacl::traits::opencl_handle(result)
+ ));
+}
+
+/** @brief Computes the maximum value of a vector, where the result is stored in a host scalar.
+*
+* @param x The vector
+* @param result The result scalar
+*/
+template<typename NumericT>
+void max_cpu(vector_base<NumericT> const & x,
+ NumericT & result)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(x).context());
+ viennacl::linalg::opencl::kernels::vector<NumericT>::init(ctx);
+
+ vcl_size_t work_groups = 128;
+ viennacl::vector<NumericT> temp(work_groups, viennacl::traits::context(x));
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<NumericT>::program_name(), "max_kernel");
+
+ k.global_work_size(0, work_groups * k.local_work_size(0));
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(x),
+ cl_uint(viennacl::traits::start(x)),
+ cl_uint(viennacl::traits::stride(x)),
+ cl_uint(viennacl::traits::size(x)),
+ viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * k.local_work_size()),
+ viennacl::traits::opencl_handle(temp)
+ ));
+
+ // Step 2: Now copy partial results from GPU back to CPU and run reduction there:
+ typedef std::vector<typename viennacl::result_of::cl_type<NumericT>::type> CPUVectorType;
+
+ CPUVectorType temp_cpu(work_groups);
+ viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
+
+ result = static_cast<NumericT>(temp_cpu[0]);
+ for (typename CPUVectorType::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
+ result = std::max(result, static_cast<NumericT>(*it));
+
+}
+
+
+////////// min
+
+/** @brief Computes the minimum of a vector, where the result is stored in an OpenCL buffer.
+*
+* @param x The vector
+* @param result The result scalar
+*/
+template<typename NumericT>
+void min_impl(vector_base<NumericT> const & x,
+ scalar<NumericT> & result)
+{
+ assert(viennacl::traits::opencl_handle(x).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(x).context());
+ viennacl::linalg::opencl::kernels::vector<NumericT>::init(ctx);
+
+ vcl_size_t work_groups = 128;
+ viennacl::vector<NumericT> temp(work_groups, viennacl::traits::context(x));
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<NumericT>::program_name(), "min_kernel");
+
+ k.global_work_size(0, work_groups * k.local_work_size(0));
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(x),
+ cl_uint(viennacl::traits::start(x)),
+ cl_uint(viennacl::traits::stride(x)),
+ cl_uint(viennacl::traits::size(x)),
+ viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * k.local_work_size()),
+ viennacl::traits::opencl_handle(temp)
+ ));
+
+ k.global_work_size(0, k.local_work_size());
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(temp),
+ cl_uint(viennacl::traits::start(temp)),
+ cl_uint(viennacl::traits::stride(temp)),
+ cl_uint(viennacl::traits::size(temp)),
+ viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * k.local_work_size()),
+ viennacl::traits::opencl_handle(result)
+ ));
+}
+
+/** @brief Computes the minimum of a vector, where the result is stored in a host scalar.
+*
+* @param x The vector
+* @param result The result scalar
+*/
+template<typename NumericT>
+void min_cpu(vector_base<NumericT> const & x,
+ NumericT & result)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(x).context());
+ viennacl::linalg::opencl::kernels::vector<NumericT>::init(ctx);
+
+ vcl_size_t work_groups = 128;
+ viennacl::vector<NumericT> temp(work_groups, viennacl::traits::context(x));
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<NumericT>::program_name(), "min_kernel");
+
+ k.global_work_size(0, work_groups * k.local_work_size(0));
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(x),
+ cl_uint(viennacl::traits::start(x)),
+ cl_uint(viennacl::traits::stride(x)),
+ cl_uint(viennacl::traits::size(x)),
+ viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<NumericT>::type) * k.local_work_size()),
+ viennacl::traits::opencl_handle(temp)
+ ));
+
+ // Step 2: Now copy partial results from GPU back to CPU and run reduction there:
+ typedef std::vector<typename viennacl::result_of::cl_type<NumericT>::type> CPUVectorType;
+
+ CPUVectorType temp_cpu(work_groups);
+ viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
+
+ result = static_cast<NumericT>(temp_cpu[0]);
+ for (typename CPUVectorType::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
+ result = std::min(result, static_cast<NumericT>(*it));
+}
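+// Usage sketch (illustrative; assumes the generic viennacl::linalg::max()/min() frontends,
+// which dispatch to the implementations above):
+//
+//   viennacl::vector<float> v(1024);
+//   float vmax = viennacl::linalg::max(v);
+//   float vmin = viennacl::linalg::min(v);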
+
+////////// sum
+
+/** @brief Computes the sum over all entries of a vector
+*
+* @param x The vector
+* @param result The result scalar
+*/
+template<typename NumericT>
+void sum_impl(vector_base<NumericT> const & x,
+ scalar<NumericT> & result)
+{
+ assert(viennacl::traits::opencl_handle(x).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+ viennacl::vector<NumericT> all_ones = viennacl::scalar_vector<NumericT>(x.size(), NumericT(1), viennacl::traits::context(x));
+ viennacl::linalg::opencl::inner_prod_impl(x, all_ones, result);
+}
+
+/** @brief Computes the sum over all entries of a vector.
+*
+* @param x The vector
+* @param result The result scalar
+*/
+template<typename NumericT>
+void sum_cpu(vector_base<NumericT> const & x, NumericT & result)
+{
+ scalar<NumericT> tmp(0, viennacl::traits::context(x));
+ sum_impl(x, tmp);
+ result = tmp;
+}
+
+
+//TODO: Special case vec1 == vec2 allows improvement!!
+/** @brief Computes a plane rotation of two vectors.
+*
+* Computes (x,y) <- (alpha * x + beta * y, -beta * x + alpha * y)
+*
+* @param vec1 The first vector
+* @param vec2 The second vector
+* @param alpha The first transformation coefficient
+* @param beta The second transformation coefficient
+*/
+template <typename T>
+void plane_rotation(vector_base<T> & vec1,
+ vector_base<T> & vec2,
+ T alpha, T beta)
+{
+ assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
+ viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
+
+ assert(viennacl::traits::size(vec1) == viennacl::traits::size(vec2));
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "plane_rotation");
+
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
+ cl_uint(viennacl::traits::start(vec1)),
+ cl_uint(viennacl::traits::stride(vec1)),
+ cl_uint(viennacl::traits::size(vec1)),
+ viennacl::traits::opencl_handle(vec2),
+ cl_uint(viennacl::traits::start(vec2)),
+ cl_uint(viennacl::traits::stride(vec2)),
+ cl_uint(viennacl::traits::size(vec2)),
+ viennacl::traits::opencl_handle(alpha),
+ viennacl::traits::opencl_handle(beta))
+ );
+}
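+// Usage sketch (illustrative): a Givens rotation as used, e.g., in iterative solvers.
+// With c = cos(theta) and s = sin(theta), the call below overwrites both vectors in place:
+//
+//   viennacl::vector<float> x(512), y(512);
+//   float c = 0.8f, s = 0.6f;                       // c*c + s*s == 1
+//   viennacl::linalg::plane_rotation(x, y, c, s);   // (x, y) <- (c*x + s*y, -s*x + c*y)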
+
+
+//////////////////////////
+
+
+namespace detail
+{
+ /** @brief Worker routine for scan routines using OpenCL
+ *
+ * Note on performance: For non-in-place scans one could optimize away the temporary 'opencl_carries'-array.
+ * This, however, only provides small savings in the latency-dominated regime, yet would effectively double the amount of code to maintain.
+ */
+ template<typename NumericT>
+ void scan_impl(vector_base<NumericT> const & input,
+ vector_base<NumericT> & output,
+ bool is_inclusive)
+ {
+ vcl_size_t local_worksize = 128;
+ vcl_size_t workgroups = 128;
+
+ viennacl::backend::mem_handle opencl_carries;
+ viennacl::backend::memory_create(opencl_carries, sizeof(NumericT)*workgroups, viennacl::traits::context(input));
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input).context());
+ viennacl::linalg::opencl::kernels::scan<NumericT>::init(ctx);
+ viennacl::ocl::kernel& k1 = ctx.get_kernel(viennacl::linalg::opencl::kernels::scan<NumericT>::program_name(), "scan_1");
+ viennacl::ocl::kernel& k2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::scan<NumericT>::program_name(), "scan_2");
+ viennacl::ocl::kernel& k3 = ctx.get_kernel(viennacl::linalg::opencl::kernels::scan<NumericT>::program_name(), "scan_3");
+
+ // First step: Scan within each thread group and write carries
+ k1.local_work_size(0, local_worksize);
+ k1.global_work_size(0, workgroups * local_worksize);
+ viennacl::ocl::enqueue(k1( input, cl_uint( input.start()), cl_uint( input.stride()), cl_uint(input.size()),
+ output, cl_uint(output.start()), cl_uint(output.stride()),
+ cl_uint(is_inclusive ? 0 : 1), opencl_carries.opencl_handle())
+ );
+
+ // Second step: Compute offset for each thread group (exclusive scan for each thread group)
+ k2.local_work_size(0, workgroups);
+ k2.global_work_size(0, workgroups);
+ viennacl::ocl::enqueue(k2(opencl_carries.opencl_handle()));
+
+ // Third step: Offset each thread group accordingly
+ k3.local_work_size(0, local_worksize);
+ k3.global_work_size(0, workgroups * local_worksize);
+ viennacl::ocl::enqueue(k3(output, cl_uint(output.start()), cl_uint(output.stride()), cl_uint(output.size()),
+ opencl_carries.opencl_handle())
+ );
+ }
+}
+
+
+/** @brief This function implements an inclusive scan using OpenCL.
+*
+* @param input Input vector.
+* @param output The output vector. Either identical to input or non-overlapping.
+*/
+template<typename NumericT>
+void inclusive_scan(vector_base<NumericT> const & input,
+ vector_base<NumericT> & output)
+{
+ detail::scan_impl(input, output, true);
+}
+
+
+/** @brief This function implements an exclusive scan using OpenCL.
+*
+* @param input Input vector
+* @param output The output vector. Either identical to input or non-overlapping.
+*/
+template<typename NumericT>
+void exclusive_scan(vector_base<NumericT> const & input,
+ vector_base<NumericT> & output)
+{
+ detail::scan_impl(input, output, false);
+}
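+// Usage sketch (illustrative): for the input (1, 2, 3, 4) an inclusive scan yields
+// (1, 3, 6, 10) and an exclusive scan yields (0, 1, 3, 6):
+//
+//   viennacl::vector<float> in(4), out(4);
+//   // ... fill 'in' ...
+//   viennacl::linalg::opencl::inclusive_scan(in, out);
+//   viennacl::linalg::opencl::exclusive_scan(in, out);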
+
+
+} //namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/power_iter.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/power_iter.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/power_iter.hpp
new file mode 100644
index 0000000..9721517
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/power_iter.hpp
@@ -0,0 +1,129 @@
+#ifndef VIENNACL_LINALG_POWER_ITER_HPP_
+#define VIENNACL_LINALG_POWER_ITER_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/power_iter.hpp
+ @brief Defines a tag for the configuration of the power iteration method.
+
+ Contributed by Astrid Rupp.
+*/
+
+#include <cmath>
+#include <vector>
+#include "viennacl/linalg/bisect.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+
+namespace viennacl
+{
+ namespace linalg
+ {
+ /** @brief A tag for the power iteration algorithm. */
+ class power_iter_tag
+ {
+ public:
+
+ /** @brief The constructor
+ *
+ * @param tfac If the eigenvalue does not change more than this termination factor, the algorithm stops
+ * @param max_iters Maximum number of iterations for the power iteration
+ */
+ power_iter_tag(double tfac = 1e-8, vcl_size_t max_iters = 50000) : termination_factor_(tfac), max_iterations_(max_iters) {}
+
+ /** @brief Sets the factor for termination */
+ void factor(double fct){ termination_factor_ = fct; }
+
+ /** @brief Returns the factor for termination */
+ double factor() const { return termination_factor_; }
+
+ vcl_size_t max_iterations() const { return max_iterations_; }
+ void max_iterations(vcl_size_t new_max) { max_iterations_ = new_max; }
+
+ private:
+ double termination_factor_;
+ vcl_size_t max_iterations_;
+
+ };
+
+ /**
+ * @brief Implementation of the calculation of the largest eigenvalue (in modulus) and the associated eigenvector using power iteration
+ *
+ * @param A The system matrix
+ * @param tag Tag with termination factor
+ * @param eigenvec Vector which holds the associated eigenvector once the routine completes
+ * @return Returns the largest eigenvalue computed by the power iteration method
+ */
+ template<typename MatrixT, typename VectorT >
+ typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type
+ eig(MatrixT const& A, power_iter_tag const & tag, VectorT & eigenvec)
+ {
+
+ typedef typename viennacl::result_of::value_type<MatrixT>::type ScalarType;
+ typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type CPU_ScalarType;
+
+ vcl_size_t matrix_size = A.size1();
+ VectorT r(eigenvec);
+ std::vector<CPU_ScalarType> s(matrix_size);
+
+ for (vcl_size_t i=0; i<s.size(); ++i)
+ s[i] = CPU_ScalarType(i % 3) * CPU_ScalarType(0.1234) - CPU_ScalarType(0.5); //'random' starting vector
+
+ detail::copy_vec_to_vec(s, eigenvec);
+
+ double epsilon = tag.factor();
+ CPU_ScalarType norm = norm_2(eigenvec);
+ CPU_ScalarType norm_prev = 0;
+ long numiter = 0;
+
+ for (vcl_size_t i=0; i<tag.max_iterations(); ++i)
+ {
+ if (std::fabs(norm - norm_prev) / std::fabs(norm) < epsilon)
+ break;
+
+ eigenvec /= norm;
+ r = viennacl::linalg::prod(A, eigenvec); //using helper vector r for the computation of x <- A * x in order to avoid the repeated creation of temporaries
+ eigenvec = r;
+ norm_prev = norm;
+ norm = norm_2(eigenvec);
+ numiter++;
+ }
+
+ return norm;
+ }
+
+ /**
+ * @brief Implementation of the calculation of the largest eigenvalue (in modulus) using power iteration. Does not return the eigenvector.
+ *
+ * @param A The system matrix
+ * @param tag Tag with termination factor
+ * @return Returns the largest eigenvalue computed by the power iteration method
+ */
+ template< typename MatrixT >
+ typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type
+ eig(MatrixT const& A, power_iter_tag const & tag)
+ {
+ typedef typename viennacl::result_of::vector_for_matrix<MatrixT>::type VectorT;
+
+ VectorT eigenvec(A.size1());
+ return eig(A, tag, eigenvec);
+ }
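+ // Usage sketch (illustrative):
+ //
+ //   viennacl::matrix<float> A(100, 100);                   // system matrix, filled elsewhere
+ //   viennacl::vector<float> v(100);
+ //   viennacl::linalg::power_iter_tag tag(1e-8, 100000);
+ //   float lambda  = viennacl::linalg::eig(A, tag);         // largest eigenvalue in modulus
+ //   float lambda2 = viennacl::linalg::eig(A, tag, v);      // same value, eigenvector returned in v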
+
+ } // end namespace linalg
+} // end namespace viennacl
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/prod.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/prod.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/prod.hpp
new file mode 100644
index 0000000..af041dc
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/prod.hpp
@@ -0,0 +1,370 @@
+#ifndef VIENNACL_LINALG_PROD_HPP_
+#define VIENNACL_LINALG_PROD_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/prod.hpp
+ @brief Generic interface for matrix-vector and matrix-matrix products.
+ See viennacl/linalg/vector_operations.hpp, viennacl/linalg/matrix_operations.hpp, and
+ viennacl/linalg/sparse_matrix_operations.hpp for implementations.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/tag_of.hpp"
+#include <vector>
+#include <map>
+
+namespace viennacl
+{
+ //
+ // generic prod function
+ // uses tag dispatch to identify which algorithm
+ // should be called
+ //
+ namespace linalg
+ {
+ #ifdef VIENNACL_WITH_MTL4
+ // ----------------------------------------------------
+ // mtl4
+ //
+ template< typename MatrixT, typename VectorT >
+ typename viennacl::enable_if< viennacl::is_mtl4< typename viennacl::traits::tag_of< MatrixT >::type >::value,
+ VectorT>::type
+ prod(MatrixT const& matrix, VectorT const& vector)
+ {
+ return VectorT(matrix * vector);
+ }
+ #endif
+
+ #ifdef VIENNACL_WITH_ARMADILLO
+ // ----------------------------------------------------
+ // Armadillo
+ //
+ template<typename NumericT, typename VectorT>
+ VectorT prod(arma::SpMat<NumericT> const& A, VectorT const& vector)
+ {
+ return A * vector;
+ }
+ #endif
+
+ #ifdef VIENNACL_WITH_EIGEN
+ // ----------------------------------------------------
+ // Eigen
+ //
+ template< typename MatrixT, typename VectorT >
+ typename viennacl::enable_if< viennacl::is_eigen< typename viennacl::traits::tag_of< MatrixT >::type >::value,
+ VectorT>::type
+ prod(MatrixT const& matrix, VectorT const& vector)
+ {
+ return matrix * vector;
+ }
+ #endif
+
+ #ifdef VIENNACL_WITH_UBLAS
+ // ----------------------------------------------------
+ // UBLAS
+ //
+ template< typename MatrixT, typename VectorT >
+ typename viennacl::enable_if< viennacl::is_ublas< typename viennacl::traits::tag_of< MatrixT >::type >::value,
+ VectorT>::type
+ prod(MatrixT const& matrix, VectorT const& vector)
+ {
+ // std::cout << "ublas .. " << std::endl;
+ return boost::numeric::ublas::prod(matrix, vector);
+ }
+ #endif
+
+
+ // ----------------------------------------------------
+ // STL type
+ //
+
+ // dense matrix-vector product:
+ template< typename T, typename A1, typename A2, typename VectorT >
+ VectorT
+ prod(std::vector< std::vector<T, A1>, A2 > const & matrix, VectorT const& vector)
+ {
+ VectorT result(matrix.size());
+ for (typename std::vector<T, A1>::size_type i=0; i<matrix.size(); ++i)
+ {
+ result[i] = 0; //we will not assume that VectorT is initialized to zero
+ for (typename std::vector<T, A1>::size_type j=0; j<matrix[i].size(); ++j)
+ result[i] += matrix[i][j] * vector[j];
+ }
+ return result;
+ }
+
+ // sparse matrix-vector product:
+ template< typename KEY, typename DATA, typename COMPARE, typename AMAP, typename AVEC, typename VectorT >
+ VectorT
+ prod(std::vector< std::map<KEY, DATA, COMPARE, AMAP>, AVEC > const& matrix, VectorT const& vector)
+ {
+ typedef std::vector< std::map<KEY, DATA, COMPARE, AMAP>, AVEC > MatrixType;
+
+ VectorT result(matrix.size());
+ for (typename MatrixType::size_type i=0; i<matrix.size(); ++i)
+ {
+ result[i] = 0; //we will not assume that VectorT is initialized to zero
+ for (typename std::map<KEY, DATA, COMPARE, AMAP>::const_iterator row_entries = matrix[i].begin();
+ row_entries != matrix[i].end();
+ ++row_entries)
+ result[i] += row_entries->second * vector[row_entries->first];
+ }
+ return result;
+ }
+
+
+ /*template< typename MatrixT, typename VectorT >
+ VectorT
+ prod(MatrixT const& matrix, VectorT const& vector,
+ typename viennacl::enable_if< viennacl::is_stl< typename viennacl::traits::tag_of< MatrixT >::type >::value
+ >::type* dummy = 0)
+ {
+ // std::cout << "std .. " << std::endl;
+ return prod_impl(matrix, vector);
+ }*/
+
+ // ----------------------------------------------------
+ // VIENNACL
+ //
+
+ // standard product:
+ template<typename NumericT>
+ viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
+ const viennacl::matrix_base<NumericT>,
+ viennacl::op_mat_mat_prod >
+ prod(viennacl::matrix_base<NumericT> const & A,
+ viennacl::matrix_base<NumericT> const & B)
+ {
+ return viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
+ const viennacl::matrix_base<NumericT>,
+ viennacl::op_mat_mat_prod >(A, B);
+ }
+
+ // right factor is a matrix expression:
+ template<typename NumericT, typename LhsT, typename RhsT, typename OpT>
+ viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
+ const viennacl::matrix_expression<const LhsT, const RhsT, OpT>,
+ viennacl::op_mat_mat_prod >
+ prod(viennacl::matrix_base<NumericT> const & A,
+ viennacl::matrix_expression<const LhsT, const RhsT, OpT> const & B)
+ {
+ return viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
+ const viennacl::matrix_expression<const LhsT, const RhsT, OpT>,
+ viennacl::op_mat_mat_prod >(A, B);
+ }
+
+ // left factor is a matrix expression:
+ template<typename LhsT, typename RhsT, typename OpT, typename NumericT>
+ viennacl::matrix_expression< const viennacl::matrix_expression<const LhsT, const RhsT, OpT>,
+ const viennacl::matrix_base<NumericT>,
+ viennacl::op_mat_mat_prod >
+ prod(viennacl::matrix_expression<const LhsT, const RhsT, OpT> const & A,
+ viennacl::matrix_base<NumericT> const & B)
+ {
+ return viennacl::matrix_expression< const viennacl::matrix_expression<const LhsT, const RhsT, OpT>,
+ const viennacl::matrix_base<NumericT>,
+ viennacl::op_mat_mat_prod >(A, B);
+ }
+
+
+ // both factors transposed:
+ template<typename LhsT1, typename RhsT1, typename OpT1,
+ typename LhsT2, typename RhsT2, typename OpT2>
+ viennacl::matrix_expression< const viennacl::matrix_expression<const LhsT1, const RhsT1, OpT1>,
+ const viennacl::matrix_expression<const LhsT2, const RhsT2, OpT2>,
+ viennacl::op_mat_mat_prod >
+ prod(viennacl::matrix_expression<const LhsT1, const RhsT1, OpT1> const & A,
+ viennacl::matrix_expression<const LhsT2, const RhsT2, OpT2> const & B)
+ {
+ return viennacl::matrix_expression< const viennacl::matrix_expression<const LhsT1, const RhsT1, OpT1>,
+ const viennacl::matrix_expression<const LhsT2, const RhsT2, OpT2>,
+ viennacl::op_mat_mat_prod >(A, B);
+ }
+
+
+
+ // matrix-vector product
+ template< typename NumericT>
+ viennacl::vector_expression< const viennacl::matrix_base<NumericT>,
+ const viennacl::vector_base<NumericT>,
+ viennacl::op_prod >
+ prod(viennacl::matrix_base<NumericT> const & A,
+ viennacl::vector_base<NumericT> const & x)
+ {
+ return viennacl::vector_expression< const viennacl::matrix_base<NumericT>,
+ const viennacl::vector_base<NumericT>,
+ viennacl::op_prod >(A, x);
+ }
+
+ // matrix-vector product (resolve ambiguity)
+ template<typename NumericT, typename F>
+ viennacl::vector_expression< const viennacl::matrix_base<NumericT>,
+ const viennacl::vector_base<NumericT>,
+ viennacl::op_prod >
+ prod(viennacl::matrix<NumericT, F> const & A,
+ viennacl::vector_base<NumericT> const & x)
+ {
+ return viennacl::vector_expression< const viennacl::matrix_base<NumericT>,
+ const viennacl::vector_base<NumericT>,
+ viennacl::op_prod >(A, x);
+ }
+
+ // matrix-vector product (resolve ambiguity)
+ template<typename MatrixT, typename NumericT>
+ viennacl::vector_expression< const viennacl::matrix_base<NumericT>,
+ const viennacl::vector_base<NumericT>,
+ viennacl::op_prod >
+ prod(viennacl::matrix_range<MatrixT> const & A,
+ viennacl::vector_base<NumericT> const & x)
+ {
+ return viennacl::vector_expression< const viennacl::matrix_base<NumericT>,
+ const viennacl::vector_base<NumericT>,
+ viennacl::op_prod >(A, x);
+ }
+
+ // matrix-vector product (resolve ambiguity)
+ template<typename MatrixT, typename NumericT>
+ viennacl::vector_expression< const viennacl::matrix_base<NumericT>,
+ const viennacl::vector_base<NumericT>,
+ viennacl::op_prod >
+ prod(viennacl::matrix_slice<MatrixT> const & A,
+ viennacl::vector_base<NumericT> const & x)
+ {
+ return viennacl::vector_expression< const viennacl::matrix_base<NumericT>,
+ const viennacl::vector_base<NumericT>,
+ viennacl::op_prod >(A, x);
+ }
+
+ // matrix-vector product with matrix expression (including transpose)
+ template< typename NumericT, typename LhsT, typename RhsT, typename OpT>
+ viennacl::vector_expression< const viennacl::matrix_expression<const LhsT, const RhsT, OpT>,
+ const viennacl::vector_base<NumericT>,
+ viennacl::op_prod >
+ prod(viennacl::matrix_expression<const LhsT, const RhsT, OpT> const & A,
+ viennacl::vector_base<NumericT> const & x)
+ {
+ return viennacl::vector_expression< const viennacl::matrix_expression<const LhsT, const RhsT, OpT>,
+ const viennacl::vector_base<NumericT>,
+ viennacl::op_prod >(A, x);
+ }
+
+
+ // matrix-vector product with vector expression
+ template< typename NumericT, typename LhsT, typename RhsT, typename OpT>
+ viennacl::vector_expression< const viennacl::matrix_base<NumericT>,
+ const viennacl::vector_expression<const LhsT, const RhsT, OpT>,
+ viennacl::op_prod >
+ prod(viennacl::matrix_base<NumericT> const & A,
+ viennacl::vector_expression<const LhsT, const RhsT, OpT> const & x)
+ {
+ return viennacl::vector_expression< const viennacl::matrix_base<NumericT>,
+ const viennacl::vector_expression<const LhsT, const RhsT, OpT>,
+ viennacl::op_prod >(A, x);
+ }
+
+
+ // matrix-vector product with matrix expression (including transpose) and vector expression
+ template<typename LhsT1, typename RhsT1, typename OpT1,
+ typename LhsT2, typename RhsT2, typename OpT2>
+ viennacl::vector_expression< const viennacl::matrix_expression<const LhsT1, const RhsT1, OpT1>,
+ const viennacl::vector_expression<const LhsT2, const RhsT2, OpT2>,
+ viennacl::op_prod >
+ prod(viennacl::matrix_expression<const LhsT1, const RhsT1, OpT1> const & A,
+ viennacl::vector_expression<const LhsT2, const RhsT2, OpT2> const & x)
+ {
+ return viennacl::vector_expression< const viennacl::matrix_expression<const LhsT1, const RhsT1, OpT1>,
+ const viennacl::vector_expression<const LhsT2, const RhsT2, OpT2>,
+ viennacl::op_prod >(A, x);
+ }
+
+
+
+
+ template< typename SparseMatrixType, typename SCALARTYPE>
+ typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value,
+ viennacl::matrix_expression<const SparseMatrixType,
+ const matrix_base <SCALARTYPE>,
+ op_prod >
+ >::type
+ prod(const SparseMatrixType & sp_mat,
+ const viennacl::matrix_base<SCALARTYPE> & d_mat)
+ {
+ return viennacl::matrix_expression<const SparseMatrixType,
+ const viennacl::matrix_base<SCALARTYPE>,
+ op_prod >(sp_mat, d_mat);
+ }
+
+ // right factor is transposed
+ template< typename SparseMatrixType, typename SCALARTYPE>
+ typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value,
+ viennacl::matrix_expression< const SparseMatrixType,
+ const viennacl::matrix_expression<const viennacl::matrix_base<SCALARTYPE>,
+ const viennacl::matrix_base<SCALARTYPE>,
+ op_trans>,
+ viennacl::op_prod >
+ >::type
+ prod(const SparseMatrixType & A,
+ viennacl::matrix_expression<const viennacl::matrix_base<SCALARTYPE>,
+ const viennacl::matrix_base<SCALARTYPE>,
+ op_trans> const & B)
+ {
+ return viennacl::matrix_expression< const SparseMatrixType,
+ const viennacl::matrix_expression<const viennacl::matrix_base<SCALARTYPE>,
+ const viennacl::matrix_base<SCALARTYPE>,
+ op_trans>,
+ viennacl::op_prod >(A, B);
+ }
+
+
+ /** @brief Sparse matrix-matrix product with compressed_matrix objects */
+ template<typename NumericT>
+ viennacl::matrix_expression<const compressed_matrix<NumericT>,
+ const compressed_matrix<NumericT>,
+ op_prod >
+ prod(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> const & B)
+ {
+ return viennacl::matrix_expression<const compressed_matrix<NumericT>,
+ const compressed_matrix<NumericT>,
+ op_prod >(A, B);
+ }
+
+ /** @brief Generic matrix-vector product with user-provided sparse matrix type */
+ template<typename SparseMatrixType, typename NumericT>
+ vector_expression<const SparseMatrixType,
+ const vector_base<NumericT>,
+ op_prod >
+ prod(const SparseMatrixType & A,
+ const vector_base<NumericT> & x)
+ {
+ return vector_expression<const SparseMatrixType,
+ const vector_base<NumericT>,
+ op_prod >(A, x);
+ }
+
+ } // end namespace linalg
+} // end namespace viennacl
+#endif
+
+
+
+
+
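To illustrate how the generic prod() interface above dispatches over host containers and ViennaCL expression types, a short sketch; all sizes and values are illustrative only.

#include <vector>
#include "viennacl/matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/prod.hpp"

int main()
{
  // STL fallback: dense matrix-vector product computed on the host
  std::vector<std::vector<double> > A(2, std::vector<double>(2, 1.0));
  std::vector<double> x(2, 1.0);
  std::vector<double> y = viennacl::linalg::prod(A, x);   // y[i] = sum_j A[i][j] * x[j]

  // ViennaCL types: prod() returns an expression object, evaluated on assignment
  viennacl::matrix<double> vA(2, 2), vB(2, 2), vC(2, 2);
  viennacl::vector<double> vx(2), vy(2);
  vC = viennacl::linalg::prod(vA, vB);   // matrix-matrix product
  vy = viennacl::linalg::prod(vA, vx);   // matrix-vector product

  (void)y;
  return 0;
}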
[24/51] [partial] mahout git commit: (nojira) add native-viennaCL
module to codebase. closes apache/mahout#241
Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai.hpp
new file mode 100644
index 0000000..dba094b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai.hpp
@@ -0,0 +1,832 @@
+#ifndef VIENNACL_LINALG_DETAIL_SPAI_SPAI_HPP
+#define VIENNACL_LINALG_DETAIL_SPAI_SPAI_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/spai/spai.hpp
+ @brief Main implementation of SPAI (not FSPAI). Experimental.
+*/
+
+#include <utility>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <algorithm>
+#include <vector>
+#include <math.h>
+#include <map>
+
+//local includes
+#include "viennacl/linalg/detail/spai/spai_tag.hpp"
+#include "viennacl/linalg/qr.hpp"
+#include "viennacl/linalg/detail/spai/spai-dynamic.hpp"
+#include "viennacl/linalg/detail/spai/spai-static.hpp"
+#include "viennacl/linalg/detail/spai/sparse_vector.hpp"
+#include "viennacl/linalg/detail/spai/block_matrix.hpp"
+#include "viennacl/linalg/detail/spai/block_vector.hpp"
+
+//boost includes
+#include "boost/numeric/ublas/vector.hpp"
+#include "boost/numeric/ublas/matrix.hpp"
+#include "boost/numeric/ublas/matrix_proxy.hpp"
+#include "boost/numeric/ublas/vector_proxy.hpp"
+#include "boost/numeric/ublas/storage.hpp"
+#include "boost/numeric/ublas/io.hpp"
+#include "boost/numeric/ublas/lu.hpp"
+#include "boost/numeric/ublas/triangular.hpp"
+#include "boost/numeric/ublas/matrix_expression.hpp"
+
+// ViennaCL includes
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+#include "viennacl/linalg/matrix_operations.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/ilu.hpp"
+#include "viennacl/ocl/backend.hpp"
+#include "viennacl/linalg/opencl/kernels/spai.hpp"
+
+
+
+#define VIENNACL_SPAI_K_b 20
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace spai
+{
+
+//debug function for print
+template<typename SparseVectorT>
+void print_sparse_vector(SparseVectorT const & v)
+{
+ for (typename SparseVectorT::const_iterator vec_it = v.begin(); vec_it!= v.end(); ++vec_it)
+ std::cout << "[ " << vec_it->first << " ]:" << vec_it->second << std::endl;
+}
+
+template<typename DenseMatrixT>
+void print_matrix(DenseMatrixT & m)
+{
+ for (vcl_size_t i = 0; i < m.size2(); ++i)
+ {
+ for (vcl_size_t j = 0; j < m.size1(); ++j)
+ std::cout<<m(j, i)<<" ";
+ std::cout<<std::endl;
+ }
+}
+
+/** @brief Accumulates a scaled sparse vector: res_v += b*v
+ *
+ * @param v initial sparse vector
+ * @param b scalar
+ * @param res_v output vector
+ */
+template<typename SparseVectorT, typename NumericT>
+void add_sparse_vectors(SparseVectorT const & v, NumericT b, SparseVectorT & res_v)
+{
+ for (typename SparseVectorT::const_iterator v_it = v.begin(); v_it != v.end(); ++v_it)
+ res_v[v_it->first] += b*v_it->second;
+}
+
+//sparse-matrix - vector product
+/** @brief Computation of residual res = A*v - e
+ *
+ * @param A_v_c column major vectorized input sparse matrix
+ * @param v sparse vector, in this case new column of preconditioner matrix
+ * @param ind index for current column
+ * @param res residual
+ */
+template<typename SparseVectorT, typename NumericT>
+void compute_spai_residual(std::vector<SparseVectorT> const & A_v_c,
+ SparseVectorT const & v,
+ unsigned int ind,
+ SparseVectorT & res)
+{
+ for (typename SparseVectorT::const_iterator v_it = v.begin(); v_it != v.end(); ++v_it)
+ add_sparse_vectors(A_v_c[v_it->first], v_it->second, res);
+
+ res[ind] -= NumericT(1);
+}
+
+/** @brief Setting up index set of columns and rows for certain column
+ *
+ * @param A_v_c column major vectorized initial sparse matrix
+ * @param v current column of preconditioner matrix
+ * @param J set of column indices
+ * @param I set of row indices
+ */
+template<typename SparseVectorT>
+void build_index_set(std::vector<SparseVectorT> const & A_v_c,
+ SparseVectorT const & v,
+ std::vector<unsigned int> & J,
+ std::vector<unsigned int> & I)
+{
+ buildColumnIndexSet(v, J);
+ projectRows(A_v_c, J, I);
+}
+
+/** @brief Initializes a dense matrix from a sparse one
+ *
+ * @param A_in Original sparse matrix
+ * @param J Set of column indices
+ * @param I Set of row indices
+ * @param A_out dense matrix output
+ */
+template<typename SparseMatrixT, typename DenseMatrixT>
+void initProjectSubMatrix(SparseMatrixT const & A_in,
+ std::vector<unsigned int> const & J,
+ std::vector<unsigned int> & I,
+ DenseMatrixT & A_out)
+{
+ A_out.resize(I.size(), J.size(), false);
+ for (vcl_size_t j = 0; j < J.size(); ++j)
+ for (vcl_size_t i = 0; i < I.size(); ++i)
+ A_out(i,j) = A_in(I[i],J[j]);
+}
+
+
+/************************************************** CPU BLOCK SET UP ***************************************/
+
+/** @brief Setting up blocks and QR factorizing them on CPU
+ *
+ * @param A initial sparse matrix
+ * @param A_v_c column major vectorized initial sparse matrix
+ * @param M_v initialized preconditioner
+ * @param g_I container of row indices
+ * @param g_J container of column indices
+ * @param g_A_I_J container of dense matrices -> R matrices after QR factorization
+ * @param g_b_v container of vectors beta, necessary for Q recovery
+ */
+template<typename SparseMatrixT, typename DenseMatrixT, typename SparseVectorT, typename VectorT>
+void block_set_up(SparseMatrixT const & A,
+ std::vector<SparseVectorT> const & A_v_c,
+ std::vector<SparseVectorT> const & M_v,
+ std::vector<std::vector<unsigned int> >& g_I,
+ std::vector<std::vector<unsigned int> >& g_J,
+ std::vector<DenseMatrixT>& g_A_I_J,
+ std::vector<VectorT>& g_b_v)
+{
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i2 = 0; i2 < static_cast<long>(M_v.size()); ++i2)
+ {
+ vcl_size_t i = static_cast<vcl_size_t>(i2);
+ build_index_set(A_v_c, M_v[i], g_J[i], g_I[i]);
+ initProjectSubMatrix(A, g_J[i], g_I[i], g_A_I_J[i]);
+ //print_matrix(g_A_I_J[i]);
+ single_qr(g_A_I_J[i], g_b_v[i]);
+ //print_matrix(g_A_I_J[i]);
+ }
+}
+
+/** @brief Setting up index set of columns and rows for all columns
+ *
+ * @param A_v_c column major vectorized initial sparse matrix
+ * @param M_v initialized preconditioner
+ * @param g_J container of column indices
+ * @param g_I container of row indices
+ */
+template<typename SparseVectorT>
+void index_set_up(std::vector<SparseVectorT> const & A_v_c,
+ std::vector<SparseVectorT> const & M_v,
+ std::vector<std::vector<unsigned int> > & g_J,
+ std::vector<std::vector<unsigned int> > & g_I)
+{
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i2 = 0; i2 < static_cast<long>(M_v.size()); ++i2)
+ {
+ vcl_size_t i = static_cast<vcl_size_t>(i2);
+ build_index_set(A_v_c, M_v[i], g_J[i], g_I[i]);
+ }
+}
+
+/************************************************** GPU BLOCK SET UP ***************************************/
+
+/** @brief Setting up blocks and QR factorizing them on GPU
+ *
+ * @param A initial sparse matrix
+ * @param A_v_c column major vectorized initial sparse matrix
+ * @param M_v initialized preconditioner
+ * @param g_is_update container that indicates which blocks are active
+ * @param g_I container of row indices
+ * @param g_J container of column indices
+ * @param g_A_I_J container of dense matrices -> R matrices after QR factorization
+ * @param g_bv container of vectors beta, necessary for Q recovery
+ */
+template<typename NumericT, unsigned int AlignmentV, typename SparseVectorT>
+void block_set_up(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
+ std::vector<SparseVectorT> const & A_v_c,
+ std::vector<SparseVectorT> const & M_v,
+ std::vector<cl_uint> g_is_update,
+ std::vector<std::vector<unsigned int> > & g_I,
+ std::vector<std::vector<unsigned int> > & g_J,
+ block_matrix & g_A_I_J,
+ block_vector & g_bv)
+{
+ viennacl::context ctx = viennacl::traits::context(A);
+ bool is_empty_block;
+
+ //build index set
+ index_set_up(A_v_c, M_v, g_J, g_I);
+ block_assembly(A, g_J, g_I, g_A_I_J, g_is_update, is_empty_block);
+ block_qr<NumericT>(g_I, g_J, g_A_I_J, g_bv, g_is_update, ctx);
+}
+
+
+/***************************************************************************************************/
+/******************************** SOLVING LS PROBLEMS ON GPU ***************************************/
+/***************************************************************************************************/
+
+/** @brief Extraction of the sparse vector m for a particular column from m_in - a contiguous vector for all columns
+ *
+ * @param m_in contiguous sparse vector for all columns
+ * @param start_m_ind start index of particular vector
+ * @param J column index set
+ * @param m sparse vector for particular column
+ */
+template<typename NumericT, typename SparseVectorT>
+void custom_fan_out(std::vector<NumericT> const & m_in,
+ unsigned int start_m_ind,
+ std::vector<unsigned int> const & J,
+ SparseVectorT & m)
+{
+ unsigned int cnt = 0;
+ for (vcl_size_t i = 0; i < J.size(); ++i)
+ m[J[i]] = m_in[start_m_ind + cnt++];
+}
+
+
+
+//GPU based least square problem
+/** @brief Solution of the least-squares problems on GPU
+ *
+ * @param A_v_c column-major vectorized initial sparse matrix
+ * @param M_v column-major vectorized sparse preconditioner matrix
+ * @param g_I container of row set indices
+ * @param g_J container of column set indices
+ * @param g_A_I_J_vcl contiguous matrix that consists of blocks A(I_k, J_k)
+ * @param g_bv_vcl contiguous vector that consists of betas, necessary for Q recovery
+ * @param g_res container of residuals
+ * @param g_is_update container with indicators which blocks are active
+ * @param tag spai tag
+ * @param ctx Optional context in which the auxiliary data is created (one out of multiple OpenCL contexts, CUDA, host)
+ */
+template<typename SparseVectorT, typename NumericT>
+void least_square_solve(std::vector<SparseVectorT> & A_v_c,
+ std::vector<SparseVectorT> & M_v,
+ std::vector<std::vector<unsigned int> >& g_I,
+ std::vector<std::vector<unsigned int> > & g_J,
+ block_matrix & g_A_I_J_vcl,
+ block_vector & g_bv_vcl,
+ std::vector<SparseVectorT> & g_res,
+ std::vector<cl_uint> & g_is_update,
+ const spai_tag & tag,
+ viennacl::context ctx)
+{
+ viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
+ unsigned int y_sz, m_sz;
+ std::vector<cl_uint> y_inds(M_v.size() + 1, static_cast<cl_uint>(0));
+ std::vector<cl_uint> m_inds(M_v.size() + 1, static_cast<cl_uint>(0));
+
+ get_size(g_I, y_sz);
+ init_start_inds(g_I, y_inds);
+ init_start_inds(g_J, m_inds);
+
+ //create y_v
+ std::vector<NumericT> y_v(y_sz, NumericT(0));
+ for (vcl_size_t i = 0; i < M_v.size(); ++i)
+ {
+ for (vcl_size_t j = 0; j < g_I[i].size(); ++j)
+ {
+ if (g_I[i][j] == i)
+ y_v[y_inds[i] + j] = NumericT(1.0);
+ }
+ }
+ //compute m_v
+ get_size(g_J, m_sz);
+ std::vector<NumericT> m_v(m_sz, NumericT(0));
+
+ block_vector y_v_vcl;
+ block_vector m_v_vcl;
+ //preparing memory for the least-squares problem on GPU
+ y_v_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+ static_cast<unsigned int>(sizeof(NumericT)*y_v.size()),
+ &(y_v[0]));
+ m_v_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+ static_cast<unsigned int>(sizeof(NumericT)*m_v.size()),
+ &(m_v[0]));
+ y_v_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+ static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
+ &(y_inds[0]));
+ viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+ static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
+ &(g_is_update[0]));
+ viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx);
+ viennacl::ocl::kernel & ls_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_least_squares");
+ ls_kernel.local_work_size(0, 1);
+ ls_kernel.global_work_size(0, 256);
+ viennacl::ocl::enqueue(ls_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_bv_vcl.handle(), g_bv_vcl.handle1(), m_v_vcl.handle(),
+ y_v_vcl.handle(), y_v_vcl.handle1(),
+ g_A_I_J_vcl.handle1(), g_is_update_vcl,
+ //viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(ScalarType)*(local_r_n*local_c_n))),
+ static_cast<unsigned int>(M_v.size())));
+ //copy vector m_v back from GPU to CPU
+ cl_int vcl_err = clEnqueueReadBuffer(opencl_ctx.get_queue().handle().get(),
+ m_v_vcl.handle().get(), CL_TRUE, 0,
+ sizeof(NumericT)*(m_v.size()),
+ &(m_v[0]), 0, NULL, NULL);
+ VIENNACL_ERR_CHECK(vcl_err);
+
+ //fan out vector in parallel
+ //#pragma omp parallel for
+ for (long i = 0; i < static_cast<long>(M_v.size()); ++i)
+ {
+ if (g_is_update[static_cast<vcl_size_t>(i)])
+ {
+ //fanned out onto sparse vector
+ custom_fan_out(m_v, m_inds[static_cast<vcl_size_t>(i)], g_J[static_cast<vcl_size_t>(i)], M_v[static_cast<vcl_size_t>(i)]);
+ g_res[static_cast<vcl_size_t>(i)].clear();
+ compute_spai_residual<SparseVectorT, NumericT>(A_v_c, M_v[static_cast<vcl_size_t>(i)], static_cast<unsigned int>(i), g_res[static_cast<vcl_size_t>(i)]);
+ NumericT res_norm = 0;
+ //compute norm of res - just to make sure that this implementation works correctly
+ sparse_norm_2(g_res[static_cast<vcl_size_t>(i)], res_norm);
+ //std::cout<<"Residual norm of column #: "<<i<<std::endl;
+ //std::cout<<res_norm<<std::endl;
+ //std::cout<<"************************"<<std::endl;
+ g_is_update[static_cast<vcl_size_t>(i)] = ((res_norm > tag.getResidualNormThreshold()) && !tag.getIsStatic()) ? 1 : 0;
+ }
+ }
+}
+
+//CPU based least square problems
+/** @brief Solution of the least-squares problems on CPU
+ *
+ * @param A_v_c column-major vectorized initial sparse matrix
+ * @param g_R blocks for least square solution
+ * @param g_b_v vectors beta, necessary for Q recovery
+ * @param g_I container of row index set for all columns of matrix M
+ * @param g_J container of column index set for all columns of matrix M
+ * @param g_res container of residuals
+ * @param g_is_update container with indicators which blocks are active
+ * @param M_v column-major vectorized sparse matrix, final preconditioner
+ * @param tag spai tag
+ */
+template<typename SparseVectorT, typename DenseMatrixT, typename VectorT>
+void least_square_solve(std::vector<SparseVectorT> const & A_v_c,
+ std::vector<DenseMatrixT> & g_R,
+ std::vector<VectorT> & g_b_v,
+ std::vector<std::vector<unsigned int> > & g_I,
+ std::vector<std::vector<unsigned int> > & g_J,
+ std::vector<SparseVectorT> & g_res,
+ std::vector<bool> & g_is_update,
+ std::vector<SparseVectorT> & M_v,
+ spai_tag const & tag)
+{
+ typedef typename DenseMatrixT::value_type NumericType;
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i2 = 0; i2 < static_cast<long>(M_v.size()); ++i2)
+ {
+ vcl_size_t i = static_cast<vcl_size_t>(i2);
+ if (g_is_update[i])
+ {
+ VectorT y = boost::numeric::ublas::zero_vector<NumericType>(g_I[i].size());
+
+ projectI<VectorT, NumericType>(g_I[i], y, static_cast<unsigned int>(tag.getBegInd() + long(i)));
+ apply_q_trans_vec(g_R[i], g_b_v[i], y);
+
+ VectorT m_new = boost::numeric::ublas::zero_vector<NumericType>(g_R[i].size2());
+ backwardSolve(g_R[i], y, m_new);
+ fanOutVector(m_new, g_J[i], M_v[i]);
+ g_res[i].clear();
+
+ compute_spai_residual<SparseVectorT, NumericType>(A_v_c, M_v[i], static_cast<unsigned int>(tag.getBegInd() + long(i)), g_res[i]);
+
+ NumericType res_norm = 0;
+ sparse_norm_2(g_res[i], res_norm);
+// std::cout<<"Residual norm of column #: "<<i<<std::endl;
+// std::cout<<res_norm<<std::endl;
+// std::cout<<"************************"<<std::endl;
+ g_is_update[i] = (res_norm > tag.getResidualNormThreshold()) && !tag.getIsStatic();
+ }
+ }
+}
+
+//************************************ UPDATE CHECK ***************************************************//
+
+template<typename VectorType>
+bool is_all_update(VectorType& parallel_is_update)
+{
+ for (unsigned int i = 0; i < parallel_is_update.size(); ++i)
+ {
+ if (parallel_is_update[i])
+ return true;
+ }
+ return false;
+}
+
+//********************************** MATRIX VECTORIZATION ***********************************************//
+
+//Matrix vectorization, column based approach
+/** @brief Vectorization of a sparse matrix into an array of sparse column vectors
+ *
+ * @param M_in input sparse, boost::numeric::ublas::compressed_matrix
+ * @param M_v array of sparse vectors
+ */
+template<typename SparseMatrixT, typename SparseVectorT>
+void vectorize_column_matrix(SparseMatrixT const & M_in,
+ std::vector<SparseVectorT> & M_v)
+{
+ for (typename SparseMatrixT::const_iterator1 row_it = M_in.begin1(); row_it!= M_in.end1(); ++row_it)
+ for (typename SparseMatrixT::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
+ M_v[static_cast<unsigned int>(col_it.index2())][static_cast<unsigned int>(col_it.index1())] = *col_it;
+}
+
+//Matrix vectorization row based approach
+template<typename SparseMatrixT, typename SparseVectorT>
+void vectorize_row_matrix(SparseMatrixT const & M_in,
+ std::vector<SparseVectorT> & M_v)
+{
+ for (typename SparseMatrixT::const_iterator1 row_it = M_in.begin1(); row_it!= M_in.end1(); ++row_it)
+ for (typename SparseMatrixT::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
+ M_v[static_cast<unsigned int>(col_it.index1())][static_cast<unsigned int>(col_it.index2())] = *col_it;
+}
+
+//************************************* BLOCK ASSEMBLY CODE *********************************************//
+
+
+template<typename SizeT>
+void write_set_to_array(std::vector<std::vector<SizeT> > const & ind_set,
+ std::vector<cl_uint> & a)
+{
+ vcl_size_t cnt = 0;
+
+ for (vcl_size_t i = 0; i < ind_set.size(); ++i)
+ for (vcl_size_t j = 0; j < ind_set[i].size(); ++j)
+ a[cnt++] = static_cast<cl_uint>(ind_set[i][j]);
+}
+
+
+
+//assembling blocks on GPU
+/** @brief Assembly of blocks on GPU from a given set of row indices g_I and column indices g_J
+ *
+ * @param A initial sparse matrix
+ * @param g_J container of column index set
+ * @param g_I container of row index set
+ * @param g_A_I_J_vcl contiguous blocks A(I, J) using GPU memory
+ * @param g_is_update container with indicators which blocks are active
+ * @param is_empty_block parameter that indicates if no blocks were assembled
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void block_assembly(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
+ std::vector<std::vector<unsigned int> > const & g_J,
+ std::vector<std::vector<unsigned int> > const & g_I,
+ block_matrix & g_A_I_J_vcl,
+ std::vector<cl_uint> & g_is_update,
+ bool & is_empty_block)
+{
+ //computing start indices for index sets and start indices for block matrices
+ unsigned int sz_I, sz_J, sz_blocks;
+ std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
+ std::vector<cl_uint> i_ind(g_I.size() + 1, static_cast<cl_uint>(0));
+ std::vector<cl_uint> j_ind(g_I.size() + 1, static_cast<cl_uint>(0));
+ std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
+ //
+ init_start_inds(g_J, j_ind);
+ init_start_inds(g_I, i_ind);
+ //
+ get_size(g_J, sz_J);
+ get_size(g_I, sz_I);
+ std::vector<cl_uint> I_set(sz_I, static_cast<cl_uint>(0));
+ //
+ std::vector<cl_uint> J_set(sz_J, static_cast<cl_uint>(0));
+
+ // computing size for blocks
+ // writing set to arrays
+ write_set_to_array(g_I, I_set);
+ write_set_to_array(g_J, J_set);
+
+ // if there are blocks to assemble
+ if (I_set.size() > 0 && J_set.size() > 0)
+ {
+ viennacl::context ctx = viennacl::traits::context(A);
+ viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
+ compute_blocks_size(g_I, g_J, sz_blocks, blocks_ind, matrix_dims);
+ std::vector<NumericT> con_A_I_J(sz_blocks, NumericT(0));
+
+ block_vector set_I_vcl, set_J_vcl;
+ //init memory on GPU
+ //contiguous g_A_I_J
+ g_A_I_J_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+ static_cast<unsigned int>(sizeof(NumericT)*(sz_blocks)),
+ &(con_A_I_J[0]));
+ g_A_I_J_vcl.handle().context(opencl_ctx);
+
+ //matrix_dimensions
+ g_A_I_J_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+ static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<cl_uint>(g_I.size())),
+ &(matrix_dims[0]));
+ g_A_I_J_vcl.handle1().context(opencl_ctx);
+
+ //start_block inds
+ g_A_I_J_vcl.handle2() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+ static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
+ &(blocks_ind[0]));
+ g_A_I_J_vcl.handle2().context(opencl_ctx);
+
+ //set_I
+ set_I_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+ static_cast<unsigned int>(sizeof(cl_uint)*sz_I),
+ &(I_set[0]));
+ set_I_vcl.handle().context(opencl_ctx);
+
+ //set_J
+ set_J_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+ static_cast<unsigned int>(sizeof(cl_uint)*sz_J),
+ &(J_set[0]));
+ set_J_vcl.handle().context(opencl_ctx);
+
+ //i_ind
+ set_I_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+ static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
+ &(i_ind[0]));
+ set_I_vcl.handle1().context(opencl_ctx);
+
+ //j_ind
+ set_J_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+ static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
+ &(j_ind[0]));
+ set_J_vcl.handle1().context(opencl_ctx);
+
+ viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+ static_cast<unsigned int>(sizeof(cl_uint)*g_is_update.size()),
+ &(g_is_update[0]));
+
+ viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx);
+ viennacl::ocl::kernel& assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "assemble_blocks");
+ assembly_kernel.local_work_size(0, 1);
+ assembly_kernel.global_work_size(0, 256);
+ viennacl::ocl::enqueue(assembly_kernel(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
+ set_I_vcl.handle(), set_J_vcl.handle(), set_I_vcl.handle1(),
+ set_J_vcl.handle1(),
+ g_A_I_J_vcl.handle2(), g_A_I_J_vcl.handle1(), g_A_I_J_vcl.handle(),
+ g_is_update_vcl,
+ static_cast<unsigned int>(g_I.size())));
+ is_empty_block = false;
+ }
+ else
+ is_empty_block = true;
+}
+
+/************************************************************************************************************************/
+
+/** @brief Insertion of the vectorized matrix columns into the original sparse matrix
+ *
+ * @param M_v column-major vectorized matrix
+ * @param M original sparse matrix
+ * @param is_right indicates if matrix should be transposed in the output
+ */
+template<typename SparseMatrixT, typename SparseVectorT>
+void insert_sparse_columns(std::vector<SparseVectorT> const & M_v,
+ SparseMatrixT& M,
+ bool is_right)
+{
+ if (is_right)
+ {
+ for (unsigned int i = 0; i < M_v.size(); ++i)
+ for (typename SparseVectorT::const_iterator vec_it = M_v[i].begin(); vec_it!=M_v[i].end(); ++vec_it)
+ M(vec_it->first, i) = vec_it->second;
+ }
+ else //transposed fill of M
+ {
+ for (unsigned int i = 0; i < M_v.size(); ++i)
+ for (typename SparseVectorT::const_iterator vec_it = M_v[i].begin(); vec_it!=M_v[i].end(); ++vec_it)
+ M(i, vec_it->first) = vec_it->second;
+ }
+}
+
+/** @brief Transposition of sparse matrix
+ *
+ * @param A_in initial sparse matrix
+ * @param A output transposed matrix
+ */
+template<typename MatrixT>
+void sparse_transpose(MatrixT const & A_in, MatrixT & A)
+{
+ typedef typename MatrixT::value_type NumericType;
+
+ std::vector<std::map<vcl_size_t, NumericType> > temp_A(A_in.size2());
+ A.resize(A_in.size2(), A_in.size1(), false);
+
+ for (typename MatrixT::const_iterator1 row_it = A_in.begin1();
+ row_it != A_in.end1();
+ ++row_it)
+ {
+ for (typename MatrixT::const_iterator2 col_it = row_it.begin();
+ col_it != row_it.end();
+ ++col_it)
+ {
+ temp_A[col_it.index2()][col_it.index1()] = *col_it;
+ }
+ }
+
+ for (vcl_size_t i=0; i<temp_A.size(); ++i)
+ {
+ for (typename std::map<vcl_size_t, NumericType>::const_iterator it = temp_A[i].begin();
+ it != temp_A[i].end();
+ ++it)
+ A(i, it->first) = it->second;
+ }
+}
+
+
+
+
+// template<typename SparseVectorType>
+// void custom_copy(std::vector<SparseVectorType> & M_v, std::vector<SparseVectorType> & l_M_v, const unsigned int beg_ind){
+// for (int i = 0; i < l_M_v.size(); ++i){
+// l_M_v[i] = M_v[i + beg_ind];
+// }
+// }
+
+//CPU version
+/** @brief Construction of SPAI preconditioner on CPU
+ *
+ * @param A initial sparse matrix
+ * @param M output preconditioner
+ * @param tag spai tag
+ */
+template<typename MatrixT>
+void computeSPAI(MatrixT const & A,
+ MatrixT & M,
+ spai_tag & tag)
+{
+ typedef typename MatrixT::value_type NumericT;
+ typedef typename boost::numeric::ublas::vector<NumericT> VectorType;
+ typedef typename viennacl::linalg::detail::spai::sparse_vector<NumericT> SparseVectorType;
+ typedef typename boost::numeric::ublas::matrix<NumericT> DenseMatrixType;
+
+ //sparse matrix transpose...
+ unsigned int cur_iter = 0;
+ tag.setBegInd(0); tag.setEndInd(VIENNACL_SPAI_K_b);
+ bool go_on = true;
+ std::vector<SparseVectorType> A_v_c(M.size2());
+ std::vector<SparseVectorType> M_v(M.size2());
+ vectorize_column_matrix(A, A_v_c);
+ vectorize_column_matrix(M, M_v);
+
+
+ while (go_on)
+ {
+ go_on = (tag.getEndInd() < static_cast<long>(M.size2()));
+ cur_iter = 0;
+ unsigned int l_sz = static_cast<unsigned int>(tag.getEndInd() - tag.getBegInd());
+ //std::vector<bool> g_is_update(M.size2(), true);
+ std::vector<bool> g_is_update(l_sz, true);
+
+ //init is update
+ //init_parallel_is_update(g_is_update);
+ //std::vector<SparseVectorType> A_v_c(K);
+ //std::vector<SparseVectorType> M_v(K);
+ //vectorization of matrices
+ //print_matrix(M_v);
+
+ std::vector<SparseVectorType> l_M_v(l_sz);
+ //custom_copy(M_v, l_M_v, beg_ind);
+ std::copy(M_v.begin() + tag.getBegInd(), M_v.begin() + tag.getEndInd(), l_M_v.begin());
+
+ //print_matrix(l_M_v);
+ //std::vector<SparseVectorType> l_A_v_c(K);
+ //custom_copy(A_v_c, l_A_v_c, beg_ind);
+ //std::copy(A_v_c.begin() + beg_ind, A_v_c.begin() + end_ind, l_A_v_c.begin());
+ //print_matrix(l_A_v_c);
+ //vectorize_row_matrix(A, A_v_r);
+ //working blocks
+
+ std::vector<DenseMatrixType> g_A_I_J(l_sz);
+ std::vector<VectorType> g_b_v(l_sz);
+ std::vector<SparseVectorType> g_res(l_sz);
+ std::vector<std::vector<unsigned int> > g_I(l_sz);
+ std::vector<std::vector<unsigned int> > g_J(l_sz);
+
+ while ((cur_iter < tag.getIterationLimit()) && is_all_update(g_is_update))
+ {
+ // SET UP THE BLOCKS..
+ // PHASE ONE
+ if (cur_iter == 0)
+ block_set_up(A, A_v_c, l_M_v, g_I, g_J, g_A_I_J, g_b_v);
+ else
+ block_update(A, A_v_c, g_res, g_is_update, g_I, g_J, g_b_v, g_A_I_J, tag);
+
+ //PHASE TWO, LEAST SQUARE SOLUTION
+ least_square_solve(A_v_c, g_A_I_J, g_b_v, g_I, g_J, g_res, g_is_update, l_M_v, tag);
+
+ if (tag.getIsStatic()) break;
+ cur_iter++;
+ }
+
+ std::copy(l_M_v.begin(), l_M_v.end(), M_v.begin() + tag.getBegInd());
+ tag.setBegInd(tag.getEndInd());//beg_ind = end_ind;
+ tag.setEndInd(std::min(static_cast<long>(tag.getBegInd() + VIENNACL_SPAI_K_b), static_cast<long>(M.size2())));
+ //std::copy(l_M_v.begin(), l_M_v.end(), M_v.begin() + tag.getBegInd());
+ }
+
+ M.resize(M.size1(), M.size2(), false);
+ insert_sparse_columns(M_v, M, tag.getIsRight());
+}
+
+
+//GPU - based version
+/** @brief Construction of SPAI preconditioner on GPU
+ *
+ * @param A initial sparse matrix
+ * @param cpu_A copy of initial matrix on CPU
+ * @param cpu_M output preconditioner on CPU
+ * @param M output preconditioner
+ * @param tag SPAI tag class with parameters
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void computeSPAI(viennacl::compressed_matrix<NumericT, AlignmentV> const & A, //input
+ boost::numeric::ublas::compressed_matrix<NumericT> const & cpu_A,
+ boost::numeric::ublas::compressed_matrix<NumericT> & cpu_M, //output
+ viennacl::compressed_matrix<NumericT, AlignmentV> & M,
+ spai_tag const & tag)
+{
+ typedef typename viennacl::linalg::detail::spai::sparse_vector<NumericT> SparseVectorType;
+
+ //typedef typename viennacl::compressed_matrix<ScalarType> GPUSparseMatrixType;
+ //sparse matrix transpose...
+ unsigned int cur_iter = 0;
+ std::vector<cl_uint> g_is_update(cpu_M.size2(), static_cast<cl_uint>(1));
+ //init is update
+ //init_parallel_is_update(g_is_update);
+ std::vector<SparseVectorType> A_v_c(cpu_M.size2());
+ std::vector<SparseVectorType> M_v(cpu_M.size2());
+ vectorize_column_matrix(cpu_A, A_v_c);
+ vectorize_column_matrix(cpu_M, M_v);
+ std::vector<SparseVectorType> g_res(cpu_M.size2());
+ std::vector<std::vector<unsigned int> > g_I(cpu_M.size2());
+ std::vector<std::vector<unsigned int> > g_J(cpu_M.size2());
+
+ //OpenCL variables
+ block_matrix g_A_I_J_vcl;
+ block_vector g_bv_vcl;
+ while ((cur_iter < tag.getIterationLimit()) && is_all_update(g_is_update))
+ {
+ // SET UP THE BLOCKS..
+ // PHASE ONE..
+ //timer.start();
+ //index set up on CPU
+ if (cur_iter == 0)
+ block_set_up(A, A_v_c, M_v, g_is_update, g_I, g_J, g_A_I_J_vcl, g_bv_vcl);
+ else
+ block_update(A, A_v_c, g_is_update, g_res, g_J, g_I, g_A_I_J_vcl, g_bv_vcl, tag);
+ //std::cout<<"Phase 2 timing: "<<timer.get()<<std::endl;
+ //PERFORM LEAST SQUARE problems solution
+ //PHASE TWO
+ //timer.start();
+ least_square_solve<SparseVectorType, NumericT>(A_v_c, M_v, g_I, g_J, g_A_I_J_vcl, g_bv_vcl, g_res, g_is_update, tag, viennacl::traits::context(A));
+ //std::cout<<"Phase 3 timing: "<<timer.get()<<std::endl;
+ if (tag.getIsStatic())
+ break;
+ cur_iter++;
+ }
+
+ cpu_M.resize(cpu_M.size1(), cpu_M.size2(), false);
+ insert_sparse_columns(M_v, cpu_M, tag.getIsRight());
+ //copy back to GPU
+ M.resize(static_cast<unsigned int>(cpu_M.size1()), static_cast<unsigned int>(cpu_M.size2()));
+ viennacl::copy(cpu_M, M);
+}
+
+}
+}
+}
+}
+#endif
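
A rough sketch of how the CPU variant of computeSPAI() above can be driven with uBLAS types. In ViennaCL this routine is normally called through the spai_precond preconditioner class rather than directly, the identity start pattern for M is an assumption for illustration, and the header pulls in the OpenCL backend via its includes.

#include "boost/numeric/ublas/matrix_sparse.hpp"
#include "viennacl/linalg/detail/spai/spai.hpp"

typedef boost::numeric::ublas::compressed_matrix<double> UblasSparse;

void build_spai_preconditioner(UblasSparse const & A, UblasSparse & M)
{
  using namespace viennacl::linalg::detail::spai;

  spai_tag tag(1e-3,   // residual norm threshold
               5,      // iteration limit
               1e-2,   // residual threshold for index augmentation
               false,  // dynamic (non-static) pattern
               true);  // right preconditioner

  // start from the identity pattern (assumed initialization; the routine refines the pattern per column)
  M.resize(A.size1(), A.size2(), false);
  for (std::size_t i = 0; i < A.size1(); ++i)
    M(i, i) = 1.0;

  computeSPAI(A, M, tag);   // M now holds the sparse approximate inverse of A
}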
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai_tag.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai_tag.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai_tag.hpp
new file mode 100644
index 0000000..d8c718c
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai_tag.hpp
@@ -0,0 +1,143 @@
+#ifndef VIENNACL_LINALG_DETAIL_SPAI_SPAI_TAG_HPP
+#define VIENNACL_LINALG_DETAIL_SPAI_SPAI_TAG_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/detail/spai/spai_tag.hpp
+ @brief Implementation of the spai tag holding SPAI configuration parameters. Experimental.
+
+ SPAI code contributed by Nikolay Lukash
+*/
+
+
+#include <utility>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <algorithm>
+#include <vector>
+#include <math.h>
+#include <cmath>
+#include <sstream>
+#include "viennacl/ocl/backend.hpp"
+#include "boost/numeric/ublas/vector.hpp"
+#include "boost/numeric/ublas/matrix.hpp"
+#include "boost/numeric/ublas/matrix_proxy.hpp"
+#include "boost/numeric/ublas/storage.hpp"
+#include "boost/numeric/ublas/io.hpp"
+#include "boost/numeric/ublas/matrix_expression.hpp"
+#include "boost/numeric/ublas/detail/matrix_assign.hpp"
+
+#include "viennacl/linalg/detail/spai/block_matrix.hpp"
+#include "viennacl/linalg/detail/spai/block_vector.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace spai
+{
+
+/** @brief A tag for SPAI
+ *
+ * Contains values for the algorithm.
+ * Must be passed to spai_precond constructor
+ */
+class spai_tag
+{
+ /** @brief Constructor
+ *
+ * @param residual_norm_threshold Calculate until the norm of the residual falls below this threshold
+ * @param iteration_limit maximum number of iterations
+ * @param residual_threshold determines starting threshold in residual vector for including new indices into set J
+ * @param is_static determines if static version of SPAI should be used
+ * @param is_right determines if left or right preconditioner should be used
+ */
+public:
+ spai_tag(double residual_norm_threshold = 1e-3,
+ unsigned int iteration_limit = 5,
+ double residual_threshold = 1e-2,
+ bool is_static = false,
+ bool is_right = false)
+ : residual_norm_threshold_(residual_norm_threshold),
+ iteration_limit_(iteration_limit),
+ residual_threshold_(residual_threshold),
+ is_static_(is_static),
+ is_right_(is_right) {}
+
+ double getResidualNormThreshold() const { return residual_norm_threshold_; }
+
+ double getResidualThreshold() const { return residual_threshold_; }
+
+ unsigned int getIterationLimit () const { return iteration_limit_; }
+
+ bool getIsStatic() const { return is_static_; }
+
+ bool getIsRight() const { return is_right_; }
+
+ long getBegInd() const { return beg_ind_; }
+
+ long getEndInd() const { return end_ind_; }
+
+
+
+ void setResidualNormThreshold(double residual_norm_threshold)
+ {
+ if (residual_norm_threshold > 0)
+ residual_norm_threshold_ = residual_norm_threshold;
+ }
+
+ void setResidualThreshold(double residual_threshold)
+ {
+ if (residual_threshold > 0)
+ residual_threshold_ = residual_threshold;
+ }
+
+ void setIterationLimit(unsigned int iteration_limit)
+ {
+ if (iteration_limit > 0)
+ iteration_limit_ = iteration_limit;
+ }
+
+ void setIsRight(bool is_right) { is_right_ = is_right; }
+
+ void setIsStatic(bool is_static) { is_static_ = is_static; }
+
+ void setBegInd(long beg_ind) { beg_ind_ = beg_ind; }
+
+ void setEndInd(long end_ind){ end_ind_ = end_ind; }
+
+
+private:
+ double residual_norm_threshold_;
+ unsigned int iteration_limit_;
+ long beg_ind_;
+ long end_ind_;
+ double residual_threshold_;
+ bool is_static_;
+ bool is_right_;
+};
+
+}
+}
+}
+}
+#endif
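
The tag above is pure configuration; a brief sketch of how its setters and getters are meant to be combined, with illustrative values.

#include "viennacl/linalg/detail/spai/spai_tag.hpp"

int main()
{
  viennacl::linalg::detail::spai::spai_tag tag;   // defaults: 1e-3, 5 iterations, 1e-2, dynamic pattern, left preconditioner

  tag.setResidualNormThreshold(5e-4);   // silently ignored if the value is not positive
  tag.setIterationLimit(10);
  tag.setIsRight(true);                 // request a right preconditioner

  bool dynamic_pattern = !tag.getIsStatic();
  (void)dynamic_pattern;
  return 0;
}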
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/sparse_vector.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/sparse_vector.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/sparse_vector.hpp
new file mode 100644
index 0000000..c99eda1
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/sparse_vector.hpp
@@ -0,0 +1,85 @@
+#ifndef VIENNACL_LINALG_DETAIL_SPAI_SPARSE_VECTOR_HPP
+#define VIENNACL_LINALG_DETAIL_SPAI_SPARSE_VECTOR_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/spai/sparse_vector.hpp
+ @brief Implementation of a helper sparse vector class for SPAI. Experimental.
+
+ SPAI code contributed by Nikolay Lukash
+*/
+
+#include <utility>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <algorithm>
+#include <vector>
+#include <math.h>
+#include <map>
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace spai
+{
+
+/**
+ * @brief Represents a sparse vector based on std::map<unsigned int, NumericT>
+ */
+template<typename NumericT>
+class sparse_vector
+{
+public:
+ typedef typename std::map<unsigned int, NumericT>::iterator iterator;
+ typedef typename std::map<unsigned int, NumericT>::const_iterator const_iterator;
+
+ sparse_vector() {}
+
+ /** @brief Read-write access to the entry with index ind
+ *
+ * Missing entries are default-constructed and inserted, as in std::map.
+ */
+ NumericT & operator[] (unsigned int ind) { return v_[ind]; }
+
+ void clear() { v_.clear(); }
+
+ const_iterator find(unsigned int var) const { return v_.find(var); }
+ iterator find(unsigned int var) { return v_.find(var); }
+
+ const_iterator begin() const { return v_.begin(); }
+ iterator begin() { return v_.begin(); }
+ const_iterator end() const { return v_.end(); }
+ iterator end() { return v_.end(); }
+
+private:
+ unsigned int size_;
+ std::map<unsigned int, NumericT> v_;
+};
+
+}
+}
+}
+}
+
+#endif
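
Since the class above is a thin wrapper around std::map, a short sketch of the access semantics the SPAI code relies on:

#include <iostream>
#include "viennacl/linalg/detail/spai/sparse_vector.hpp"

int main()
{
  viennacl::linalg::detail::spai::sparse_vector<double> v;

  v[3]  = 2.5;    // operator[] inserts missing entries, exactly like std::map
  v[7] += 1.0;    // accumulation on a fresh entry starts from a value-initialized 0.0

  // iteration visits only the stored (index, value) pairs, in ascending index order
  for (viennacl::linalg::detail::spai::sparse_vector<double>::iterator it = v.begin(); it != v.end(); ++it)
    std::cout << it->first << " -> " << it->second << std::endl;

  return 0;
}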
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/direct_solve.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/direct_solve.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/direct_solve.hpp
new file mode 100644
index 0000000..a3340d7
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/direct_solve.hpp
@@ -0,0 +1,580 @@
+#ifndef VIENNACL_LINALG_DIRECT_SOLVE_HPP_
+#define VIENNACL_LINALG_DIRECT_SOLVE_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/direct_solve.hpp
+ @brief Implementations of dense direct solvers are found here.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/matrix_proxy.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/host_based/direct_solve.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+ #include "viennacl/linalg/opencl/direct_solve.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+ #include "viennacl/linalg/cuda/direct_solve.hpp"
+#endif
+
+#define VIENNACL_DIRECT_SOLVE_BLOCKSIZE 128
+
+namespace viennacl
+{
+namespace linalg
+{
+
+namespace detail
+{
+
+ //
+ // A \ B:
+ //
+
+ /** @brief Direct inplace solver for dense triangular systems using a single kernel launch. Matlab notation: A \ B
+ *
+ * @param A The system matrix
+ * @param B The matrix of row vectors, where the solution is directly written to
+ */
+ template<typename NumericT, typename SolverTagT>
+ void inplace_solve_kernel(const matrix_base<NumericT> & A, const matrix_base<NumericT> & B, SolverTagT)
+ {
+ assert( (viennacl::traits::size1(A) == viennacl::traits::size2(A)) && bool("Size check failed in inplace_solve(): size1(A) != size2(A)"));
+ assert( (viennacl::traits::size1(A) == viennacl::traits::size1(B)) && bool("Size check failed in inplace_solve(): size1(A) != size1(B)"));
+ switch (viennacl::traits::handle(A).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::inplace_solve(A, const_cast<matrix_base<NumericT> &>(B), SolverTagT());
+ break;
+ #ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::inplace_solve(A, const_cast<matrix_base<NumericT> &>(B), SolverTagT());
+ break;
+ #endif
+ #ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::inplace_solve(A, const_cast<matrix_base<NumericT> &>(B), SolverTagT());
+ break;
+ #endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+ }
+
+
+ //
+ // A \ b
+ //
+
+ template<typename NumericT, typename SolverTagT>
+ void inplace_solve_vec_kernel(const matrix_base<NumericT> & mat,
+ const vector_base<NumericT> & vec,
+ SolverTagT)
+ {
+ assert( (mat.size1() == vec.size()) && bool("Size check failed in inplace_solve(): size1(A) != size(b)"));
+ assert( (mat.size2() == vec.size()) && bool("Size check failed in inplace_solve(): size2(A) != size(b)"));
+
+ switch (viennacl::traits::handle(mat).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::inplace_solve(mat, const_cast<vector_base<NumericT> &>(vec), SolverTagT());
+ break;
+ #ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::inplace_solve(mat, const_cast<vector_base<NumericT> &>(vec), SolverTagT());
+ break;
+ #endif
+ #ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::inplace_solve(mat, const_cast<vector_base<NumericT> &>(vec), SolverTagT());
+ break;
+ #endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+ }
+
+
+ template<typename MatrixT1, typename MatrixT2, typename SolverTagT>
+ void inplace_solve_lower_impl(MatrixT1 const & A, MatrixT2 & B, SolverTagT)
+ {
+ typedef typename viennacl::result_of::cpu_value_type<MatrixT1>::type NumericType;
+
+ vcl_size_t blockSize = VIENNACL_DIRECT_SOLVE_BLOCKSIZE;
+ if (A.size1() <= blockSize)
+ inplace_solve_kernel(A, B, SolverTagT());
+ else
+ {
+ for (vcl_size_t i = 0; i < A.size1(); i = i + blockSize)
+ {
+ vcl_size_t Apos1 = i;
+ vcl_size_t Apos2 = std::min<vcl_size_t>(A.size1(), i + blockSize);
+ vcl_size_t Bpos = B.size2();
+ inplace_solve_kernel(viennacl::project(A, viennacl::range(Apos1, Apos2), viennacl::range(Apos1, Apos2)),
+ viennacl::project(B, viennacl::range(Apos1, Apos2), viennacl::range(0, Bpos)),
+ SolverTagT());
+ if (Apos2 < A.size1())
+ {
+ viennacl::matrix_range<MatrixT2> B_lower(B, viennacl::range(Apos2, B.size1()), viennacl::range(0, Bpos));
+ viennacl::linalg::prod_impl(viennacl::project(A, viennacl::range(Apos2, A.size1()), viennacl::range(Apos1, Apos2)),
+ viennacl::project(B, viennacl::range(Apos1, Apos2), viennacl::range(0, Bpos)),
+ B_lower,
+ NumericType(-1.0), NumericType(1.0));
+ }
+ }
+ }
+ }
+
+ template<typename MatrixT1, typename MatrixT2>
+ void inplace_solve_impl(MatrixT1 const & A, MatrixT2 & B, viennacl::linalg::lower_tag)
+ {
+ inplace_solve_lower_impl(A, B, viennacl::linalg::lower_tag());
+ }
+
+ template<typename MatrixT1, typename MatrixT2>
+ void inplace_solve_impl(MatrixT1 const & A, MatrixT2 & B, viennacl::linalg::unit_lower_tag)
+ {
+ inplace_solve_lower_impl(A, B, viennacl::linalg::unit_lower_tag());
+ }
+
+ template<typename MatrixT1, typename MatrixT2, typename SolverTagT>
+ void inplace_solve_upper_impl(MatrixT1 const & A, MatrixT2 & B, SolverTagT)
+ {
+ typedef typename viennacl::result_of::cpu_value_type<MatrixT1>::type NumericType;
+
+ int blockSize = VIENNACL_DIRECT_SOLVE_BLOCKSIZE;
+ if (static_cast<int>(A.size1()) <= blockSize)
+ inplace_solve_kernel(A, B, SolverTagT());
+ else
+ {
+ for (int i = static_cast<int>(A.size1()); i > 0; i = i - blockSize)
+ {
+ vcl_size_t Apos1 = vcl_size_t(std::max<int>(0, i - blockSize));
+ vcl_size_t Apos2 = vcl_size_t(i);
+ vcl_size_t Bpos = B.size2();
+ inplace_solve_kernel(viennacl::project(A, viennacl::range(Apos1, Apos2), viennacl::range(Apos1, Apos2)),
+ viennacl::project(B, viennacl::range(Apos1, Apos2), viennacl::range(0, Bpos)),
+ SolverTagT());
+ if (Apos1 > 0)
+ {
+ viennacl::matrix_range<MatrixT2> B_upper(B, viennacl::range(0, Apos1), viennacl::range(0, Bpos));
+
+ viennacl::linalg::prod_impl(viennacl::project(A, viennacl::range(0, Apos1), viennacl::range(Apos1, Apos2)),
+ viennacl::project(B, viennacl::range(Apos1, Apos2), viennacl::range(0, Bpos)),
+ B_upper,
+ NumericType(-1.0), NumericType(1.0));
+ }
+ }
+ }
+ }
+
+ template<typename MatrixT1, typename MatrixT2>
+ void inplace_solve_impl(MatrixT1 const & A, MatrixT2 & B, viennacl::linalg::upper_tag)
+ {
+ inplace_solve_upper_impl(A, B, viennacl::linalg::upper_tag());
+ }
+
+ template<typename MatrixT1, typename MatrixT2>
+ void inplace_solve_impl(MatrixT1 const & A, MatrixT2 & B, viennacl::linalg::unit_upper_tag)
+ {
+ inplace_solve_upper_impl(A, B, viennacl::linalg::unit_upper_tag());
+ }
+
+} // namespace detail
+
+/** @brief Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B (MATLAB notation)
+*
+* @param A The system matrix
+* @param B The matrix of row vectors, where the solution is directly written to
+*/
+template<typename NumericT, typename SolverTagT>
+void inplace_solve(const matrix_base<NumericT> & A,
+ matrix_base<NumericT> & B,
+ SolverTagT)
+{
+ detail::inplace_solve_impl(A,B,SolverTagT());
+}
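+
+// A minimal usage sketch for the matrix-valued inplace solver above (assumes
+// hypothetical, pre-filled A and B; lower_tag() uses only the lower triangular part of A):
+//   viennacl::matrix<double> A(n, n);
+//   viennacl::matrix<double> B(n, k);
+//   viennacl::linalg::inplace_solve(A, B, viennacl::linalg::lower_tag());   // B <- A \ B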
+
+/** @brief Direct inplace solver for triangular systems with multiple transposed right hand sides, i.e. A \ B^T (MATLAB notation)
+*
+* @param A The system matrix
+* @param proxy_B The proxy for the transposed matrix of row vectors, where the solution is directly written to
+*/
+template<typename NumericT, typename SolverTagT>
+void inplace_solve(const matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> proxy_B,
+ SolverTagT)
+{
+ typedef typename matrix_base<NumericT>::handle_type handle_type;
+
+ matrix_base<NumericT> B(const_cast<handle_type &>(proxy_B.lhs().handle()),
+ proxy_B.lhs().size2(), proxy_B.lhs().start2(), proxy_B.lhs().stride2(), proxy_B.lhs().internal_size2(),
+ proxy_B.lhs().size1(), proxy_B.lhs().start1(), proxy_B.lhs().stride1(), proxy_B.lhs().internal_size1(),
+ !proxy_B.lhs().row_major());
+
+ detail::inplace_solve_impl(A,B,SolverTagT());
+}
+
+//upper triangular solver for transposed lower triangular matrices
+/** @brief Direct inplace solver for transposed triangular systems with multiple right hand sides, i.e. A^T \ B (MATLAB notation)
+*
+* @param proxy_A The transposed system matrix proxy
+* @param B The matrix holding the load vectors, where the solution is directly written to
+*/
+template<typename NumericT, typename SolverTagT>
+void inplace_solve(const matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & proxy_A,
+ matrix_base<NumericT> & B,
+ SolverTagT)
+{
+ typedef typename matrix_base<NumericT>::handle_type handle_type;
+
+ matrix_base<NumericT> A(const_cast<handle_type &>(proxy_A.lhs().handle()),
+ proxy_A.lhs().size2(), proxy_A.lhs().start2(), proxy_A.lhs().stride2(), proxy_A.lhs().internal_size2(),
+ proxy_A.lhs().size1(), proxy_A.lhs().start1(), proxy_A.lhs().stride1(), proxy_A.lhs().internal_size1(),
+ !proxy_A.lhs().row_major());
+
+ detail::inplace_solve_impl(A,B,SolverTagT());
+}
+
+/** @brief Direct inplace solver for transposed triangular systems with multiple transposed right hand sides, i.e. A^T \ B^T (MATLAB notation)
+*
+* @param proxy_A The transposed system matrix proxy
+* @param proxy_B The transposed matrix holding the load vectors, where the solution is directly written to
+*/
+template<typename NumericT, typename SolverTagT>
+void inplace_solve(matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> const & proxy_A,
+ matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> proxy_B,
+ SolverTagT)
+{
+ typedef typename matrix_base<NumericT>::handle_type handle_type;
+
+ matrix_base<NumericT> A(const_cast<handle_type &>(proxy_A.lhs().handle()),
+ proxy_A.lhs().size2(), proxy_A.lhs().start2(), proxy_A.lhs().stride2(), proxy_A.lhs().internal_size2(),
+ proxy_A.lhs().size1(), proxy_A.lhs().start1(), proxy_A.lhs().stride1(), proxy_A.lhs().internal_size1(),
+ !proxy_A.lhs().row_major());
+
+ matrix_base<NumericT> B(const_cast<handle_type &>(proxy_B.lhs().handle()),
+ proxy_B.lhs().size2(), proxy_B.lhs().start2(), proxy_B.lhs().stride2(), proxy_B.lhs().internal_size2(),
+ proxy_B.lhs().size1(), proxy_B.lhs().start1(), proxy_B.lhs().stride1(), proxy_B.lhs().internal_size1(),
+ !proxy_B.lhs().row_major());
+
+ detail::inplace_solve_impl(A,B,SolverTagT());
+}
+
+
+/////////////////// general wrappers for non-inplace solution //////////////////////
+
+
+/** @brief Convenience function for C = solve(A, B, some_tag()); Creates a temporary result matrix and forwards the request to inplace_solve()
+*
+* @param A The system matrix
+* @param B The matrix of load vectors
+* @param tag Dispatch tag
+*/
+template<typename NumericT, typename SolverTagT>
+matrix_base<NumericT> solve(const matrix_base<NumericT> & A,
+ const matrix_base<NumericT> & B,
+ SolverTagT tag)
+{
+ // do an inplace solve on the result vector:
+ matrix_base<NumericT> result(B);
+ inplace_solve(A, result, tag);
+ return result;
+}
+
+/** @brief Convenience function for C = solve(A, B^T, some_tag()); Creates a temporary result matrix and forwards the request to inplace_solve()
+*
+* @param A The system matrix
+* @param proxy The transposed load vector
+* @param tag Dispatch tag
+*/
+template<typename NumericT, typename SolverTagT>
+matrix_base<NumericT> solve(const matrix_base<NumericT> & A,
+ const matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & proxy,
+ SolverTagT tag)
+{
+ // do an inplace solve on the result vector:
+ matrix_base<NumericT> result(proxy);
+ inplace_solve(A, result, tag);
+ return result;
+}
+
+/** @brief Convenience function for result = solve(trans(mat), B, some_tag()); Creates a temporary result matrix and forwards the request to inplace_solve()
+*
+* @param proxy The transposed system matrix proxy
+* @param B The matrix of load vectors
+* @param tag Dispatch tag
+*/
+template<typename NumericT, typename SolverTagT>
+matrix_base<NumericT> solve(const matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & proxy,
+ const matrix_base<NumericT> & B,
+ SolverTagT tag)
+{
+ // do an inplace solve on the result vector:
+ matrix_base<NumericT> result(B);
+ inplace_solve(proxy, result, tag);
+ return result;
+}
+
+/** @brief Convenience function for result = solve(trans(A), trans(B), some_tag()); Creates a temporary result matrix and forwards the request to inplace_solve()
+*
+* @param proxy_A The transposed system matrix proxy
+* @param proxy_B The transposed matrix of load vectors
+* @param tag Dispatch tag
+*/
+template<typename NumericT, typename SolverTagT>
+matrix_base<NumericT> solve(const matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & proxy_A,
+ const matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & proxy_B,
+ SolverTagT tag)
+{
+ // run an inplace solve on the result vector:
+ matrix_base<NumericT> result(proxy_B);
+ inplace_solve(proxy_A, result, tag);
+ return result;
+}
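+
+// A minimal usage sketch for the non-inplace wrappers above (hypothetical,
+// pre-filled A and B; viennacl::trans() builds the op_trans proxy):
+//   viennacl::matrix<double> X(B.size1(), B.size2());
+//   X = viennacl::linalg::solve(A, B, viennacl::linalg::upper_tag());                    // X = A^{-1} * B
+//   X = viennacl::linalg::solve(viennacl::trans(A), B, viennacl::linalg::lower_tag());   // X = A^{-T} * B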
+
+//
+/////////// solves with vector as right hand side ///////////////////
+//
+
+namespace detail
+{
+ template<typename MatrixT1, typename VectorT, typename SolverTagT>
+ void inplace_solve_lower_vec_impl(MatrixT1 const & A, VectorT & b, SolverTagT)
+ {
+ vcl_size_t blockSize = VIENNACL_DIRECT_SOLVE_BLOCKSIZE;
+ if (A.size1() <= blockSize)
+ inplace_solve_vec_kernel(A, b, SolverTagT());
+ else
+ {
+ VectorT temp(b);
+ for (vcl_size_t i = 0; i < A.size1(); i = i + blockSize)
+ {
+ vcl_size_t Apos1 = i;
+ vcl_size_t Apos2 = std::min<vcl_size_t>(A.size1(), i + blockSize);
+ inplace_solve_vec_kernel(viennacl::project(A, viennacl::range(Apos1, Apos2), viennacl::range(Apos1, Apos2)),
+ viennacl::project(b, viennacl::range(Apos1, Apos2)),
+ SolverTagT());
+ if (Apos2 < A.size1())
+ {
+ viennacl::project(temp, viennacl::range(Apos2, A.size1())) = viennacl::linalg::prod(viennacl::project(A, viennacl::range(Apos2, A.size1()), viennacl::range(Apos1, Apos2)),
+ viennacl::project(b, viennacl::range(Apos1, Apos2)));
+ viennacl::project(b, viennacl::range(Apos2, A.size1())) -= viennacl::project(temp, viennacl::range(Apos2, A.size1()));
+ }
+ }
+ }
+ }
+
+ template<typename MatrixT1, typename VectorT>
+ void inplace_solve_vec_impl(MatrixT1 const & A, VectorT & B, viennacl::linalg::lower_tag)
+ {
+ inplace_solve_lower_vec_impl(A, B, viennacl::linalg::lower_tag());
+ }
+
+ template<typename MatrixT1, typename VectorT>
+ void inplace_solve_vec_impl(MatrixT1 const & A, VectorT & B, viennacl::linalg::unit_lower_tag)
+ {
+ inplace_solve_lower_vec_impl(A, B, viennacl::linalg::unit_lower_tag());
+ }
+
+ template<typename MatrixT1, typename VectorT, typename SolverTagT>
+ void inplace_solve_upper_vec_impl(MatrixT1 const & A, VectorT & b, SolverTagT)
+ {
+ int blockSize = VIENNACL_DIRECT_SOLVE_BLOCKSIZE;
+ if (static_cast<int>(A.size1()) <= blockSize)
+ inplace_solve_vec_kernel(A, b, SolverTagT());
+ else
+ {
+ VectorT temp(b);
+ for (int i = static_cast<int>(A.size1()); i > 0; i = i - blockSize)
+ {
+ vcl_size_t Apos1 = vcl_size_t(std::max<int>(0, i - blockSize));
+ vcl_size_t Apos2 = vcl_size_t(i);
+ inplace_solve_vec_kernel(viennacl::project(A, viennacl::range(Apos1, Apos2), viennacl::range(Apos1, Apos2)),
+ viennacl::project(b, viennacl::range(Apos1, Apos2)),
+ SolverTagT());
+ if (Apos1 > 0)
+ {
+ viennacl::project(temp, viennacl::range(0, Apos1)) = viennacl::linalg::prod(viennacl::project(A, viennacl::range(0, Apos1), viennacl::range(Apos1, Apos2)),
+ viennacl::project(b, viennacl::range(Apos1, Apos2)));
+ viennacl::project(b, viennacl::range(0, Apos1)) -= viennacl::project(temp, viennacl::range(0, Apos1));
+ }
+ }
+ }
+ }
+
+ template<typename MatrixT1, typename VectorT>
+ void inplace_solve_vec_impl(MatrixT1 const & A, VectorT & b, viennacl::linalg::upper_tag)
+ {
+ inplace_solve_upper_vec_impl(A, b, viennacl::linalg::upper_tag());
+ }
+
+ template<typename MatrixT1, typename VectorT>
+ void inplace_solve_vec_impl(MatrixT1 const & A, VectorT & b, viennacl::linalg::unit_upper_tag)
+ {
+ inplace_solve_upper_vec_impl(A, b, viennacl::linalg::unit_upper_tag());
+ }
+
+} // namespace detail
+
+/** @brief Inplace solution of a triangular system. Matlab notation A \ b.
+*
+* @param mat The system matrix (a dense matrix for which only the respective triangular form is used)
+* @param vec The right hand side vector
+* @param tag The tag (either lower_tag, unit_lower_tag, upper_tag, or unit_upper_tag)
+*/
+template<typename NumericT, typename SolverTagT>
+void inplace_solve(const matrix_base<NumericT> & mat,
+ vector_base<NumericT> & vec,
+ SolverTagT const & tag)
+{
+
+ detail::inplace_solve_vec_impl(mat, vec, tag);
+}
+
+/** @brief Inplace solution of a triangular system with transposed system matrix. Matlab notation A' \ b.
+*
+* @param proxy The transposed system matrix (a dense matrix for which only the respective triangular form is used)
+* @param vec The right hand side vector
+* @param tag The tag (either lower_tag, unit_lower_tag, upper_tag, or unit_upper_tag)
+*/
+template<typename NumericT, typename SolverTagT>
+void inplace_solve(matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> const & proxy,
+ vector_base<NumericT> & vec,
+ SolverTagT const & tag)
+{
+ typedef typename matrix_base<NumericT>::handle_type handle_type;
+
+ // wrap existing matrix in a new matrix_base object (no data copy)
+ matrix_base<NumericT> mat(const_cast<handle_type &>(proxy.lhs().handle()),
+ proxy.lhs().size2(), proxy.lhs().start2(), proxy.lhs().stride2(), proxy.lhs().internal_size2(),
+ proxy.lhs().size1(), proxy.lhs().start1(), proxy.lhs().stride1(), proxy.lhs().internal_size1(),
+ !proxy.lhs().row_major());
+ detail::inplace_solve_vec_impl(mat, vec, tag);
+}
+
+
+/** @brief Convenience function for result = solve(mat, vec, upper_tag()); for an upper triangular solve.
+*
+* Creates a temporary result vector and forwards the request to inplace_solve()
+*
+* @param mat The system matrix
+* @param vec The load vector
+* @param tag Dispatch tag
+*/
+template<typename NumericT>
+vector<NumericT> solve(const matrix_base<NumericT> & mat,
+ const vector_base<NumericT> & vec,
+ viennacl::linalg::upper_tag const & tag)
+{
+ // run an inplace solve on the result vector:
+ vector<NumericT> result(vec);
+ inplace_solve(mat, result, tag);
+ return result;
+}
+
+/** @brief Convenience function for result = solve(mat, vec, unit_upper_tag()); for an upper triangular solve with unit diagonal.
+*
+* Creates a temporary result vector and forwards the request to inplace_solve()
+*
+* @param mat The system matrix
+* @param vec The load vector
+* @param tag Dispatch tag
+*/
+template<typename NumericT>
+vector<NumericT> solve(const matrix_base<NumericT> & mat,
+ const vector_base<NumericT> & vec,
+ viennacl::linalg::unit_upper_tag const & tag)
+{
+ // run an inplace solve on the result vector:
+ vector<NumericT> result(vec);
+ inplace_solve(mat, result, tag);
+ return result;
+}
+
+/** @brief Convenience function for result = solve(mat, vec, lower_tag()); for a lower triangular solve.
+*
+* Creates a temporary result vector and forwards the request to inplace_solve()
+*
+* @param mat The system matrix
+* @param vec The load vector
+* @param tag Dispatch tag
+*/
+template<typename NumericT>
+vector<NumericT> solve(const matrix_base<NumericT> & mat,
+ const vector_base<NumericT> & vec,
+ viennacl::linalg::lower_tag const & tag)
+{
+ // run an inplace solve on the result vector:
+ vector<NumericT> result(vec);
+ inplace_solve(mat, result, tag);
+ return result;
+}
+
+/** @brief Convenience function for result = solve(mat, vec, unit_lower_tag()); for a lower triangular solve with unit diagonal.
+*
+* Creates a temporary result vector and forwards the request to inplace_solve()
+*
+* @param mat The system matrix
+* @param vec The load vector
+* @param tag Dispatch tag
+*/
+template<typename NumericT>
+vector<NumericT> solve(const matrix_base<NumericT> & mat,
+ const vector_base<NumericT> & vec,
+ viennacl::linalg::unit_lower_tag const & tag)
+{
+ // run an inplace solve on the result vector:
+ vector<NumericT> result(vec);
+ inplace_solve(mat, result, tag);
+ return result;
+}
+
+/** @brief Convenience function for result = solve(trans(mat), vec, some_tag()); Creates a temporary result vector and forwards the request to inplace_solve()
+*
+* @param proxy The transposed system matrix proxy
+* @param vec The load vector, where the solution is directly written to
+* @param tag Dispatch tag
+*/
+template<typename NumericT, typename SolverTagT>
+vector<NumericT> solve(const matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & proxy,
+ const vector_base<NumericT> & vec,
+ SolverTagT const & tag)
+{
+ // run an inplace solve on the result vector:
+ vector<NumericT> result(vec);
+ inplace_solve(proxy, result, tag);
+ return result;
+}
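+
+// A minimal usage sketch for the vector right-hand-side wrappers above
+// (hypothetical, pre-filled A and b):
+//   viennacl::vector<double> x = viennacl::linalg::solve(A, b, viennacl::linalg::upper_tag());
+//   viennacl::linalg::inplace_solve(viennacl::trans(A), x, viennacl::linalg::lower_tag());   // x <- A^{-T} * x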
+
+
+}
+}
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/eig.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/eig.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/eig.hpp
new file mode 100644
index 0000000..36be3b3
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/eig.hpp
@@ -0,0 +1,29 @@
+#ifndef VIENNACL_LINALG_EIG_HPP_
+#define VIENNACL_LINALG_EIG_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/eig.hpp
+* @brief Convenience header file including all available eigenvalue algorithms
+*/
+
+#include "viennacl/linalg/bisect.hpp"
+#include "viennacl/linalg/lanczos.hpp"
+#include "viennacl/linalg/power_iter.hpp"
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/fft_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/fft_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/fft_operations.hpp
new file mode 100644
index 0000000..ae9ade2
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/fft_operations.hpp
@@ -0,0 +1,481 @@
+#ifndef VIENNACL_LINALG_FFT_OPERATIONS_HPP_
+#define VIENNACL_LINALG_FFT_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+
+/** @file viennacl/linalg/fft_operations.hpp
+ @brief Implementations of the Fast Fourier Transform.
+ */
+
+#include <viennacl/vector.hpp>
+#include <viennacl/matrix.hpp>
+
+#include "viennacl/linalg/host_based/fft_operations.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+#include "viennacl/linalg/opencl/fft_operations.hpp"
+#include "viennacl/linalg/opencl/kernels/fft.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+#include "viennacl/linalg/cuda/fft_operations.hpp"
+#endif
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/**
+ * @brief Direct 1D algorithm for computing the Fourier transform.
+ *
+ * Works for any data size.
+ * The serial implementation has O(n^2) complexity.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void direct(viennacl::vector<NumericT, AlignmentV> const & in,
+ viennacl::vector<NumericT, AlignmentV> & out, vcl_size_t size, vcl_size_t stride,
+ vcl_size_t batch_num, NumericT sign = NumericT(-1),
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+
+ switch (viennacl::traits::handle(in).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::direct(in, out, size, stride, batch_num, sign, data_order);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::direct(viennacl::traits::opencl_handle(in), viennacl::traits::opencl_handle(out), size, stride, batch_num, sign,data_order);
+ break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::direct(in, out, size, stride, batch_num,sign,data_order);
+ break;
+#endif
+
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+
+ }
+}
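+
+// A minimal usage sketch (assumes interleaved complex storage, i.e. 2*size scalars
+// per signal, and a single contiguous batch so the stride equals the signal size;
+// the default sign = -1 gives the forward transform):
+//   vcl_size_t size = 256;                                // hypothetical number of complex samples
+//   viennacl::vector<float> in(2 * size), out(2 * size);
+//   /* ... fill 'in' with interleaved real/imaginary parts ... */
+//   viennacl::linalg::direct(in, out, size, size, 1);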
+
+/**
+ * @brief Direct 2D algorithm for computing the Fourier transform.
+ *
+ * Works for any data size.
+ * The serial implementation has O(n^2) complexity.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void direct(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & in,
+ viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& out, vcl_size_t size,
+ vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+
+ switch (viennacl::traits::handle(in).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::direct(in, out, size, stride, batch_num, sign, data_order);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::direct(viennacl::traits::opencl_handle(in), viennacl::traits::opencl_handle(out), size, stride, batch_num, sign,data_order);
+ break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::direct(in, out, size, stride, batch_num,sign,data_order);
+ break;
+#endif
+
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+
+ }
+}
+
+/*
+ * This function reorders the input data so that the indices follow bit-reversal order.
+ * Such reordering must be performed before an in-place FFT.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void reorder(viennacl::vector<NumericT, AlignmentV>& in, vcl_size_t size, vcl_size_t stride,
+ vcl_size_t bits_datasize, vcl_size_t batch_num,
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+ switch (viennacl::traits::handle(in).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::reorder(in, size, stride, bits_datasize, batch_num, data_order);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::reorder<NumericT>(viennacl::traits::opencl_handle(in), size, stride, bits_datasize, batch_num, data_order);
+ break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::reorder(in, size, stride, bits_datasize, batch_num, data_order);
+ break;
+#endif
+
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+
+ }
+}
+
+/**
+ * @brief Radix-2 2D algorithm for computing the Fourier transform.
+ *
+ * Works only on power-of-two data sizes.
+ * The serial implementation has O(n log n) complexity.
+ * This is a Cooley-Tukey algorithm.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void radix2(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & in, vcl_size_t size,
+ vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+ switch (viennacl::traits::handle(in).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::radix2(in, size, stride, batch_num, sign, data_order);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::radix2(viennacl::traits::opencl_handle(in), size, stride, batch_num, sign,data_order);
+ break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::radix2(in, size, stride, batch_num, sign, data_order);
+ break;
+#endif
+
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+/**
+ * @brief Radix-2 1D algorithm for computing the Fourier transform.
+ *
+ * Works only on power-of-two data sizes.
+ * The serial implementation has O(n log n) complexity.
+ * This is a Cooley-Tukey algorithm.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void radix2(viennacl::vector<NumericT, AlignmentV>& in, vcl_size_t size, vcl_size_t stride,
+ vcl_size_t batch_num, NumericT sign = NumericT(-1),
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+
+ switch (viennacl::traits::handle(in).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::radix2(in, size, stride, batch_num, sign, data_order);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::radix2(viennacl::traits::opencl_handle(in), size, stride, batch_num, sign,data_order);
+ break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::radix2(in, size, stride, batch_num, sign,data_order);
+ break;
+#endif
+
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+/**
+ * @brief Bluestein's algorithm for computing the Fourier transform.
+ *
+ * Currently works only for input sizes smaller than 2^16.
+ * Uses a lot of additional memory, but should be fast for any data size.
+ * The serial implementation has roughly O(n log n) complexity.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void bluestein(viennacl::vector<NumericT, AlignmentV> & in,
+ viennacl::vector<NumericT, AlignmentV> & out, vcl_size_t /*batch_num*/)
+{
+
+ switch (viennacl::traits::handle(in).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::bluestein(in, out, 1);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::bluestein(in, out, 1);
+ break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::bluestein(in, out, 1);
+ break;
+#endif
+
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+/**
+ * @brief Multiply two complex vectors and store the result in output
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void multiply_complex(viennacl::vector<NumericT, AlignmentV> const & input1,
+ viennacl::vector<NumericT, AlignmentV> const & input2,
+ viennacl::vector<NumericT, AlignmentV> & output)
+{
+ switch (viennacl::traits::handle(input1).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::multiply_complex(input1, input2, output);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::multiply_complex(input1, input2, output);
+ break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::multiply_complex(input1, input2, output);
+ break;
+#endif
+
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
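+
+// A minimal usage sketch (hypothetical vectors in interleaved complex format,
+// all holding n complex entries, i.e. 2*n scalars):
+//   vcl_size_t n = 128;
+//   viennacl::vector<float> a(2 * n), b(2 * n), c(2 * n);
+//   viennacl::linalg::multiply_complex(a, b, c);   // c[k] = a[k] * b[k] as complex numbers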
+
+/**
+ * @brief Normalize a vector by its own size
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void normalize(viennacl::vector<NumericT, AlignmentV> & input)
+{
+ switch (viennacl::traits::handle(input).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::normalize(input);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::normalize(input);
+ break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::normalize(input);
+ break;
+#endif
+
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+/**
+ * @brief In-place transpose of a matrix
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & input)
+{
+ switch (viennacl::traits::handle(input).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::transpose(input);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::transpose(input);
+ break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::transpose(input);
+ break;
+#endif
+
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+/**
+ * @brief Transpose a matrix into a separate output matrix
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & input,
+ viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & output)
+{
+ switch (viennacl::traits::handle(input).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::transpose(input, output);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::transpose(input, output);
+ break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::transpose(input, output);
+ break;
+#endif
+
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+/**
+ * @brief Create complex vector from real vector (even elements(2*k) = real part, odd elements(2*k+1) = imaginary part)
+ */
+template<typename NumericT>
+void real_to_complex(viennacl::vector_base<NumericT> const & in,
+ viennacl::vector_base<NumericT> & out, vcl_size_t size)
+{
+ switch (viennacl::traits::handle(in).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::real_to_complex(in, out, size);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::real_to_complex(in,out,size);
+ break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::real_to_complex(in,out,size);
+ break;
+#endif
+
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+/**
+ * @brief Create real vector from complex vector (even elements(2*k) = real part, odd elements(2*k+1) = imaginary part)
+ */
+template<typename NumericT>
+void complex_to_real(viennacl::vector_base<NumericT> const & in,
+ viennacl::vector_base<NumericT> & out, vcl_size_t size)
+{
+ switch (viennacl::traits::handle(in).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::complex_to_real(in, out, size);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::complex_to_real(in, out, size);
+ break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::complex_to_real(in, out, size);
+ break;
+#endif
+
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
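+
+// A minimal usage sketch for the two conversion helpers above ('size' is the
+// number of complex entries, so the interleaved vector holds 2*size scalars):
+//   vcl_size_t size = 64;                                        // hypothetical
+//   viennacl::vector<float> real_part(size), interleaved(2 * size);
+//   viennacl::linalg::real_to_complex(real_part, interleaved, size);   // imaginary parts become 0
+//   viennacl::linalg::complex_to_real(interleaved, real_part, size);   // keeps only the real parts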
+
+/**
+ * @brief Reverse the order of the vector entries in place
+ */
+template<typename NumericT>
+void reverse(viennacl::vector_base<NumericT> & in)
+{
+ switch (viennacl::traits::handle(in).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::reverse(in);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::reverse(in);
+ break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::reverse(in);
+ break;
+#endif
+
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+}
+}
+
+#endif /* FFT_OPERATIONS_HPP_ */
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_prod.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_prod.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_prod.hpp
new file mode 100644
index 0000000..24cb4a6
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_prod.hpp
@@ -0,0 +1,2887 @@
+#ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_PROD_HPP_
+#define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_PROD_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/matrix_operations_prod.hpp
+ @brief Dense matrix-matrix product CUDA kernels reside here.
+
+ Note: File created semi-automatically from OpenCL kernels.
+*/
+
+#include "viennacl/forwards.h"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
+// matrix-matrix multiplication C = A * B
+// matrix layouts: C...col_major, A...col_major, B...col_major
+template<typename NumericT>
+__global__ void matrix_matrix_col_col_col_prod_AA_kernel(
+ NumericT alpha,
+ const NumericT * A,
+ unsigned int A_row_start,
+ unsigned int A_col_start,
+ unsigned int A_row_inc,
+ unsigned int A_col_inc,
+ unsigned int A_row_size,
+ unsigned int A_col_size,
+ unsigned int A_internal_rows,
+ unsigned int A_internal_cols,
+ const NumericT * B,
+ unsigned int B_row_start,
+ unsigned int B_col_start,
+ unsigned int B_row_inc,
+ unsigned int B_col_inc,
+ unsigned int B_row_size,
+ unsigned int B_col_size,
+ unsigned int B_internal_rows,
+ unsigned int B_internal_cols,
+ NumericT beta,
+ NumericT * C,
+ unsigned int C_row_start,
+ unsigned int C_col_start,
+ unsigned int C_row_inc,
+ unsigned int C_col_inc,
+ unsigned int C_row_size,
+ unsigned int C_col_size,
+ unsigned int C_internal_rows,
+ unsigned int C_internal_cols)
+{
+
+ __shared__ NumericT bufA[272];
+ __shared__ NumericT bufB[272];
+
+ vcl_size_t block_size = 16;//get_local_size(0);
+ vcl_size_t row_block_id = blockIdx.x;
+ vcl_size_t col_block_id = blockIdx.y;
+ vcl_size_t row_thread_id = threadIdx.x;
+ vcl_size_t col_thread_id = threadIdx.y;
+ vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
+ vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
+ vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
+ vcl_size_t bStep = block_size * B_row_inc;
+ vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+ NumericT Csub = 0;
+ vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+ vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
+
+ vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+ vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+ for (vcl_size_t block = 0;
+ block < block_num;
+ ++block)
+ {
+ bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+ bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+ __syncthreads();
+ NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+ NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ __syncthreads();
+ aBegin += aStep;
+ bBegin += bStep;
+ }
+ if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+ C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+}
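+
+// Notes on the kernel above (the structurally identical variants below follow the
+// same scheme): each thread block is 16x16 and stages one 16x16 tile of A and of B
+// in shared memory; the tiles are padded to 16 x (16+1) = 272 entries (bufA/bufB)
+// so that strided accesses do not all hit the same shared-memory bank. The sixteen
+// unrolled "Csub += ..." statements accumulate the dot product over one tile, and
+// the final write realises C = alpha * A * B + beta * C, with the beta == 0 case
+// short-circuited so that uninitialised entries of C are never read.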
+
+// matrix-matrix multiplication C = A * B^T
+// matrix layouts: C...col_major, A...col_major, B...col_major
+template<typename NumericT>
+__global__ void matrix_matrix_col_col_col_prod_AT_kernel(
+ NumericT alpha,
+ const NumericT * A,
+ unsigned int A_row_start,
+ unsigned int A_col_start,
+ unsigned int A_row_inc,
+ unsigned int A_col_inc,
+ unsigned int A_row_size,
+ unsigned int A_col_size,
+ unsigned int A_internal_rows,
+ unsigned int A_internal_cols,
+ const NumericT * B,
+ unsigned int B_row_start,
+ unsigned int B_col_start,
+ unsigned int B_row_inc,
+ unsigned int B_col_inc,
+ unsigned int B_row_size,
+ unsigned int B_col_size,
+ unsigned int B_internal_rows,
+ unsigned int B_internal_cols,
+ NumericT beta,
+ NumericT * C,
+ unsigned int C_row_start,
+ unsigned int C_col_start,
+ unsigned int C_row_inc,
+ unsigned int C_col_inc,
+ unsigned int C_row_size,
+ unsigned int C_col_size,
+ unsigned int C_internal_rows,
+ unsigned int C_internal_cols)
+{
+
+ __shared__ NumericT bufA[272];
+ __shared__ NumericT bufB[272];
+
+ vcl_size_t block_size = 16;//get_local_size(0);
+ vcl_size_t row_block_id = blockIdx.x;
+ vcl_size_t col_block_id = blockIdx.y;
+ vcl_size_t row_thread_id = threadIdx.x;
+ vcl_size_t col_thread_id = threadIdx.y;
+ vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
+ vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
+ vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
+ vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
+ vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+ NumericT Csub = 0;
+ vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+ vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
+
+ vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+ vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+ for (vcl_size_t block = 0;
+ block < block_num;
+ ++block)
+ {
+ bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+ bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+ __syncthreads();
+ NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+ NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ __syncthreads();
+ aBegin += aStep;
+ bBegin += bStep;
+ }
+ if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+ C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+}
+
+// matrix-matrix multiplication C = A^T * B
+// matrix layouts: C...col_major, A...col_major, B...col_major
+template<typename NumericT>
+__global__ void matrix_matrix_col_col_col_prod_TA_kernel(
+ NumericT alpha,
+ const NumericT * A,
+ unsigned int A_row_start,
+ unsigned int A_col_start,
+ unsigned int A_row_inc,
+ unsigned int A_col_inc,
+ unsigned int A_row_size,
+ unsigned int A_col_size,
+ unsigned int A_internal_rows,
+ unsigned int A_internal_cols,
+ const NumericT * B,
+ unsigned int B_row_start,
+ unsigned int B_col_start,
+ unsigned int B_row_inc,
+ unsigned int B_col_inc,
+ unsigned int B_row_size,
+ unsigned int B_col_size,
+ unsigned int B_internal_rows,
+ unsigned int B_internal_cols,
+ NumericT beta,
+ NumericT * C,
+ unsigned int C_row_start,
+ unsigned int C_col_start,
+ unsigned int C_row_inc,
+ unsigned int C_col_inc,
+ unsigned int C_row_size,
+ unsigned int C_col_size,
+ unsigned int C_internal_rows,
+ unsigned int C_internal_cols)
+{
+
+ __shared__ NumericT bufA[272];
+ __shared__ NumericT bufB[272];
+
+ vcl_size_t block_size = 16;//get_local_size(0);
+ vcl_size_t row_block_id = blockIdx.x;
+ vcl_size_t col_block_id = blockIdx.y;
+ vcl_size_t row_thread_id = threadIdx.x;
+ vcl_size_t col_thread_id = threadIdx.y;
+ vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
+ vcl_size_t aStep = block_size * A_row_inc;
+ vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
+ vcl_size_t bStep = block_size * B_row_inc;
+ vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+ NumericT Csub = 0;
+ vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+ vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
+
+ vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+ vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+ for (vcl_size_t block = 0;
+ block < block_num;
+ ++block)
+ {
+ bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+ bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+ __syncthreads();
+ NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+ NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ __syncthreads();
+ aBegin += aStep;
+ bBegin += bStep;
+ }
+ if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+ C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+}
+
+// matrix-matrix multiplication C = A^T * B^T
+// matrix layouts: C...col_major, A...col_major, B...col_major
+template<typename NumericT>
+__global__ void matrix_matrix_col_col_col_prod_TT_kernel(
+ NumericT alpha,
+ const NumericT * A,
+ unsigned int A_row_start,
+ unsigned int A_col_start,
+ unsigned int A_row_inc,
+ unsigned int A_col_inc,
+ unsigned int A_row_size,
+ unsigned int A_col_size,
+ unsigned int A_internal_rows,
+ unsigned int A_internal_cols,
+ const NumericT * B,
+ unsigned int B_row_start,
+ unsigned int B_col_start,
+ unsigned int B_row_inc,
+ unsigned int B_col_inc,
+ unsigned int B_row_size,
+ unsigned int B_col_size,
+ unsigned int B_internal_rows,
+ unsigned int B_internal_cols,
+ NumericT beta,
+ NumericT * C,
+ unsigned int C_row_start,
+ unsigned int C_col_start,
+ unsigned int C_row_inc,
+ unsigned int C_col_inc,
+ unsigned int C_row_size,
+ unsigned int C_col_size,
+ unsigned int C_internal_rows,
+ unsigned int C_internal_cols)
+{
+
+ __shared__ NumericT bufA[272];
+ __shared__ NumericT bufB[272];
+
+ vcl_size_t block_size = 16;//get_local_size(0);
+ vcl_size_t row_block_id = blockIdx.x;
+ vcl_size_t col_block_id = blockIdx.y;
+ vcl_size_t row_thread_id = threadIdx.x;
+ vcl_size_t col_thread_id = threadIdx.y;
+ vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
+ vcl_size_t aStep = block_size * A_row_inc;
+ vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
+ vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
+ vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+ NumericT Csub = 0;
+ vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+ vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
+
+ vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+ vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+ for (vcl_size_t block = 0;
+ block < block_num;
+ ++block)
+ {
+ bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+ bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+ __syncthreads();
+ NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+ NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ __syncthreads();
+ aBegin += aStep;
+ bBegin += bStep;
+ }
+ if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+ C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+}
+
+
+
+////////////////////////////////////////////////////////////////////////////
+
+
+
+
+// matrix-matrix multiplication C = A * B
+// matrix layouts: C...row_major, A...col_major, B...col_major
+template<typename NumericT>
+__global__ void matrix_matrix_row_col_col_prod_AA_kernel(
+ NumericT alpha,
+ const NumericT * A,
+ unsigned int A_row_start,
+ unsigned int A_col_start,
+ unsigned int A_row_inc,
+ unsigned int A_col_inc,
+ unsigned int A_row_size,
+ unsigned int A_col_size,
+ unsigned int A_internal_rows,
+ unsigned int A_internal_cols,
+ const NumericT * B,
+ unsigned int B_row_start,
+ unsigned int B_col_start,
+ unsigned int B_row_inc,
+ unsigned int B_col_inc,
+ unsigned int B_row_size,
+ unsigned int B_col_size,
+ unsigned int B_internal_rows,
+ unsigned int B_internal_cols,
+ NumericT beta,
+ NumericT * C,
+ unsigned int C_row_start,
+ unsigned int C_col_start,
+ unsigned int C_row_inc,
+ unsigned int C_col_inc,
+ unsigned int C_row_size,
+ unsigned int C_col_size,
+ unsigned int C_internal_rows,
+ unsigned int C_internal_cols)
+{
+
+ __shared__ NumericT bufA[272];
+ __shared__ NumericT bufB[272];
+
+ vcl_size_t block_size = 16;//get_local_size(0);
+ vcl_size_t row_block_id = blockIdx.x;
+ vcl_size_t col_block_id = blockIdx.y;
+ vcl_size_t row_thread_id = threadIdx.x;
+ vcl_size_t col_thread_id = threadIdx.y;
+ vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
+ vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
+ vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
+ vcl_size_t bStep = block_size * B_row_inc;
+ vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+ NumericT Csub = 0;
+ vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+ vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
+
+ vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+ vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+ for (vcl_size_t block = 0;
+ block < block_num;
+ ++block)
+ {
+ bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+ bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+ __syncthreads();
+ NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+ NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ __syncthreads();
+ aBegin += aStep;
+ bBegin += bStep;
+ }
+ if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+ C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+}
+
+// matrix-matrix multiplication C = A * B^T
+// matrix layouts: C...row_major, A...col_major, B...col_major
+template<typename NumericT>
+__global__ void matrix_matrix_row_col_col_prod_AT_kernel(
+ NumericT alpha,
+ const NumericT * A,
+ unsigned int A_row_start,
+ unsigned int A_col_start,
+ unsigned int A_row_inc,
+ unsigned int A_col_inc,
+ unsigned int A_row_size,
+ unsigned int A_col_size,
+ unsigned int A_internal_rows,
+ unsigned int A_internal_cols,
+ const NumericT * B,
+ unsigned int B_row_start,
+ unsigned int B_col_start,
+ unsigned int B_row_inc,
+ unsigned int B_col_inc,
+ unsigned int B_row_size,
+ unsigned int B_col_size,
+ unsigned int B_internal_rows,
+ unsigned int B_internal_cols,
+ NumericT beta,
+ NumericT * C,
+ unsigned int C_row_start,
+ unsigned int C_col_start,
+ unsigned int C_row_inc,
+ unsigned int C_col_inc,
+ unsigned int C_row_size,
+ unsigned int C_col_size,
+ unsigned int C_internal_rows,
+ unsigned int C_internal_cols)
+{
+
+ __shared__ NumericT bufA[272];
+ __shared__ NumericT bufB[272];
+
+ vcl_size_t block_size = 16;//get_local_size(0);
+ vcl_size_t row_block_id = blockIdx.x;
+ vcl_size_t col_block_id = blockIdx.y;
+ vcl_size_t row_thread_id = threadIdx.x;
+ vcl_size_t col_thread_id = threadIdx.y;
+ vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
+ vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
+ vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
+ vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
+ vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+ NumericT Csub = 0;
+ vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+ vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
+
+ vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+ vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+ for (vcl_size_t block = 0;
+ block < block_num;
+ ++block)
+ {
+ bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+ bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+ __syncthreads();
+ NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+ NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ __syncthreads();
+ aBegin += aStep;
+ bBegin += bStep;
+ }
+ if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+ C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+}
+
+// matrix-matrix multiplication C = A^T * B
+// matrix layouts: C...row_major, A...col_major, B...col_major
+template<typename NumericT>
+__global__ void matrix_matrix_row_col_col_prod_TA_kernel(
+ NumericT alpha,
+ const NumericT * A,
+ unsigned int A_row_start,
+ unsigned int A_col_start,
+ unsigned int A_row_inc,
+ unsigned int A_col_inc,
+ unsigned int A_row_size,
+ unsigned int A_col_size,
+ unsigned int A_internal_rows,
+ unsigned int A_internal_cols,
+ const NumericT * B,
+ unsigned int B_row_start,
+ unsigned int B_col_start,
+ unsigned int B_row_inc,
+ unsigned int B_col_inc,
+ unsigned int B_row_size,
+ unsigned int B_col_size,
+ unsigned int B_internal_rows,
+ unsigned int B_internal_cols,
+ NumericT beta,
+ NumericT * C,
+ unsigned int C_row_start,
+ unsigned int C_col_start,
+ unsigned int C_row_inc,
+ unsigned int C_col_inc,
+ unsigned int C_row_size,
+ unsigned int C_col_size,
+ unsigned int C_internal_rows,
+ unsigned int C_internal_cols)
+{
+
+ __shared__ NumericT bufA[272];
+ __shared__ NumericT bufB[272];
+
+ vcl_size_t block_size = 16;//get_local_size(0);
+ vcl_size_t row_block_id = blockIdx.x;
+ vcl_size_t col_block_id = blockIdx.y;
+ vcl_size_t row_thread_id = threadIdx.x;
+ vcl_size_t col_thread_id = threadIdx.y;
+ vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
+ vcl_size_t aStep = block_size * A_row_inc;
+ vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
+ vcl_size_t bStep = block_size * B_row_inc;
+ vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+ NumericT Csub = 0;
+ vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+ vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
+
+ vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+ vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+ for (vcl_size_t block = 0;
+ block < block_num;
+ ++block)
+ {
+ bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+ bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+ __syncthreads();
+ NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+ NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ __syncthreads();
+ aBegin += aStep;
+ bBegin += bStep;
+ }
+ if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+ C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+}
+
+// matrix-matrix multiplication C = A^T * B^T
+// matrix layouts: C...row_major, A...col_major, B...col_major
+template<typename NumericT>
+__global__ void matrix_matrix_row_col_col_prod_TT_kernel(
+ NumericT alpha,
+ const NumericT * A,
+ unsigned int A_row_start,
+ unsigned int A_col_start,
+ unsigned int A_row_inc,
+ unsigned int A_col_inc,
+ unsigned int A_row_size,
+ unsigned int A_col_size,
+ unsigned int A_internal_rows,
+ unsigned int A_internal_cols,
+ const NumericT * B,
+ unsigned int B_row_start,
+ unsigned int B_col_start,
+ unsigned int B_row_inc,
+ unsigned int B_col_inc,
+ unsigned int B_row_size,
+ unsigned int B_col_size,
+ unsigned int B_internal_rows,
+ unsigned int B_internal_cols,
+ NumericT beta,
+ NumericT * C,
+ unsigned int C_row_start,
+ unsigned int C_col_start,
+ unsigned int C_row_inc,
+ unsigned int C_col_inc,
+ unsigned int C_row_size,
+ unsigned int C_col_size,
+ unsigned int C_internal_rows,
+ unsigned int C_internal_cols)
+{
+
+ __shared__ NumericT bufA[272];
+ __shared__ NumericT bufB[272];
+
+ vcl_size_t block_size = 16;//get_local_size(0);
+ vcl_size_t row_block_id = blockIdx.x;
+ vcl_size_t col_block_id = blockIdx.y;
+ vcl_size_t row_thread_id = threadIdx.x;
+ vcl_size_t col_thread_id = threadIdx.y;
+ vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
+ vcl_size_t aStep = block_size * A_row_inc;
+ vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
+ vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
+ vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+ NumericT Csub = 0;
+ vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+ vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
+
+ vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+ vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+ for (vcl_size_t block = 0;
+ block < block_num;
+ ++block)
+ {
+ bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+ bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+ __syncthreads();
+ NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+ NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ __syncthreads();
+ aBegin += aStep;
+ bBegin += bStep;
+ }
+ if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+ C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+}
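All of the product kernels in this file share the same tiled structure: each 16 x 16 thread block stages one 16 x 16 tile of A and one of B in the shared arrays bufA and bufB (declared with 272 = 16 * 17 entries, the extra column of padding avoiding shared-memory bank conflicts), and the sixteen repeated "Csub += ..." statements are a hand-unrolled dot product over that tile. The sketch below is only a simplified illustration of that pattern for a plain row-major C = A * B with square size N divisible by the tile size and no offsets or strides; it is not one of the ViennaCL kernels, and the names tiled_gemm_sketch, TILE and N are made up for the example.

// Hedged illustration only -- not one of the ViennaCL kernels above. A stripped-down
// tiled GEMM computing C = A * B for square, row-major, N x N matrices with N a
// multiple of TILE and no offsets or strides. TILE and the padded leading dimension
// TILE + 1 mirror block_size = 16 and the 272-entry (16 * 17) buffers used above to
// avoid shared-memory bank conflicts.
static const unsigned int TILE = 16;

template<typename NumericT>
__global__ void tiled_gemm_sketch(const NumericT * A, const NumericT * B, NumericT * C,
                                  unsigned int N)
{
  __shared__ NumericT tileA[TILE][TILE + 1];   // +1 column of padding, as in bufA[272]
  __shared__ NumericT tileB[TILE][TILE + 1];

  unsigned int row = blockIdx.x * TILE + threadIdx.x;   // row of C handled by this thread
  unsigned int col = blockIdx.y * TILE + threadIdx.y;   // column of C handled by this thread
  NumericT sum = 0;

  for (unsigned int t = 0; t < N / TILE; ++t)
  {
    tileA[threadIdx.x][threadIdx.y] = A[row * N + t * TILE + threadIdx.y];
    tileB[threadIdx.x][threadIdx.y] = B[(t * TILE + threadIdx.x) * N + col];
    __syncthreads();
    for (unsigned int k = 0; k < TILE; ++k)    // the kernels above unroll this loop by hand
      sum += tileA[threadIdx.x][k] * tileB[k][threadIdx.y];
    __syncthreads();
  }
  C[row * N + col] = sum;
}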
+
+
+
+
+////////////////////////////////////////////////////////////////////////////
+
+
+
+
+// matrix-matrix multiplication C = A * B
+// matrix layouts: C...col_major, A...col_major, B...row_major
+template<typename NumericT>
+__global__ void matrix_matrix_col_col_row_prod_AA_kernel(
+ NumericT alpha,
+ const NumericT * A,
+ unsigned int A_row_start,
+ unsigned int A_col_start,
+ unsigned int A_row_inc,
+ unsigned int A_col_inc,
+ unsigned int A_row_size,
+ unsigned int A_col_size,
+ unsigned int A_internal_rows,
+ unsigned int A_internal_cols,
+ const NumericT * B,
+ unsigned int B_row_start,
+ unsigned int B_col_start,
+ unsigned int B_row_inc,
+ unsigned int B_col_inc,
+ unsigned int B_row_size,
+ unsigned int B_col_size,
+ unsigned int B_internal_rows,
+ unsigned int B_internal_cols,
+ NumericT beta,
+ NumericT * C,
+ unsigned int C_row_start,
+ unsigned int C_col_start,
+ unsigned int C_row_inc,
+ unsigned int C_col_inc,
+ unsigned int C_row_size,
+ unsigned int C_col_size,
+ unsigned int C_internal_rows,
+ unsigned int C_internal_cols)
+{
+
+ __shared__ NumericT bufA[272];
+ __shared__ NumericT bufB[272];
+
+ vcl_size_t block_size = 16;//get_local_size(0);
+ vcl_size_t row_block_id = blockIdx.x;
+ vcl_size_t col_block_id = blockIdx.y;
+ vcl_size_t row_thread_id = threadIdx.x;
+ vcl_size_t col_thread_id = threadIdx.y;
+ vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
+ vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
+ vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
+ vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
+ vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+ NumericT Csub = 0;
+ vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+ vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+ vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+ vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+ for (vcl_size_t block = 0;
+ block < block_num;
+ ++block)
+ {
+ bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+ bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+ __syncthreads();
+ NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+ NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ __syncthreads();
+ aBegin += aStep;
+ bBegin += bStep;
+ }
+ if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+ C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+}
+
+// matrix-matrix multiplication C = A * B^T
+// matrix layouts: C...col_major, A...col_major, B...row_major
+template<typename NumericT>
+__global__ void matrix_matrix_col_col_row_prod_AT_kernel(
+ NumericT alpha,
+ const NumericT * A,
+ unsigned int A_row_start,
+ unsigned int A_col_start,
+ unsigned int A_row_inc,
+ unsigned int A_col_inc,
+ unsigned int A_row_size,
+ unsigned int A_col_size,
+ unsigned int A_internal_rows,
+ unsigned int A_internal_cols,
+ const NumericT * B,
+ unsigned int B_row_start,
+ unsigned int B_col_start,
+ unsigned int B_row_inc,
+ unsigned int B_col_inc,
+ unsigned int B_row_size,
+ unsigned int B_col_size,
+ unsigned int B_internal_rows,
+ unsigned int B_internal_cols,
+ NumericT beta,
+ NumericT * C,
+ unsigned int C_row_start,
+ unsigned int C_col_start,
+ unsigned int C_row_inc,
+ unsigned int C_col_inc,
+ unsigned int C_row_size,
+ unsigned int C_col_size,
+ unsigned int C_internal_rows,
+ unsigned int C_internal_cols)
+{
+
+ __shared__ NumericT bufA[272];
+ __shared__ NumericT bufB[272];
+
+ vcl_size_t block_size = 16;//get_local_size(0);
+ vcl_size_t row_block_id = blockIdx.x;
+ vcl_size_t col_block_id = blockIdx.y;
+ vcl_size_t row_thread_id = threadIdx.x;
+ vcl_size_t col_thread_id = threadIdx.y;
+ vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
+ vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
+ vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
+ vcl_size_t bStep = block_size * B_col_inc;
+ vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+ NumericT Csub = 0;
+ vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+ vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+ vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+ vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+ for (vcl_size_t block = 0;
+ block < block_num;
+ ++block)
+ {
+ bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+ bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+ __syncthreads();
+ NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+ NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ __syncthreads();
+ aBegin += aStep;
+ bBegin += bStep;
+ }
+ if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+ C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+}
+
+// matrix-matrix multiplication C = A^T * B
+// matrix layouts: C...col_major, A...col_major, B...row_major
+template<typename NumericT>
+__global__ void matrix_matrix_col_col_row_prod_TA_kernel(
+ NumericT alpha,
+ const NumericT * A,
+ unsigned int A_row_start,
+ unsigned int A_col_start,
+ unsigned int A_row_inc,
+ unsigned int A_col_inc,
+ unsigned int A_row_size,
+ unsigned int A_col_size,
+ unsigned int A_internal_rows,
+ unsigned int A_internal_cols,
+ const NumericT * B,
+ unsigned int B_row_start,
+ unsigned int B_col_start,
+ unsigned int B_row_inc,
+ unsigned int B_col_inc,
+ unsigned int B_row_size,
+ unsigned int B_col_size,
+ unsigned int B_internal_rows,
+ unsigned int B_internal_cols,
+ NumericT beta,
+ NumericT * C,
+ unsigned int C_row_start,
+ unsigned int C_col_start,
+ unsigned int C_row_inc,
+ unsigned int C_col_inc,
+ unsigned int C_row_size,
+ unsigned int C_col_size,
+ unsigned int C_internal_rows,
+ unsigned int C_internal_cols)
+{
+
+ __shared__ NumericT bufA[272];
+ __shared__ NumericT bufB[272];
+
+ vcl_size_t block_size = 16;//get_local_size(0);
+ vcl_size_t row_block_id = blockIdx.x;
+ vcl_size_t col_block_id = blockIdx.y;
+ vcl_size_t row_thread_id = threadIdx.x;
+ vcl_size_t col_thread_id = threadIdx.y;
+ vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
+ vcl_size_t aStep = block_size * A_row_inc;
+ vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
+ vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
+ vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+ NumericT Csub = 0;
+ vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+ vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+ vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+ vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+ for (vcl_size_t block = 0;
+ block < block_num;
+ ++block)
+ {
+ bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+ bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+ __syncthreads();
+ NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+ NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ __syncthreads();
+ aBegin += aStep;
+ bBegin += bStep;
+ }
+ if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+ C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+}
+
+// matrix-matrix multiplication C = A^T * B^T
+// matrix layouts: C...col_major, A...col_major, B...row_major
+template<typename NumericT>
+__global__ void matrix_matrix_col_col_row_prod_TT_kernel(
+ NumericT alpha,
+ const NumericT * A,
+ unsigned int A_row_start,
+ unsigned int A_col_start,
+ unsigned int A_row_inc,
+ unsigned int A_col_inc,
+ unsigned int A_row_size,
+ unsigned int A_col_size,
+ unsigned int A_internal_rows,
+ unsigned int A_internal_cols,
+ const NumericT * B,
+ unsigned int B_row_start,
+ unsigned int B_col_start,
+ unsigned int B_row_inc,
+ unsigned int B_col_inc,
+ unsigned int B_row_size,
+ unsigned int B_col_size,
+ unsigned int B_internal_rows,
+ unsigned int B_internal_cols,
+ NumericT beta,
+ NumericT * C,
+ unsigned int C_row_start,
+ unsigned int C_col_start,
+ unsigned int C_row_inc,
+ unsigned int C_col_inc,
+ unsigned int C_row_size,
+ unsigned int C_col_size,
+ unsigned int C_internal_rows,
+ unsigned int C_internal_cols)
+{
+
+ __shared__ NumericT bufA[272];
+ __shared__ NumericT bufB[272];
+
+ vcl_size_t block_size = 16;//get_local_size(0);
+ vcl_size_t row_block_id = blockIdx.x;
+ vcl_size_t col_block_id = blockIdx.y;
+ vcl_size_t row_thread_id = threadIdx.x;
+ vcl_size_t col_thread_id = threadIdx.y;
+ vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
+ vcl_size_t aStep = block_size * A_row_inc;
+ vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
+ vcl_size_t bStep = block_size * B_col_inc;
+ vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+ NumericT Csub = 0;
+ vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+ vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+ vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+ vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+ for (vcl_size_t block = 0;
+ block < block_num;
+ ++block)
+ {
+ bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+ bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+ __syncthreads();
+ NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+ NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ __syncthreads();
+ aBegin += aStep;
+ bBegin += bStep;
+ }
+ if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+ C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+}
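Each of these kernels writes C as alpha * op(A) * op(B) + beta * C, with the (beta == 0) branch skipping the read of C so an uninitialized result matrix is never touched. The launch configuration is implied rather than passed in: block_size is hard-coded to 16, so the host must launch 16 x 16 thread blocks with one block per 16 x 16 tile of the result. The fragment below is a hedged sketch of such a direct launch for the C = A * B kernel of the col/col/row family above; ViennaCL's real dispatch lives in viennacl/linalg/cuda/matrix_operations.hpp and derives every start/increment/size argument from its matrix objects, while the 128 x 128 sizes, unit strides and zero offsets here are made-up example values.

// Hedged sketch, not ViennaCL's actual dispatch code. d_A, d_B, d_C are assumed to be
// float device buffers holding 128 x 128 matrices with no padding, offsets or strides.
void launch_full_gemm_example(const float * d_A, const float * d_B, float * d_C)
{
  dim3 threads(16, 16);             // must match block_size = 16 hard-coded in the kernels
  dim3 grid(128 / 16, 128 / 16);    // one thread block per 16 x 16 tile of the result

  matrix_matrix_col_col_row_prod_AA_kernel<<<grid, threads>>>(
      1.0f,                         // alpha
      d_A, 0u, 0u, 1u, 1u,          // A: row/col start and row/col increments
      128u, 128u, 128u, 128u,       // A: logical sizes and internal (padded) sizes
      d_B, 0u, 0u, 1u, 1u,
      128u, 128u, 128u, 128u,
      0.0f,                         // beta == 0: C is only written, never read
      d_C, 0u, 0u, 1u, 1u,
      128u, 128u, 128u, 128u);
}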
+
+
+
+////////////////////////////////////////////////////////////////////////////
+
+
+
+
+// matrix-matrix multiplication C = A * B
+// matrix layouts: C...row_major, A...col_major, B...row_major
+template<typename NumericT>
+__global__ void matrix_matrix_row_col_row_prod_AA_kernel(
+ NumericT alpha,
+ const NumericT * A,
+ unsigned int A_row_start,
+ unsigned int A_col_start,
+ unsigned int A_row_inc,
+ unsigned int A_col_inc,
+ unsigned int A_row_size,
+ unsigned int A_col_size,
+ unsigned int A_internal_rows,
+ unsigned int A_internal_cols,
+ const NumericT * B,
+ unsigned int B_row_start,
+ unsigned int B_col_start,
+ unsigned int B_row_inc,
+ unsigned int B_col_inc,
+ unsigned int B_row_size,
+ unsigned int B_col_size,
+ unsigned int B_internal_rows,
+ unsigned int B_internal_cols,
+ NumericT beta,
+ NumericT * C,
+ unsigned int C_row_start,
+ unsigned int C_col_start,
+ unsigned int C_row_inc,
+ unsigned int C_col_inc,
+ unsigned int C_row_size,
+ unsigned int C_col_size,
+ unsigned int C_internal_rows,
+ unsigned int C_internal_cols)
+{
+
+ __shared__ NumericT bufA[272];
+ __shared__ NumericT bufB[272];
+
+ vcl_size_t block_size = 16;//get_local_size(0);
+ vcl_size_t row_block_id = blockIdx.x;
+ vcl_size_t col_block_id = blockIdx.y;
+ vcl_size_t row_thread_id = threadIdx.x;
+ vcl_size_t col_thread_id = threadIdx.y;
+ vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
+ vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
+ vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
+ vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
+ vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+ NumericT Csub = 0;
+ vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+ vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+ vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+ vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+ for (vcl_size_t block = 0;
+ block < block_num;
+ ++block)
+ {
+ bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+ bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+ __syncthreads();
+ NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+ NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ __syncthreads();
+ aBegin += aStep;
+ bBegin += bStep;
+ }
+ if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+ C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+}
+
+// matrix-matrix multiplication C = A * B^T
+// matrix layouts: C...row_major, A...col_major, B...row_major
+template<typename NumericT>
+__global__ void matrix_matrix_row_col_row_prod_AT_kernel(
+ NumericT alpha,
+ const NumericT * A,
+ unsigned int A_row_start,
+ unsigned int A_col_start,
+ unsigned int A_row_inc,
+ unsigned int A_col_inc,
+ unsigned int A_row_size,
+ unsigned int A_col_size,
+ unsigned int A_internal_rows,
+ unsigned int A_internal_cols,
+ const NumericT * B,
+ unsigned int B_row_start,
+ unsigned int B_col_start,
+ unsigned int B_row_inc,
+ unsigned int B_col_inc,
+ unsigned int B_row_size,
+ unsigned int B_col_size,
+ unsigned int B_internal_rows,
+ unsigned int B_internal_cols,
+ NumericT beta,
+ NumericT * C,
+ unsigned int C_row_start,
+ unsigned int C_col_start,
+ unsigned int C_row_inc,
+ unsigned int C_col_inc,
+ unsigned int C_row_size,
+ unsigned int C_col_size,
+ unsigned int C_internal_rows,
+ unsigned int C_internal_cols)
+{
+
+ __shared__ NumericT bufA[272];
+ __shared__ NumericT bufB[272];
+
+ vcl_size_t block_size = 16;//get_local_size(0);
+ vcl_size_t row_block_id = blockIdx.x;
+ vcl_size_t col_block_id = blockIdx.y;
+ vcl_size_t row_thread_id = threadIdx.x;
+ vcl_size_t col_thread_id = threadIdx.y;
+ vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
+ vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
+ vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
+ vcl_size_t bStep = block_size * B_col_inc;
+ vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+ NumericT Csub = 0;
+ vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+ vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+ vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+ vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+ for (vcl_size_t block = 0;
+ block < block_num;
+ ++block)
+ {
+ bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+ bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+ __syncthreads();
+ NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+ NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ __syncthreads();
+ aBegin += aStep;
+ bBegin += bStep;
+ }
+ if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+ C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+}
+
+// matrix-matrix multiplication C = A^T * B
+// matrix layouts: C...row_major, A...col_major, B...row_major
+template<typename NumericT>
+__global__ void matrix_matrix_row_col_row_prod_TA_kernel(
+ NumericT alpha,
+ const NumericT * A,
+ unsigned int A_row_start,
+ unsigned int A_col_start,
+ unsigned int A_row_inc,
+ unsigned int A_col_inc,
+ unsigned int A_row_size,
+ unsigned int A_col_size,
+ unsigned int A_internal_rows,
+ unsigned int A_internal_cols,
+ const NumericT * B,
+ unsigned int B_row_start,
+ unsigned int B_col_start,
+ unsigned int B_row_inc,
+ unsigned int B_col_inc,
+ unsigned int B_row_size,
+ unsigned int B_col_size,
+ unsigned int B_internal_rows,
+ unsigned int B_internal_cols,
+ NumericT beta,
+ NumericT * C,
+ unsigned int C_row_start,
+ unsigned int C_col_start,
+ unsigned int C_row_inc,
+ unsigned int C_col_inc,
+ unsigned int C_row_size,
+ unsigned int C_col_size,
+ unsigned int C_internal_rows,
+ unsigned int C_internal_cols)
+{
+
+ __shared__ NumericT bufA[272];
+ __shared__ NumericT bufB[272];
+
+ vcl_size_t block_size = 16;//get_local_size(0);
+ vcl_size_t row_block_id = blockIdx.x;
+ vcl_size_t col_block_id = blockIdx.y;
+ vcl_size_t row_thread_id = threadIdx.x;
+ vcl_size_t col_thread_id = threadIdx.y;
+ vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
+ vcl_size_t aStep = block_size * A_row_inc;
+ vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
+ vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
+ vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+ NumericT Csub = 0;
+ vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+ vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+ vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+ vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+ for (vcl_size_t block = 0;
+ block < block_num;
+ ++block)
+ {
+ bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+ bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+ __syncthreads();
+ NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+ NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ __syncthreads();
+ aBegin += aStep;
+ bBegin += bStep;
+ }
+ if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+ C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+}
+
+// matrix-matrix multiplication C = A^T * B^T
+// matrix layouts: C...row_major, A...col_major, B...row_major
+template<typename NumericT>
+__global__ void matrix_matrix_row_col_row_prod_TT_kernel(
+ NumericT alpha,
+ const NumericT * A,
+ unsigned int A_row_start,
+ unsigned int A_col_start,
+ unsigned int A_row_inc,
+ unsigned int A_col_inc,
+ unsigned int A_row_size,
+ unsigned int A_col_size,
+ unsigned int A_internal_rows,
+ unsigned int A_internal_cols,
+ const NumericT * B,
+ unsigned int B_row_start,
+ unsigned int B_col_start,
+ unsigned int B_row_inc,
+ unsigned int B_col_inc,
+ unsigned int B_row_size,
+ unsigned int B_col_size,
+ unsigned int B_internal_rows,
+ unsigned int B_internal_cols,
+ NumericT beta,
+ NumericT * C,
+ unsigned int C_row_start,
+ unsigned int C_col_start,
+ unsigned int C_row_inc,
+ unsigned int C_col_inc,
+ unsigned int C_row_size,
+ unsigned int C_col_size,
+ unsigned int C_internal_rows,
+ unsigned int C_internal_cols)
+{
+
+ __shared__ NumericT bufA[272];
+ __shared__ NumericT bufB[272];
+
+ vcl_size_t block_size = 16;//get_local_size(0);
+ vcl_size_t row_block_id = blockIdx.x;
+ vcl_size_t col_block_id = blockIdx.y;
+ vcl_size_t row_thread_id = threadIdx.x;
+ vcl_size_t col_thread_id = threadIdx.y;
+ vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
+ vcl_size_t aStep = block_size * A_row_inc;
+ vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
+ vcl_size_t bStep = block_size * B_col_inc;
+ vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+ NumericT Csub = 0;
+ vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
+ vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
+
+ vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+ vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+ for (vcl_size_t block = 0;
+ block < block_num;
+ ++block)
+ {
+ bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+ bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+ __syncthreads();
+ NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+ NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ __syncthreads();
+ aBegin += aStep;
+ bBegin += bStep;
+ }
+ if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+ C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
+}
+
+
+
+
+
+////////////////////////////////////////////////////////////////////////////
+
+
+
+
+
+
+// matrix-matrix multiplication C = A * B
+// matrix layouts: C...col_major, A...row_major, B...col_major
+template<typename NumericT>
+__global__ void matrix_matrix_col_row_col_prod_AA_kernel(
+ NumericT alpha,
+ const NumericT * A,
+ unsigned int A_row_start,
+ unsigned int A_col_start,
+ unsigned int A_row_inc,
+ unsigned int A_col_inc,
+ unsigned int A_row_size,
+ unsigned int A_col_size,
+ unsigned int A_internal_rows,
+ unsigned int A_internal_cols,
+ const NumericT * B,
+ unsigned int B_row_start,
+ unsigned int B_col_start,
+ unsigned int B_row_inc,
+ unsigned int B_col_inc,
+ unsigned int B_row_size,
+ unsigned int B_col_size,
+ unsigned int B_internal_rows,
+ unsigned int B_internal_cols,
+ NumericT beta,
+ NumericT * C,
+ unsigned int C_row_start,
+ unsigned int C_col_start,
+ unsigned int C_row_inc,
+ unsigned int C_col_inc,
+ unsigned int C_row_size,
+ unsigned int C_col_size,
+ unsigned int C_internal_rows,
+ unsigned int C_internal_cols)
+{
+
+ __shared__ NumericT bufA[272];
+ __shared__ NumericT bufB[272];
+
+ vcl_size_t block_size = 16;//get_local_size(0);
+ vcl_size_t row_block_id = blockIdx.x;
+ vcl_size_t col_block_id = blockIdx.y;
+ vcl_size_t row_thread_id = threadIdx.x;
+ vcl_size_t col_thread_id = threadIdx.y;
+ vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
+ vcl_size_t aStep = block_size * A_col_inc;
+ vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
+ vcl_size_t bStep = block_size * B_row_inc;
+ vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+ NumericT Csub = 0;
+ vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
+ vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
+
+ vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+ vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+ for (vcl_size_t block = 0;
+ block < block_num;
+ ++block)
+ {
+ bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+ bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+ __syncthreads();
+ NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+ NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ __syncthreads();
+ aBegin += aStep;
+ bBegin += bStep;
+ }
+ if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+ C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+}
+
+// matrix-matrix multiplication C = A * B^T
+// matrix layouts: C...col_major, A...row_major, B...col_major
+template<typename NumericT>
+__global__ void matrix_matrix_col_row_col_prod_AT_kernel(
+ NumericT alpha,
+ const NumericT * A,
+ unsigned int A_row_start,
+ unsigned int A_col_start,
+ unsigned int A_row_inc,
+ unsigned int A_col_inc,
+ unsigned int A_row_size,
+ unsigned int A_col_size,
+ unsigned int A_internal_rows,
+ unsigned int A_internal_cols,
+ const NumericT * B,
+ unsigned int B_row_start,
+ unsigned int B_col_start,
+ unsigned int B_row_inc,
+ unsigned int B_col_inc,
+ unsigned int B_row_size,
+ unsigned int B_col_size,
+ unsigned int B_internal_rows,
+ unsigned int B_internal_cols,
+ NumericT beta,
+ NumericT * C,
+ unsigned int C_row_start,
+ unsigned int C_col_start,
+ unsigned int C_row_inc,
+ unsigned int C_col_inc,
+ unsigned int C_row_size,
+ unsigned int C_col_size,
+ unsigned int C_internal_rows,
+ unsigned int C_internal_cols)
+{
+
+ __shared__ NumericT bufA[272];
+ __shared__ NumericT bufB[272];
+
+ vcl_size_t block_size = 16;//get_local_size(0);
+ vcl_size_t row_block_id = blockIdx.x;
+ vcl_size_t col_block_id = blockIdx.y;
+ vcl_size_t row_thread_id = threadIdx.x;
+ vcl_size_t col_thread_id = threadIdx.y;
+ vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
+ vcl_size_t aStep = block_size * A_col_inc;
+ vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
+ vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
+ vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
+ NumericT Csub = 0;
+ vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
+ vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
+
+ vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+ vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+ for (vcl_size_t block = 0;
+ block < block_num;
+ ++block)
+ {
+ bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
+ bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
+ __syncthreads();
+ NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+ NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ __syncthreads();
+ aBegin += aStep;
+ bBegin += bStep;
+ }
+ if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
+ C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+}
+
+// matrix-matrix multiplication C = A^T * B
+// matrix layouts: C...col_major, A...row_major, B...col_major
+template<typename NumericT>
+__global__ void matrix_matrix_col_row_col_prod_TA_kernel(
+ NumericT alpha,
+ const NumericT * A,
+ unsigned int A_row_start,
+ unsigned int A_col_start,
+ unsigned int A_row_inc,
+ unsigned int A_col_inc,
+ unsigned int A_row_size,
+ unsigned int A_col_size,
+ unsigned int A_internal_rows,
+ unsigned int A_internal_cols,
+ const NumericT * B,
+ unsigned int B_row_start,
+ unsigned int B_col_start,
+ unsigned int B_row_inc,
+ unsigned int B_col_inc,
+ unsigned int B_row_size,
+ unsigned int B_col_size,
+ unsigned int B_internal_rows,
+ unsigned int B_internal_cols,
+ NumericT beta,
+ NumericT * C,
+ unsigned int C_row_start,
+ unsigned int C_col_start,
+ unsigned int C_row_inc,
+ unsigned int C_col_inc,
+ unsigned int C_row_size,
+ unsigned int C_col_size,
+ unsigned int C_internal_rows,
+ unsigned int C_internal_cols)
+{
+
+ __shared__ NumericT bufA[272];
+ __shared__ NumericT bufB[272];
+
+ vcl_size_t block_size = 16;//get_local_size(0);
+ vcl_size_t row_block_id = blockIdx.x;
+ vcl_size_t col_block_id = blockIdx.y;
+ vcl_size_t row_thread_id = threadIdx.x;
+ vcl_size_t col_thread_id = threadIdx.y;
+ vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
+ vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
+ vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
+ vcl_size_t bStep = block_size * B_row_inc;
+ vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
+ NumericT Csub = 0;
+ vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
+ vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;
+
+ vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
+ vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
+ for (vcl_size_t block = 0;
+ block < block_num;
+ ++block)
+ {
+ bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
+ bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
+ __syncthreads();
+ NumericT * bufAptr = bufA + row_thread_id_times_block_size;
+ NumericT * bufBptr = bufB + col_thread_id_times_block_size;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
+ __syncthreads();
+ aBegin += aStep;
+ bBegin += bStep;
+ }
+ if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
+ C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
+}
+
+// matrix-matrix multiplication C = A^T * B^T
+// matrix layouts: C...col_major, A...row_major, B...col_major
+template<typename NumericT>
+__global__ void matrix_matrix_col_row_col_prod_TT_kernel(
+ NumericT alpha,
+ const NumericT * A,
+ unsigned int A_row_start,
+ unsigned int A_col_start,
+ unsigned int A_row_inc,
+ unsigned int A_col_inc,
+ unsigned int A_row_size,
+ unsigned int A_col_size,
+ unsigned int A_internal_rows,
+ unsigned int A_internal_cols,
+ const NumericT * B,
+ unsigned int B_row_start,
+ unsigned int B_col_start,
+ unsigned int B_row_inc,
+ unsigned int B_col_inc,
+ unsigned int B_row_size,
+ unsigned int B_col_size,
+ unsigned int B_internal_rows,
+ unsigned int B_internal_cols,
+ NumericT beta,
+ NumericT * C,
+ unsigned int C_row_start,
+ unsigned int C_col_start,
+ unsigned int C_row_inc,
+ unsigned int C_col_inc,
+ unsigned int C_row_size,
+ unsigned int C_col_size,
+ unsigned int C_internal_rows,
+ unsigned int C_internal_cols)
+{
+
+ __shared__ NumericT bufA[272];
+ __shared__ NumericT bufB[272];
+
+ vcl_size_t block_size = 16;//get_local_size(0);
+ vcl_size_t row_block_id = blockIdx.x;
+ vcl_size_t col_block_id = blockIdx.y;
+ vcl_size_t row_thread_id = threadIdx.x;
+ vcl_size_t col_thread_id = threadIdx.y;
+ vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
+ vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
+ vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
+ vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
+ vcl_size_t block_num = (A_row_size + block_size - 1) /
<TRUNCATED>
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/amg_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/amg_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/amg_operations.hpp
new file mode 100644
index 0000000..65b323e
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/amg_operations.hpp
@@ -0,0 +1,821 @@
+#ifndef VIENNACL_LINALG_CUDA_AMG_OPERATIONS_HPP
+#define VIENNACL_LINALG_CUDA_AMG_OPERATIONS_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/amg_operations.hpp
+    @brief Implementations of routines for algebraic multigrid (AMG) using CUDA.
+*/
+
+#include <cstdlib>
+#include <cmath>
+#include "viennacl/linalg/detail/amg/amg_base.hpp"
+
+#include <map>
+#include <set>
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+namespace amg
+{
+
+
+///////////////////////////////////////////
+
+__global__ void amg_influence_trivial_kernel(
+ const unsigned int * row_indices,
+ const unsigned int * column_indices,
+ unsigned int size1,
+ unsigned int nnz,
+ unsigned int *influences_row,
+ unsigned int *influences_id,
+ unsigned int *influences_values
+ )
+{
+ unsigned int global_id = blockDim.x * blockIdx.x + threadIdx.x;
+ unsigned int global_size = gridDim.x * blockDim.x;
+
+ for (unsigned int i = global_id; i < size1; i += global_size)
+ {
+ unsigned int tmp = row_indices[i];
+ influences_row[i] = tmp;
+ influences_values[i] = row_indices[i+1] - tmp;
+ }
+
+ for (unsigned int i = global_id; i < nnz; i += global_size)
+ influences_id[i] = column_indices[i];
+
+ if (global_id == 0)
+ influences_row[size1] = row_indices[size1];
+}
+
+
+/** @brief Routine for taking all connections in the matrix as strong */
+template<typename NumericT>
+void amg_influence_trivial(compressed_matrix<NumericT> const & A,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ (void)tag;
+
+ amg_influence_trivial_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
+ viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
+ static_cast<unsigned int>(A.size1()),
+ static_cast<unsigned int>(A.nnz()),
+ viennacl::cuda_arg(amg_context.influence_jumper_),
+ viennacl::cuda_arg(amg_context.influence_ids_),
+ viennacl::cuda_arg(amg_context.influence_values_)
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("amg_influence_trivial_kernel");
+}
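As a concrete illustration of this trivial influence measure (illustrative values only): for a small CSR matrix with row_indices = {0, 2, 3, 5} and column_indices = {0, 1, 1, 0, 2}, the kernel above simply mirrors the sparsity structure, writing

  influences_row    = {0, 2, 3, 5}      // copy of the CSR row pointers
  influences_id     = {0, 1, 1, 0, 2}   // copy of the CSR column indices
  influences_values = {2, 1, 2}         // number of nonzeros per row

so every structural nonzero is treated as a strong connection.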
+
+
+/** @brief Routine for extracting strongly connected points considering a user-provided threshold value */
+template<typename NumericT>
+void amg_influence_advanced(compressed_matrix<NumericT> const & A,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ throw std::runtime_error("not implemented yet");
+}
+
+/** @brief Dispatcher for influence processing */
+template<typename NumericT>
+void amg_influence(compressed_matrix<NumericT> const & A,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ // TODO: dispatch based on influence tolerance provided
+ amg_influence_trivial(A, amg_context, tag);
+}
+
+/** @brief Assign IDs to coarse points.
+*
+* TODO: Use exclusive_scan on GPU for this.
+*/
+inline void enumerate_coarse_points(viennacl::linalg::detail::amg::amg_level_context & amg_context)
+{
+ viennacl::backend::typesafe_host_array<unsigned int> point_types(amg_context.point_types_.handle(), amg_context.point_types_.size());
+ viennacl::backend::typesafe_host_array<unsigned int> coarse_ids(amg_context.coarse_id_.handle(), amg_context.coarse_id_.size());
+ viennacl::backend::memory_read(amg_context.point_types_.handle(), 0, point_types.raw_size(), point_types.get());
+ viennacl::backend::memory_read(amg_context.coarse_id_.handle(), 0, coarse_ids.raw_size(), coarse_ids.get());
+
+ unsigned int coarse_id = 0;
+ for (std::size_t i=0; i<amg_context.point_types_.size(); ++i)
+ {
+ coarse_ids.set(i, coarse_id);
+ if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
+ ++coarse_id;
+ }
+
+ amg_context.num_coarse_ = coarse_id;
+
+ viennacl::backend::memory_write(amg_context.coarse_id_.handle(), 0, coarse_ids.raw_size(), coarse_ids.get());
+}
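The TODO above could be addressed with a device-side exclusive scan. A minimal sketch, assuming Thrust is available (the helper name enumerate_coarse_points_scan and the coarse_marker parameter are illustrative, and coarse_ids is assumed to be sized to match point_types):

  #include <thrust/device_vector.h>
  #include <thrust/transform.h>
  #include <thrust/scan.h>

  // maps a point type to 1 if it encodes a coarse point, 0 otherwise
  struct is_coarse
  {
    unsigned int marker;
    __host__ __device__ is_coarse(unsigned int m) : marker(m) {}
    __host__ __device__ unsigned int operator()(unsigned int t) const { return t == marker ? 1u : 0u; }
  };

  // fills coarse_ids with consecutive IDs and returns the number of coarse points,
  // mirroring the host loop above but running entirely on the device
  inline unsigned int enumerate_coarse_points_scan(thrust::device_vector<unsigned int> const & point_types,
                                                   thrust::device_vector<unsigned int>       & coarse_ids,
                                                   unsigned int coarse_marker)
  {
    if (point_types.empty())
      return 0;
    thrust::device_vector<unsigned int> indicator(point_types.size());
    thrust::transform(point_types.begin(), point_types.end(), indicator.begin(), is_coarse(coarse_marker));
    thrust::exclusive_scan(indicator.begin(), indicator.end(), coarse_ids.begin()); // coarse_ids[i] = #coarse points before i
    return static_cast<unsigned int>(coarse_ids.back()) + static_cast<unsigned int>(indicator.back());
  }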
+
+//////////////////////////////////////
+
+/** @brief CUDA kernel initializing the work vectors at each PMIS iteration */
+template<typename IndexT>
+__global__ void amg_pmis2_init_workdata(IndexT *work_state,
+ IndexT *work_random,
+ IndexT *work_index,
+ IndexT const *point_types,
+ IndexT const *random_weights,
+ unsigned int size)
+{
+ unsigned int global_id = blockDim.x * blockIdx.x + threadIdx.x;
+ unsigned int global_size = gridDim.x * blockDim.x;
+
+ for (unsigned int i = global_id; i < size; i += global_size)
+ {
+ switch (point_types[i])
+ {
+ case viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED: work_state[i] = 1; break;
+ case viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE: work_state[i] = 0; break;
+ case viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE: work_state[i] = 2; break;
+ default:
+ break; // do nothing
+ }
+
+ work_random[i] = random_weights[i];
+ work_index[i] = i;
+ }
+}
+
+/** @brief CUDA kernel propagating the state triple (status, weight, nodeID) to neighbors using a max()-operation */
+template<typename IndexT>
+__global__ void amg_pmis2_max_neighborhood(IndexT const *work_state,
+ IndexT const *work_random,
+ IndexT const *work_index,
+ IndexT *work_state2,
+ IndexT *work_random2,
+ IndexT *work_index2,
+ IndexT const *influences_row,
+ IndexT const *influences_id,
+ unsigned int size)
+{
+ unsigned int global_id = blockDim.x * blockIdx.x + threadIdx.x;
+ unsigned int global_size = gridDim.x * blockDim.x;
+
+ for (unsigned int i = global_id; i < size; i += global_size)
+ {
+ // load
+ unsigned int state = work_state[i];
+ unsigned int random = work_random[i];
+ unsigned int index = work_index[i];
+
+ // max
+ unsigned int j_stop = influences_row[i + 1];
+ for (unsigned int j = influences_row[i]; j < j_stop; ++j)
+ {
+ unsigned int influenced_point_id = influences_id[j];
+
+      // lexicographical triple-max (not particularly pretty, but does the job):
+ if (state < work_state[influenced_point_id])
+ {
+ state = work_state[influenced_point_id];
+ random = work_random[influenced_point_id];
+ index = work_index[influenced_point_id];
+ }
+ else if (state == work_state[influenced_point_id])
+ {
+ if (random < work_random[influenced_point_id])
+ {
+ state = work_state[influenced_point_id];
+ random = work_random[influenced_point_id];
+ index = work_index[influenced_point_id];
+ }
+ else if (random == work_random[influenced_point_id])
+ {
+ if (index < work_index[influenced_point_id])
+ {
+ state = work_state[influenced_point_id];
+ random = work_random[influenced_point_id];
+ index = work_index[influenced_point_id];
+ }
+ } // max(random)
+ } // max(state)
+ } // for
+
+ // store
+ work_state2[i] = state;
+ work_random2[i] = random;
+ work_index2[i] = index;
+ }
+}
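The nested comparisons above amount to a lexicographic maximum over the triple (state, random weight, node index). A host-side reference, purely for illustration:

  #include <tuple>
  #include <algorithm>

  typedef std::tuple<unsigned int, unsigned int, unsigned int> state_triple; // (state, random weight, node index)

  // std::tuple compares lexicographically, so std::max realizes the same triple-max
  // that the kernel evaluates component by component
  inline state_triple triple_max(state_triple const & a, state_triple const & b)
  {
    return std::max(a, b);
  }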
+
+/** @brief CUDA kernel for marking MIS and non-MIS nodes */
+template<typename IndexT>
+__global__ void amg_pmis2_mark_mis_nodes(IndexT const *work_state,
+ IndexT const *work_index,
+ IndexT *point_types,
+ IndexT *undecided_buffer,
+ unsigned int size)
+{
+ unsigned int global_id = blockDim.x * blockIdx.x + threadIdx.x;
+ unsigned int global_size = gridDim.x * blockDim.x;
+
+ unsigned int num_undecided = 0;
+ for (unsigned int i = global_id; i < size; i += global_size)
+ {
+ unsigned int max_state = work_state[i];
+ unsigned int max_index = work_index[i];
+
+ if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
+ {
+ if (i == max_index) // make this a MIS node
+ point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE;
+ else if (max_state == 2) // mind the mapping of viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE above!
+ point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
+ else
+ num_undecided += 1;
+ }
+ }
+
+ // reduction of the number of undecided nodes:
+ __shared__ unsigned int shared_buffer[256];
+ shared_buffer[threadIdx.x] = num_undecided;
+ for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ shared_buffer[threadIdx.x] += shared_buffer[threadIdx.x+stride];
+ }
+
+ if (threadIdx.x == 0)
+ undecided_buffer[blockIdx.x] = shared_buffer[0];
+
+}
+
+/** @brief CUDA kernel for resetting non-MIS (i.e. non-coarse) points to undecided so that subsequent kernels work */
+__global__ void amg_pmis2_reset_state(unsigned int *point_types, unsigned int size)
+{
+ unsigned int global_id = blockDim.x * blockIdx.x + threadIdx.x;
+ unsigned int global_size = gridDim.x * blockDim.x;
+
+ for (unsigned int i = global_id; i < size; i += global_size)
+ {
+ if (point_types[i] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
+ point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED;
+ }
+}
+
+/** @brief AG (aggregation based) coarsening, stage 1: MIS-2 computation using CUDA kernels
+*
+* @param A Operator matrix on all levels
+* @param amg_context AMG hierarchy datastructures
+* @param tag AMG preconditioner tag
+*/
+template<typename NumericT>
+void amg_coarse_ag_stage1_mis2(compressed_matrix<NumericT> const & A,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ viennacl::vector<unsigned int> random_weights(A.size1(), viennacl::context(viennacl::MAIN_MEMORY));
+ unsigned int *random_weights_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(random_weights.handle());
+ for (std::size_t i=0; i<random_weights.size(); ++i)
+ random_weights_ptr[i] = static_cast<unsigned int>(rand()) % static_cast<unsigned int>(A.size1());
+ random_weights.switch_memory_context(viennacl::traits::context(A));
+
+ // work vectors:
+ viennacl::vector<unsigned int> work_state(A.size1(), viennacl::traits::context(A));
+ viennacl::vector<unsigned int> work_random(A.size1(), viennacl::traits::context(A));
+ viennacl::vector<unsigned int> work_index(A.size1(), viennacl::traits::context(A));
+
+ viennacl::vector<unsigned int> work_state2(A.size1(), viennacl::traits::context(A));
+ viennacl::vector<unsigned int> work_random2(A.size1(), viennacl::traits::context(A));
+ viennacl::vector<unsigned int> work_index2(A.size1(), viennacl::traits::context(A));
+
+ unsigned int num_undecided = static_cast<unsigned int>(A.size1());
+ viennacl::vector<unsigned int> undecided_buffer(256, viennacl::traits::context(A));
+ viennacl::backend::typesafe_host_array<unsigned int> undecided_buffer_host(undecided_buffer.handle(), undecided_buffer.size());
+
+ unsigned int pmis_iters = 0;
+ while (num_undecided > 0)
+ {
+ ++pmis_iters;
+
+ //
+ // init temporary work data:
+ //
+ amg_pmis2_init_workdata<<<128, 128>>>(viennacl::cuda_arg(work_state),
+ viennacl::cuda_arg(work_random),
+ viennacl::cuda_arg(work_index),
+ viennacl::cuda_arg(amg_context.point_types_),
+ viennacl::cuda_arg(random_weights),
+ static_cast<unsigned int>(A.size1())
+ );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_init_workdata");
+
+
+ //
+ // Propagate maximum tuple twice
+ //
+ for (unsigned int r = 0; r < 2; ++r)
+ {
+ // max operation over neighborhood
+ amg_pmis2_max_neighborhood<<<128, 128>>>(viennacl::cuda_arg(work_state),
+ viennacl::cuda_arg(work_random),
+ viennacl::cuda_arg(work_index),
+ viennacl::cuda_arg(work_state2),
+ viennacl::cuda_arg(work_random2),
+ viennacl::cuda_arg(work_index2),
+ viennacl::cuda_arg(amg_context.influence_jumper_),
+ viennacl::cuda_arg(amg_context.influence_ids_),
+ static_cast<unsigned int>(A.size1())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_max_neighborhood");
+
+      // copy work arrays (this could be fused into the previous kernel if needed; that kernel is usually heavy enough already)
+ work_state = work_state2;
+ work_random = work_random2;
+ work_index = work_index2;
+ }
+
+ //
+ // mark MIS and non-MIS nodes:
+ //
+ amg_pmis2_mark_mis_nodes<<<128, 128>>>(viennacl::cuda_arg(work_state),
+ viennacl::cuda_arg(work_index),
+ viennacl::cuda_arg(amg_context.point_types_),
+ viennacl::cuda_arg(undecided_buffer),
+ static_cast<unsigned int>(A.size1())
+ );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_mark_mis_nodes");
+
+ // get number of undecided points on host:
+ viennacl::backend::memory_read(undecided_buffer.handle(), 0, undecided_buffer_host.raw_size(), undecided_buffer_host.get());
+ num_undecided = 0;
+ for (std::size_t i=0; i<undecided_buffer.size(); ++i)
+ num_undecided += undecided_buffer_host[i];
+
+ } //while
+
+ // consistency with sequential MIS: reset state for non-coarse points, so that coarse indices are correctly picked up later
+ amg_pmis2_reset_state<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
+ static_cast<unsigned int>(amg_context.point_types_.size())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_reset_state");
+}
+
+
+
+
+
+template<typename IndexT>
+__global__ void amg_agg_propagate_coarse_indices(IndexT *point_types,
+ IndexT *coarse_ids,
+ IndexT const *influences_row,
+ IndexT const *influences_id,
+ unsigned int size)
+{
+ unsigned int global_id = blockDim.x * blockIdx.x + threadIdx.x;
+ unsigned int global_size = gridDim.x * blockDim.x;
+
+ for (unsigned int i = global_id; i < size; i += global_size)
+ {
+ if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
+ {
+ unsigned int coarse_index = coarse_ids[i];
+
+ unsigned int j_stop = influences_row[i + 1];
+ for (unsigned int j = influences_row[i]; j < j_stop; ++j)
+ {
+ unsigned int influenced_point_id = influences_id[j];
+ coarse_ids[influenced_point_id] = coarse_index; // Set aggregate index for fine point
+
+ if (influenced_point_id != i) // Note: Any write races between threads are harmless here
+ point_types[influenced_point_id] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
+ }
+ }
+ }
+}
+
+
+template<typename IndexT>
+__global__ void amg_agg_merge_undecided(IndexT *point_types,
+ IndexT *coarse_ids,
+ IndexT const *influences_row,
+ IndexT const *influences_id,
+ unsigned int size)
+{
+ unsigned int global_id = blockDim.x * blockIdx.x + threadIdx.x;
+ unsigned int global_size = gridDim.x * blockDim.x;
+
+ for (unsigned int i = global_id; i < size; i += global_size)
+ {
+ if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
+ {
+ unsigned int j_stop = influences_row[i + 1];
+ for (unsigned int j = influences_row[i]; j < j_stop; ++j)
+ {
+ unsigned int influenced_point_id = influences_id[j];
+ if (point_types[influenced_point_id] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED) // either coarse or fine point
+ {
+ //std::cout << "Setting fine node " << i << " to be aggregated with node " << *influence_iter << "/" << pointvector.get_coarse_index(*influence_iter) << std::endl;
+ coarse_ids[i] = coarse_ids[influenced_point_id];
+ break;
+ }
+ }
+ }
+ }
+}
+
+
+__global__ void amg_agg_merge_undecided_2(unsigned int *point_types,
+ unsigned int size)
+{
+ unsigned int global_id = blockDim.x * blockIdx.x + threadIdx.x;
+ unsigned int global_size = gridDim.x * blockDim.x;
+
+ for (unsigned int i = global_id; i < size; i += global_size)
+ {
+ if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
+ point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
+ }
+}
+
+
+/** @brief AG (aggregation based) coarsening. Partially single-threaded version (VIENNACL_AMG_COARSE_AG)
+*
+* @param A Operator matrix
+* @param amg_context AMG hierarchy datastructures
+* @param tag AMG preconditioner tag
+*/
+template<typename NumericT>
+void amg_coarse_ag(compressed_matrix<NumericT> const & A,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+
+ amg_influence_trivial(A, amg_context, tag);
+
+ //
+ // Stage 1: Build aggregates:
+ //
+ if (tag.get_coarsening_method() == viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION)
+ amg_coarse_ag_stage1_mis2(A, amg_context, tag);
+ else
+ throw std::runtime_error("Only MIS2 coarsening implemented. Selected coarsening not available with CUDA backend!");
+
+ viennacl::linalg::cuda::amg::enumerate_coarse_points(amg_context);
+
+ //
+ // Stage 2: Propagate coarse aggregate indices to neighbors:
+ //
+ amg_agg_propagate_coarse_indices<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
+ viennacl::cuda_arg(amg_context.coarse_id_),
+ viennacl::cuda_arg(amg_context.influence_jumper_),
+ viennacl::cuda_arg(amg_context.influence_ids_),
+ static_cast<unsigned int>(A.size1())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("amg_agg_propagate_coarse_indices");
+
+
+ //
+  // Stage 3: Merge remaining undecided points (merging to the first aggregate found when cycling over the neighborhood)
+ //
+ amg_agg_merge_undecided<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
+ viennacl::cuda_arg(amg_context.coarse_id_),
+ viennacl::cuda_arg(amg_context.influence_jumper_),
+ viennacl::cuda_arg(amg_context.influence_ids_),
+ static_cast<unsigned int>(A.size1())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("amg_agg_merge_undecided");
+
+ //
+ // Stage 4: Set undecided points to fine points (coarse ID already set in Stage 3)
+ // Note: Stage 3 and Stage 4 were initially fused, but are now split in order to avoid race conditions (or a fallback to sequential execution).
+ //
+ amg_agg_merge_undecided_2<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
+ static_cast<unsigned int>(A.size1())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("amg_agg_merge_undecided_2");
+}
+
+
+
+
+/** @brief Calls the right coarsening procedure
+*
+* @param A Operator matrix on all levels
+* @param amg_context AMG hierarchy datastructures
+* @param tag AMG preconditioner tag
+*/
+template<typename InternalT1>
+void amg_coarse(InternalT1 & A,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ switch (tag.get_coarsening_method())
+ {
+ case viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION: amg_coarse_ag(A, amg_context, tag); break;
+ default: throw std::runtime_error("not implemented yet");
+ }
+}
+
+
+
+
+////////////////////////////////////// Interpolation /////////////////////////////
+
+template<typename NumericT>
+__global__ void amg_interpol_ag_kernel(unsigned int *P_row_buffer,
+ unsigned int *P_col_buffer,
+ NumericT *P_elements,
+ unsigned int *coarse_ids,
+ unsigned int size)
+{
+ unsigned int global_id = blockDim.x * blockIdx.x + threadIdx.x;
+ unsigned int global_size = gridDim.x * blockDim.x;
+
+ for (unsigned int i = global_id; i < size; i += global_size)
+ {
+ P_row_buffer[i] = i;
+ P_col_buffer[i] = coarse_ids[i];
+ P_elements[i] = NumericT(1);
+ }
+
+ // set last entry as well:
+ if (global_id == 0)
+ P_row_buffer[size] = size;
+}
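For a concrete picture of the prolongation this kernel assembles (illustrative values only): with size = 5 fine points and coarse_ids = {0, 0, 1, 1, 1}, the CSR arrays of P become

  P_row_buffer = {0, 1, 2, 3, 4, 5}   // one entry per row, plus the finalizer
  P_col_buffer = {0, 0, 1, 1, 1}      // aggregate (coarse point) of each fine point
  P_elements   = {1, 1, 1, 1, 1}      // piecewise-constant weights

i.e. each fine point is injected with weight 1 into its aggregate.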
+
+/** @brief AG (aggregation based) interpolation. Multi-Threaded! (VIENNACL_INTERPOL_AG)
+ *
+ * @param A Operator matrix
+ * @param P Prolongation matrix
+ * @param amg_context AMG hierarchy datastructures
+ * @param tag AMG configuration tag
+*/
+template<typename NumericT>
+void amg_interpol_ag(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & P,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ (void)tag;
+ P = compressed_matrix<NumericT>(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A));
+
+ amg_interpol_ag_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(P.handle1().cuda_handle()),
+ viennacl::cuda_arg<unsigned int>(P.handle2().cuda_handle()),
+ viennacl::cuda_arg<NumericT>(P.handle().cuda_handle()),
+ viennacl::cuda_arg(amg_context.coarse_id_),
+ static_cast<unsigned int>(A.size1())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("amg_interpol_ag_kernel");
+
+ P.generate_row_block_information();
+}
+
+
+
+template<typename NumericT>
+__global__ void amg_interpol_sa_kernel(
+ const unsigned int *A_row_indices,
+ const unsigned int *A_col_indices,
+ const NumericT *A_elements,
+ unsigned int A_size1,
+ unsigned int A_nnz,
+ unsigned int *Jacobi_row_indices,
+ unsigned int *Jacobi_col_indices,
+ NumericT *Jacobi_elements,
+ NumericT omega
+ )
+{
+ unsigned int global_id = blockDim.x * blockIdx.x + threadIdx.x;
+ unsigned int global_size = gridDim.x * blockDim.x;
+
+ for (unsigned int row = global_id; row < A_size1; row += global_size)
+ {
+ unsigned int row_begin = A_row_indices[row];
+ unsigned int row_end = A_row_indices[row+1];
+
+ Jacobi_row_indices[row] = row_begin;
+
+ // Step 1: Extract diagonal:
+ NumericT diag = 0;
+ for (unsigned int j = row_begin; j < row_end; ++j)
+ {
+ if (A_col_indices[j] == row)
+ {
+ diag = A_elements[j];
+ break;
+ }
+ }
+
+ // Step 2: Write entries:
+ for (unsigned int j = row_begin; j < row_end; ++j)
+ {
+ unsigned int col_index = A_col_indices[j];
+ Jacobi_col_indices[j] = col_index;
+
+ if (col_index == row)
+ Jacobi_elements[j] = NumericT(1) - omega;
+ else
+ Jacobi_elements[j] = - omega * A_elements[j] / diag;
+ }
+ }
+
+ if (global_id == 0)
+ Jacobi_row_indices[A_size1] = A_nnz; // don't forget finalizer
+}
+
+
+
+/** @brief Smoothed aggregation interpolation. (VIENNACL_INTERPOL_SA)
+ *
+ * @param A Operator matrix
+ * @param P Prolongation matrix
+ * @param amg_context AMG hierarchy datastructures
+ * @param tag AMG configuration tag
+*/
+template<typename NumericT>
+void amg_interpol_sa(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & P,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ (void)tag;
+ viennacl::compressed_matrix<NumericT> P_tentative(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A));
+
+ // form tentative operator:
+ amg_interpol_ag(A, P_tentative, amg_context, tag);
+
+ viennacl::compressed_matrix<NumericT> Jacobi(A.size1(), A.size1(), A.nnz(), viennacl::traits::context(A));
+
+ amg_interpol_sa_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
+ viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
+ viennacl::cuda_arg<NumericT>(A.handle().cuda_handle()),
+ static_cast<unsigned int>(A.size1()),
+ static_cast<unsigned int>(A.nnz()),
+ viennacl::cuda_arg<unsigned int>(Jacobi.handle1().cuda_handle()),
+ viennacl::cuda_arg<unsigned int>(Jacobi.handle2().cuda_handle()),
+ viennacl::cuda_arg<NumericT>(Jacobi.handle().cuda_handle()),
+ NumericT(tag.get_jacobi_weight())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("amg_interpol_sa_kernel");
+
+ P = viennacl::linalg::prod(Jacobi, P_tentative);
+
+ P.generate_row_block_information();
+}
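In matrix terms (with D the diagonal of A, omega the Jacobi weight taken from the tag, and P_tent the tentative operator built by amg_interpol_ag above), the two steps assemble

  Jacobi = I - omega * D^{-1} * A
  P      = Jacobi * P_tent

which is the usual smoothed-aggregation prolongation.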
+
+
+/** @brief Dispatcher for building the interpolation matrix
+ *
+ * @param A Operator matrix
+ * @param P Prolongation matrix
+ * @param amg_context AMG hierarchy datastructures
+ * @param tag AMG configuration tag
+*/
+template<typename MatrixT>
+void amg_interpol(MatrixT const & A,
+ MatrixT & P,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ switch (tag.get_interpolation_method())
+ {
+ case viennacl::linalg::AMG_INTERPOLATION_METHOD_AGGREGATION: amg_interpol_ag (A, P, amg_context, tag); break;
+ case viennacl::linalg::AMG_INTERPOLATION_METHOD_SMOOTHED_AGGREGATION: amg_interpol_sa (A, P, amg_context, tag); break;
+ default: throw std::runtime_error("Not implemented yet!");
+ }
+}
+
+
+template<typename NumericT>
+__global__ void compressed_matrix_assign_to_dense(
+ const unsigned int * row_indices,
+ const unsigned int * column_indices,
+ const NumericT * elements,
+ NumericT *B,
+ unsigned int B_row_start, unsigned int B_col_start,
+ unsigned int B_row_inc, unsigned int B_col_inc,
+ unsigned int B_row_size, unsigned int B_col_size,
+ unsigned int B_internal_rows, unsigned int B_internal_cols)
+{
+ for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
+ row < B_row_size;
+ row += gridDim.x * blockDim.x)
+ {
+ unsigned int row_end = row_indices[row+1];
+ for (unsigned int j = row_indices[row]; j<row_end; j++)
+ B[(B_row_start + row * B_row_inc) * B_internal_cols + B_col_start + column_indices[j] * B_col_inc] = elements[j];
+ }
+}
+
+
+template<typename NumericT, unsigned int AlignmentV>
+void assign_to_dense(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
+ viennacl::matrix_base<NumericT> & B)
+{
+ compressed_matrix_assign_to_dense<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
+ viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
+ viennacl::cuda_arg<NumericT>(A.handle().cuda_handle()),
+ viennacl::cuda_arg<NumericT>(B),
+ static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
+ static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
+ static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_assign_to_dense");
+}
+
+
+
+
+template<typename NumericT>
+__global__ void compressed_matrix_smooth_jacobi_kernel(
+ const unsigned int * row_indices,
+ const unsigned int * column_indices,
+ const NumericT * elements,
+ NumericT weight,
+ const NumericT * x_old,
+ NumericT * x_new,
+ const NumericT * rhs,
+ unsigned int size)
+{
+ for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
+ row < size;
+ row += gridDim.x * blockDim.x)
+ {
+ NumericT sum = NumericT(0);
+ NumericT diag = NumericT(1);
+ unsigned int row_end = row_indices[row+1];
+ for (unsigned int j = row_indices[row]; j < row_end; ++j)
+ {
+ unsigned int col = column_indices[j];
+ if (col == row)
+ diag = elements[j];
+ else
+ sum += elements[j] * x_old[col];
+ }
+ x_new[row] = weight * (rhs[row] - sum) / diag + (NumericT(1) - weight) * x_old[row];
+ }
+}
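Written out per row (with D the diagonal of A and omega the damping weight), the update computed by this kernel is

  x_new = omega * D^{-1} * (rhs - (A - D) * x_old) + (1 - omega) * x_old
        = x_old + omega * D^{-1} * (rhs - A * x_old)

so omega = 1 recovers the undamped Jacobi iteration.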
+
+
+
+
+/** @brief Damped Jacobi Smoother (CUDA version)
+*
+* @param iterations Number of smoother iterations
+* @param A Operator matrix for the smoothing
+* @param x The vector smoothing is applied to
+* @param x_backup      Auxiliary vector, distinct from x, used to hold a copy of x
+* @param rhs_smooth The right hand side of the equation for the smoother
+* @param weight Damping factor. 0: No effect of smoother. 1: Undamped Jacobi iteration
+*/
+template<typename NumericT>
+void smooth_jacobi(unsigned int iterations,
+ compressed_matrix<NumericT> const & A,
+ vector<NumericT> & x,
+ vector<NumericT> & x_backup,
+ vector<NumericT> const & rhs_smooth,
+ NumericT weight)
+{
+ for (unsigned int i=0; i<iterations; ++i)
+ {
+ x_backup = x;
+
+ compressed_matrix_smooth_jacobi_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
+ viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
+ viennacl::cuda_arg<NumericT>(A.handle().cuda_handle()),
+ static_cast<NumericT>(weight),
+ viennacl::cuda_arg(x_backup),
+ viennacl::cuda_arg(x),
+ viennacl::cuda_arg(rhs_smooth),
+ static_cast<unsigned int>(rhs_smooth.size())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_smooth_jacobi_kernel");
+ }
+}
+
+
+} //namespace amg
+} //namespace cuda
+} //namespace linalg
+} //namespace viennacl
+
+#endif
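A minimal sketch of how the dispatchers in this file fit together for a single coarsening level (the helper name setup_one_amg_level is illustrative, and construction of the context and tag objects is not shown):

  template<typename NumericT>
  void setup_one_amg_level(viennacl::compressed_matrix<NumericT> const & A,
                           viennacl::compressed_matrix<NumericT>       & P,
                           viennacl::linalg::detail::amg::amg_level_context & ctx,
                           viennacl::linalg::amg_tag & tag)
  {
    // select aggregates / coarse points (the influence measure is computed internally)
    viennacl::linalg::cuda::amg::amg_coarse(A, ctx, tag);
    // build the prolongation matrix P from the aggregates
    viennacl::linalg::cuda::amg::amg_interpol(A, P, ctx, tag);
    // the coarse-level operator would then be formed elsewhere as a Galerkin product R * A * P
  }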
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_calls.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_calls.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_calls.hpp
new file mode 100644
index 0000000..39f0015
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_calls.hpp
@@ -0,0 +1,166 @@
+#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_CALLS_HPP_
+#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_CALLS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/cuda/bisect_kernel_calls.hpp
+ @brief CUDA kernel calls for the bisection algorithm
+
+ Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+ the creation of derivative works is allowed by including the following statement:
+ "This software contains source code provided by NVIDIA Corporation."
+*/
+
+#include "viennacl/linalg/detail/bisect/structs.hpp"
+
+// includes, kernels
+#include "viennacl/linalg/cuda/bisect_kernel_small.hpp"
+#include "viennacl/linalg/cuda/bisect_kernel_large.hpp"
+#include "viennacl/linalg/cuda/bisect_kernel_large_onei.hpp"
+#include "viennacl/linalg/cuda/bisect_kernel_large_multi.hpp"
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+template<typename NumericT>
+void bisectSmall(const viennacl::linalg::detail::InputData<NumericT> &input, viennacl::linalg::detail::ResultDataSmall<NumericT> &result,
+ const unsigned int mat_size,
+ const NumericT lg, const NumericT ug,
+ const NumericT precision)
+{
+
+
+ dim3 blocks(1, 1, 1);
+ dim3 threads(VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX, 1, 1);
+
+ bisectKernelSmall<<< blocks, threads >>>(
+ viennacl::cuda_arg(input.g_a),
+ viennacl::cuda_arg(input.g_b) + 1,
+ mat_size,
+ viennacl::cuda_arg(result.vcl_g_left),
+ viennacl::cuda_arg(result.vcl_g_right),
+ viennacl::cuda_arg(result.vcl_g_left_count),
+ viennacl::cuda_arg(result.vcl_g_right_count),
+ lg, ug, 0, mat_size,
+ precision
+ );
+ viennacl::linalg::cuda::VIENNACL_CUDA_LAST_ERROR_CHECK("Kernel launch failed");
+}
+
+
+template<typename NumericT>
+void bisectLarge(const viennacl::linalg::detail::InputData<NumericT> &input, viennacl::linalg::detail::ResultDataLarge<NumericT> &result,
+ const unsigned int mat_size,
+ const NumericT lg, const NumericT ug,
+ const NumericT precision)
+ {
+
+ dim3 blocks(1, 1, 1);
+ dim3 threads(mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2 , 1, 1);
+ bisectKernelLarge<<< blocks, threads >>>
+ (viennacl::cuda_arg(input.g_a),
+ viennacl::cuda_arg(input.g_b) + 1,
+ mat_size,
+ lg, ug, static_cast<unsigned int>(0), mat_size, precision,
+ viennacl::cuda_arg(result.g_num_one),
+ viennacl::cuda_arg(result.g_num_blocks_mult),
+ viennacl::cuda_arg(result.g_left_one),
+ viennacl::cuda_arg(result.g_right_one),
+ viennacl::cuda_arg(result.g_pos_one),
+ viennacl::cuda_arg(result.g_left_mult),
+ viennacl::cuda_arg(result.g_right_mult),
+ viennacl::cuda_arg(result.g_left_count_mult),
+ viennacl::cuda_arg(result.g_right_count_mult),
+ viennacl::cuda_arg(result.g_blocks_mult),
+ viennacl::cuda_arg(result.g_blocks_mult_sum)
+ );
+ viennacl::linalg::cuda::VIENNACL_CUDA_LAST_ERROR_CHECK("Kernel launch failed.");
+}
+
+
+// compute eigenvalues for intervals that contained only one eigenvalue
+// after the first processing step
+template<typename NumericT>
+void bisectLarge_OneIntervals(const viennacl::linalg::detail::InputData<NumericT> &input, viennacl::linalg::detail::ResultDataLarge<NumericT> &result,
+ const unsigned int mat_size,
+ const NumericT precision)
+ {
+
+ unsigned int num_one_intervals = result.g_num_one;
+ unsigned int num_blocks = viennacl::linalg::detail::getNumBlocksLinear(num_one_intervals,
+ mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2);
+ dim3 grid_onei;
+ grid_onei.x = num_blocks;
+  grid_onei.y = 1; grid_onei.z = 1;
+ dim3 threads_onei(mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2, 1, 1);
+
+
+ bisectKernelLarge_OneIntervals<<< grid_onei , threads_onei >>>
+ (viennacl::cuda_arg(input.g_a),
+ viennacl::cuda_arg(input.g_b) + 1,
+ mat_size, num_one_intervals,
+ viennacl::cuda_arg(result.g_left_one),
+ viennacl::cuda_arg(result.g_right_one),
+ viennacl::cuda_arg(result.g_pos_one),
+ precision
+ );
+ viennacl::linalg::cuda::VIENNACL_CUDA_LAST_ERROR_CHECK("bisectKernelLarge_OneIntervals() FAILED.");
+}
+
+
+// process intervals that contained more than one eigenvalue after
+// the first processing step
+template<typename NumericT>
+void bisectLarge_MultIntervals(const viennacl::linalg::detail::InputData<NumericT> &input, viennacl::linalg::detail::ResultDataLarge<NumericT> &result,
+ const unsigned int mat_size,
+ const NumericT precision)
+ {
+ // get the number of blocks of intervals that contain, in total when
+ // each interval contains only one eigenvalue, not more than
+ // MAX_THREADS_BLOCK threads
+ unsigned int num_blocks_mult = result.g_num_blocks_mult;
+
+ // setup the execution environment
+ dim3 grid_mult(num_blocks_mult, 1, 1);
+ dim3 threads_mult(mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2, 1, 1);
+
+ bisectKernelLarge_MultIntervals<<< grid_mult, threads_mult >>>
+ (viennacl::cuda_arg(input.g_a),
+ viennacl::cuda_arg(input.g_b) + 1,
+ mat_size,
+ viennacl::cuda_arg(result.g_blocks_mult),
+ viennacl::cuda_arg(result.g_blocks_mult_sum),
+ viennacl::cuda_arg(result.g_left_mult),
+ viennacl::cuda_arg(result.g_right_mult),
+ viennacl::cuda_arg(result.g_left_count_mult),
+ viennacl::cuda_arg(result.g_right_count_mult),
+ viennacl::cuda_arg(result.g_lambda_mult),
+ viennacl::cuda_arg(result.g_pos_mult),
+ precision
+ );
+ viennacl::linalg::cuda::VIENNACL_CUDA_LAST_ERROR_CHECK("bisectKernelLarge_MultIntervals() FAILED.");
+}
+}
+}
+}
+
+#endif
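A minimal sketch of how the three large-matrix entry points above are chained (the driver name bisect_large_driver is illustrative; constructing InputData and ResultDataLarge is not shown):

  template<typename NumericT>
  void bisect_large_driver(viennacl::linalg::detail::InputData<NumericT>       & input,
                           viennacl::linalg::detail::ResultDataLarge<NumericT> & result,
                           unsigned int mat_size, NumericT lg, NumericT ug, NumericT precision)
  {
    // step 1: subdivide the Gerschgorin interval [lg, ug] into intervals holding one
    // eigenvalue or a block of eigenvalues
    viennacl::linalg::cuda::bisectLarge(input, result, mat_size, lg, ug, precision);
    // step 2a: refine intervals that ended up containing exactly one eigenvalue
    viennacl::linalg::cuda::bisectLarge_OneIntervals(input, result, mat_size, precision);
    // step 2b: refine blocks of intervals containing multiple eigenvalues
    viennacl::linalg::cuda::bisectLarge_MultIntervals(input, result, mat_size, precision);
  }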
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large.hpp
new file mode 100755
index 0000000..77c9773
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large.hpp
@@ -0,0 +1,928 @@
+#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_HPP_
+#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/cuda/bisect_kernel_large.hpp
+ @brief First step of the bisection algorithm for the computation of eigenvalues.
+
+ Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+ the creation of derivative works is allowed by including the following statement:
+ "This software contains source code provided by NVIDIA Corporation."
+*/
+
+
+/* Determine eigenvalues for large symmetric, tridiagonal matrix. First
+ step of the computation. */
+
+// includes, project
+#include "viennacl/linalg/detail/bisect/config.hpp"
+#include "viennacl/linalg/detail/bisect/util.hpp"
+// additional kernel
+#include "viennacl/linalg/cuda/bisect_util.hpp"
+
+// declaration, forward
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+////////////////////////////////////////////////////////////////////////////////
+//! Write data to global memory
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+__device__
+void writeToGmem(const unsigned int tid, const unsigned int tid_2,
+ const unsigned int num_threads_active,
+ const unsigned int num_blocks_mult,
+ NumericT *g_left_one, NumericT *g_right_one,
+ unsigned int *g_pos_one,
+ NumericT *g_left_mult, NumericT *g_right_mult,
+ unsigned int *g_left_count_mult,
+ unsigned int *g_right_count_mult,
+ NumericT *s_left, NumericT *s_right,
+ unsigned short *s_left_count, unsigned short *s_right_count,
+ unsigned int *g_blocks_mult,
+ unsigned int *g_blocks_mult_sum,
+ unsigned short *s_compaction_list,
+ unsigned short *s_cl_helper,
+ unsigned int offset_mult_lambda
+ )
+{
+
+ if (tid < offset_mult_lambda)
+ {
+
+ g_left_one[tid] = s_left[tid];
+ g_right_one[tid] = s_right[tid];
+ // right count can be used to order eigenvalues without sorting
+ g_pos_one[tid] = s_right_count[tid];
+ }
+ else
+ {
+
+
+ g_left_mult[tid - offset_mult_lambda] = s_left[tid];
+ g_right_mult[tid - offset_mult_lambda] = s_right[tid];
+ g_left_count_mult[tid - offset_mult_lambda] = s_left_count[tid];
+ g_right_count_mult[tid - offset_mult_lambda] = s_right_count[tid];
+ }
+
+ if (tid_2 < num_threads_active)
+ {
+
+ if (tid_2 < offset_mult_lambda)
+ {
+
+ g_left_one[tid_2] = s_left[tid_2];
+ g_right_one[tid_2] = s_right[tid_2];
+ // right count can be used to order eigenvalues without sorting
+ g_pos_one[tid_2] = s_right_count[tid_2];
+ }
+ else
+ {
+
+ g_left_mult[tid_2 - offset_mult_lambda] = s_left[tid_2];
+ g_right_mult[tid_2 - offset_mult_lambda] = s_right[tid_2];
+ g_left_count_mult[tid_2 - offset_mult_lambda] = s_left_count[tid_2];
+ g_right_count_mult[tid_2 - offset_mult_lambda] = s_right_count[tid_2];
+ }
+
+ } // end writing out data
+
+ __syncthreads();
+
+  // note that s_cl_blocking = s_compaction_list + 1; that is, by writing out
+ // s_compaction_list we write the exclusive scan result
+ if (tid <= num_blocks_mult)
+ {
+ g_blocks_mult[tid] = s_compaction_list[tid];
+ g_blocks_mult_sum[tid] = s_cl_helper[tid];
+ }
+
+ if (tid_2 <= num_blocks_mult)
+ {
+ g_blocks_mult[tid_2] = s_compaction_list[tid_2];
+ g_blocks_mult_sum[tid_2] = s_cl_helper[tid_2];
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Perform final stream compaction before writing data to global memory
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+__device__
+void
+compactStreamsFinal(const unsigned int tid, const unsigned int tid_2,
+ const unsigned int num_threads_active,
+ unsigned int &offset_mult_lambda,
+ NumericT *s_left, NumericT *s_right,
+ unsigned short *s_left_count, unsigned short *s_right_count,
+ unsigned short *s_cl_one, unsigned short *s_cl_mult,
+ unsigned short *s_cl_blocking, unsigned short *s_cl_helper,
+ unsigned int is_one_lambda, unsigned int is_one_lambda_2,
+ NumericT &left, NumericT &right, NumericT &left_2, NumericT &right_2,
+ unsigned int &left_count, unsigned int &right_count,
+ unsigned int &left_count_2, unsigned int &right_count_2,
+ unsigned int c_block_iend, unsigned int c_sum_block,
+ unsigned int c_block_iend_2, unsigned int c_sum_block_2
+ )
+{
+ // cache data before performing compaction
+ left = s_left[tid];
+ right = s_right[tid];
+
+ if (tid_2 < num_threads_active)
+ {
+
+ left_2 = s_left[tid_2];
+ right_2 = s_right[tid_2];
+ }
+
+ __syncthreads();
+
+ // determine addresses for intervals containing multiple eigenvalues and
+ // addresses for blocks of intervals
+ unsigned int ptr_w = 0;
+ unsigned int ptr_w_2 = 0;
+ unsigned int ptr_blocking_w = 0;
+ unsigned int ptr_blocking_w_2 = 0;
+
+
+
+ ptr_w = (1 == is_one_lambda) ? s_cl_one[tid]
+ : s_cl_mult[tid] + offset_mult_lambda;
+
+ if (0 != c_block_iend)
+ {
+ ptr_blocking_w = s_cl_blocking[tid];
+ }
+
+ if (tid_2 < num_threads_active)
+ {
+ ptr_w_2 = (1 == is_one_lambda_2) ? s_cl_one[tid_2]
+ : s_cl_mult[tid_2] + offset_mult_lambda;
+
+ if (0 != c_block_iend_2)
+ {
+ ptr_blocking_w_2 = s_cl_blocking[tid_2];
+ }
+ }
+
+
+ __syncthreads();
+ if(tid < num_threads_active)
+ {
+ // store compactly in shared mem
+ s_left[ptr_w] = left;
+ s_right[ptr_w] = right;
+ s_left_count[ptr_w] = left_count;
+ s_right_count[ptr_w] = right_count;
+ }
+
+
+ __syncthreads();
+ if(tid == 1)
+ {
+ s_left[ptr_w] = left;
+ s_right[ptr_w] = right;
+ s_left_count[ptr_w] = left_count;
+ s_right_count[ptr_w] = right_count;
+ }
+ if (0 != c_block_iend)
+ {
+ s_cl_blocking[ptr_blocking_w + 1] = c_block_iend - 1;
+ s_cl_helper[ptr_blocking_w + 1] = c_sum_block;
+ }
+
+ if (tid_2 < num_threads_active)
+ {
+
+ // store compactly in shared mem
+ s_left[ptr_w_2] = left_2;
+ s_right[ptr_w_2] = right_2;
+ s_left_count[ptr_w_2] = left_count_2;
+ s_right_count[ptr_w_2] = right_count_2;
+
+ if (0 != c_block_iend_2)
+ {
+ s_cl_blocking[ptr_blocking_w_2 + 1] = c_block_iend_2 - 1;
+ s_cl_helper[ptr_blocking_w_2 + 1] = c_sum_block_2;
+ }
+ }
+
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compute addresses to obtain compact list of block start addresses
+////////////////////////////////////////////////////////////////////////////////
+inline __device__
+void
+scanCompactBlocksStartAddress(const unsigned int tid, const unsigned int tid_2,
+ const unsigned int num_threads_compaction,
+ unsigned short *s_cl_blocking,
+ unsigned short *s_cl_helper
+ )
+{
+ // prepare for second step of block generation: compaction of the block
+ // list itself to efficiently write out these
+ s_cl_blocking[tid] = s_cl_helper[tid];
+
+ if (tid_2 < num_threads_compaction)
+ {
+ s_cl_blocking[tid_2] = s_cl_helper[tid_2];
+ }
+
+ __syncthreads();
+
+  // additional scan to compact s_cl_blocking; this makes it possible to generate a
+  // compact list of eigenvalue blocks, each one containing about
+  // VIENNACL_BISECT_MAX_THREADS_BLOCK eigenvalues (so that each of these blocks may be
+  // processed by one thread block in a subsequent processing step)
+
+ unsigned int offset = 1;
+
+ // build scan tree
+ for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)
+ {
+
+ __syncthreads();
+
+ if (tid < d)
+ {
+
+ unsigned int ai = offset*(2*tid+1)-1;
+ unsigned int bi = offset*(2*tid+2)-1;
+ s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai];
+ }
+
+ offset <<= 1;
+ }
+
+ // traverse down tree: first down to level 2 across
+ for (int d = 2; d < num_threads_compaction; d <<= 1)
+ {
+
+ offset >>= 1;
+ __syncthreads();
+
+ //
+ if (tid < (d-1))
+ {
+
+ unsigned int ai = offset*(tid+1) - 1;
+ unsigned int bi = ai + (offset >> 1);
+ s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai];
+ }
+ }
+
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Perform scan to obtain number of eigenvalues before a specific block
+////////////////////////////////////////////////////////////////////////////////
+inline __device__
+void
+scanSumBlocks(const unsigned int tid, const unsigned int tid_2,
+ const unsigned int num_threads_active,
+ const unsigned int num_threads_compaction,
+ unsigned short *s_cl_blocking,
+ unsigned short *s_cl_helper)
+{
+ unsigned int offset = 1;
+
+ // first step of scan to build the sum of elements within each block
+ // build up tree
+ for (int d = num_threads_compaction >> 1; d > 0; d >>= 1)
+ {
+
+ __syncthreads();
+
+ if (tid < d)
+ {
+
+ unsigned int ai = offset*(2*tid+1)-1;
+ unsigned int bi = offset*(2*tid+2)-1;
+
+ s_cl_blocking[bi] += s_cl_blocking[ai];
+ }
+
+ offset *= 2;
+ }
+
+ // first step of scan to build the sum of elements within each block
+ // traverse down tree
+ for (int d = 2; d < (num_threads_compaction - 1); d <<= 1)
+ {
+
+ offset >>= 1;
+ __syncthreads();
+
+ if (tid < (d-1))
+ {
+
+ unsigned int ai = offset*(tid+1) - 1;
+ unsigned int bi = ai + (offset >> 1);
+
+ s_cl_blocking[bi] += s_cl_blocking[ai];
+ }
+ }
+
+ __syncthreads();
+
+ if (0 == tid)
+ {
+
+ // move last element of scan to last element that is valid
+ // necessary because the number of threads employed for scan is a power
+    // of two and not necessarily the number of active threads
+ s_cl_helper[num_threads_active - 1] =
+ s_cl_helper[num_threads_compaction - 1];
+ s_cl_blocking[num_threads_active - 1] =
+ s_cl_blocking[num_threads_compaction - 1];
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Perform initial scan for compaction of intervals containing one and
+//! multiple eigenvalues; also do initial scan to build blocks
+////////////////////////////////////////////////////////////////////////////////
+inline __device__
+void
+scanInitial(const unsigned int tid, const unsigned int tid_2, const unsigned int mat_size,
+ const unsigned int num_threads_active,
+ const unsigned int num_threads_compaction,
+ unsigned short *s_cl_one, unsigned short *s_cl_mult,
+ unsigned short *s_cl_blocking, unsigned short *s_cl_helper
+ )
+{
+
+ // perform scan to compactly write out the intervals containing one and
+ // multiple eigenvalues
+ // also generate tree for blocking of intervals containing multiple
+ // eigenvalues
+
+ unsigned int offset = 1;
+
+ // build scan tree
+ for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)
+ {
+
+ __syncthreads();
+
+ if (tid < d)
+ {
+
+ unsigned int ai = offset*(2*tid+1);
+ unsigned int bi = offset*(2*tid+2)-1;
+
+ s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai - 1];
+ s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai - 1];
+
+ // s_cl_helper is binary and zero for an internal node and 1 for a
+ // root node of a tree corresponding to a block
+ // s_cl_blocking contains the number of nodes in each sub-tree at each
+ // iteration, the data has to be kept to compute the total number of
+ // eigenvalues per block that, in turn, is needed to efficiently
+ // write out data in the second step
+ if ((s_cl_helper[ai - 1] != 1) || (s_cl_helper[bi] != 1))
+ {
+
+          // check how many children are non-terminated
+ if (s_cl_helper[ai - 1] == 1)
+ {
+ // mark as terminated
+ s_cl_helper[bi] = 1;
+ }
+ else if (s_cl_helper[bi] == 1)
+ {
+ // mark as terminated
+ s_cl_helper[ai - 1] = 1;
+ }
+        else // both children are non-terminated
+ {
+
+ unsigned int temp = s_cl_blocking[bi] + s_cl_blocking[ai - 1];
+
+ if (temp > (mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2))
+ {
+
+ // the two child trees have to form separate blocks, terminate trees
+ s_cl_helper[ai - 1] = 1;
+ s_cl_helper[bi] = 1;
+ }
+ else
+ {
+ // build up tree by joining subtrees
+ s_cl_blocking[bi] = temp;
+ s_cl_blocking[ai - 1] = 0;
+ }
+ }
+ } // end s_cl_helper update
+
+ }
+
+ offset <<= 1;
+ }
+
+
+ // traverse down tree, this only for stream compaction, not for block
+ // construction
+ for (int d = 2; d < num_threads_compaction; d <<= 1)
+ {
+
+ offset >>= 1;
+ __syncthreads();
+
+ //
+ if (tid < (d-1))
+ {
+
+ unsigned int ai = offset*(tid+1) - 1;
+ unsigned int bi = ai + (offset >> 1);
+
+ s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai];
+ s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai];
+ }
+ }
+
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Store all non-empty intervals resulting from the subdivision of the interval
+//! currently processed by the thread
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+__device__
+void
+storeNonEmptyIntervalsLarge(unsigned int addr,
+ const unsigned int num_threads_active,
+ NumericT *s_left, NumericT *s_right,
+ unsigned short *s_left_count,
+ unsigned short *s_right_count,
+ NumericT left, NumericT mid, NumericT right,
+ const unsigned short left_count,
+ const unsigned short mid_count,
+ const unsigned short right_count,
+ NumericT epsilon,
+ unsigned int &compact_second_chunk,
+ unsigned short *s_compaction_list,
+ unsigned int &is_active_second)
+{
+ // check if both child intervals are valid
+ if ((left_count != mid_count) && (mid_count != right_count))
+ {
+
+ storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
+ left, mid, left_count, mid_count, epsilon);
+
+ is_active_second = 1;
+ s_compaction_list[threadIdx.x] = 1;
+ compact_second_chunk = 1;
+ }
+ else
+ {
+
+ // only one non-empty child interval
+
+ // mark that no second child
+ is_active_second = 0;
+ s_compaction_list[threadIdx.x] = 0;
+
+ // store the one valid child interval
+ if (left_count != mid_count)
+ {
+ storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
+ left, mid, left_count, mid_count, epsilon);
+ }
+ else
+ {
+ storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
+ mid, right, mid_count, right_count, epsilon);
+ }
+ }
+}
+
+/** @brief Bisection to find eigenvalues of a real, symmetric, and tridiagonal matrix
+* g_d diagonal elements in global memory
+* g_s  superdiagonal elements in global memory (stored so that the element *(g_s - 1) can be accessed and equals 0)
+* n size of matrix
+* lg lower bound of input interval (e.g. Gerschgorin interval)
+* ug upper bound of input interval (e.g. Gerschgorin interval)
+* lg_eig_count number of eigenvalues that are smaller than lg
+* ug_eig_count  number of eigenvalues that are smaller than ug
+* epsilon desired accuracy of eigenvalues to compute
+*/
+template<typename NumericT>
+__global__
+void
+bisectKernelLarge(const NumericT *g_d, const NumericT *g_s, const unsigned int n,
+ const NumericT lg, const NumericT ug,
+ const unsigned int lg_eig_count,
+ const unsigned int ug_eig_count,
+ NumericT epsilon,
+ unsigned int *g_num_one,
+ unsigned int *g_num_blocks_mult,
+ NumericT *g_left_one, NumericT *g_right_one,
+ unsigned int *g_pos_one,
+ NumericT *g_left_mult, NumericT *g_right_mult,
+ unsigned int *g_left_count_mult,
+ unsigned int *g_right_count_mult,
+ unsigned int *g_blocks_mult,
+ unsigned int *g_blocks_mult_sum
+ )
+{
+ const unsigned int tid = threadIdx.x;
+
+ // intervals (store left and right because the subdivision tree is in general
+  // not dense)
+ __shared__ NumericT s_left[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
+ __shared__ NumericT s_right[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
+
+ // number of eigenvalues that are smaller than s_left / s_right
+ // (correspondence is realized via indices)
+ __shared__ unsigned short s_left_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
+ __shared__ unsigned short s_right_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
+
+ // helper for stream compaction
+ __shared__ unsigned short s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
+
+ // state variables for whole block
+ // if 0 then compaction of second chunk of child intervals is not necessary
+ // (because all intervals had exactly one non-dead child)
+ __shared__ unsigned int compact_second_chunk;
+ // if 1 then all threads are converged
+ __shared__ unsigned int all_threads_converged;
+
+ // number of currently active threads
+ __shared__ unsigned int num_threads_active;
+
+ // number of threads to use for stream compaction
+ __shared__ unsigned int num_threads_compaction;
+
+ // helper for exclusive scan
+ unsigned short *s_compaction_list_exc = s_compaction_list + 1;
+
+
+ // variables for currently processed interval
+ // left and right limit of active interval
+ NumericT left = 0.0f;
+ NumericT right = 0.0f;
+ unsigned int left_count = 0;
+ unsigned int right_count = 0;
+ // midpoint of active interval
+ NumericT mid = 0.0f;
+ // number of eigenvalues smaller then mid
+ unsigned int mid_count = 0;
+ // helper for stream compaction (tracking of threads generating second child)
+ unsigned int is_active_second = 0;
+
+ // initialize lists
+ s_compaction_list[tid] = 0;
+ s_left[tid] = 0;
+ s_right[tid] = 0;
+ s_left_count[tid] = 0;
+ s_right_count[tid] = 0;
+
+ __syncthreads();
+
+ // set up initial configuration
+ if (0 == tid)
+ {
+
+ s_left[0] = lg;
+ s_right[0] = ug;
+ s_left_count[0] = lg_eig_count;
+ s_right_count[0] = ug_eig_count;
+
+ compact_second_chunk = 0;
+ num_threads_active = 1;
+
+ num_threads_compaction = 1;
+
+ all_threads_converged = 1;
+ }
+
+ __syncthreads();
+
+ // for all active threads read intervals from the last level
+ // the number of (worst case) active threads per level l is 2^l
+ // determine coarse intervals. On these intervals the kernel for one or for multiple eigenvalues
+ // will be executed in the second step
+ while(true)
+ {
+ s_compaction_list[tid] = 0;
+ s_compaction_list[tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;
+ s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;
+ subdivideActiveInterval(tid, s_left, s_right, s_left_count, s_right_count,
+ num_threads_active,
+ left, right, left_count, right_count,
+ mid, all_threads_converged);
+
+ __syncthreads();
+
+ // check if done
+ if (1 == all_threads_converged)
+ {
+ break;
+ }
+
+ // compute number of eigenvalues smaller than mid
+ // use all threads for reading the necessary matrix data from global
+ // memory
+ // use s_left and s_right as scratch space for diagonal and
+ // superdiagonal of matrix
+ mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n,
+ mid, threadIdx.x,
+ num_threads_active,
+ s_left, s_right,
+ (left == right));
+
+ __syncthreads();
+
+ // store intervals
+ // for all threads store the first child interval in a continuous chunk of
+ // memory, and the second child interval -- if it exists -- in a second
+ // chunk; it is likely that all threads reach convergence up to
+      // \a epsilon at the same level; furthermore, for higher levels most / all
+      // threads will have only one child. Storing the first child compactly
+      // (first) avoids a compaction step on the first chunk and (second) makes a
+      // compaction of the second chunk unnecessary for higher levels (when all
+      // threads / intervals have exactly one child)
+ if (tid < num_threads_active)
+ {
+
+ if (left != right)
+ {
+
+ // store intervals
+ storeNonEmptyIntervalsLarge(tid, num_threads_active,
+ s_left, s_right,
+ s_left_count, s_right_count,
+ left, mid, right,
+ left_count, mid_count, right_count,
+ epsilon, compact_second_chunk,
+ s_compaction_list_exc,
+ is_active_second);
+ }
+ else
+ {
+
+ // re-write converged interval (has to be stored again because s_left
+ // and s_right are used as scratch space for
+        // computeNumSmallerEigenvalsLarge())
+ s_left[tid] = left;
+ s_right[tid] = left;
+ s_left_count[tid] = left_count;
+ s_right_count[tid] = right_count;
+
+ is_active_second = 0;
+ }
+ }
+
+ // necessary so that compact_second_chunk is up-to-date
+ __syncthreads();
+
+ // perform compaction of chunk where second children are stored
+ // scan of (num_threads_active / 2) elements, thus at most
+ // (num_threads_active / 4) threads are needed
+ if (compact_second_chunk > 0)
+ {
+
+ // create indices for compaction
+ createIndicesCompaction(s_compaction_list_exc, num_threads_compaction);
+ }
+ __syncthreads();
+
+ if (compact_second_chunk > 0)
+ {
+ compactIntervals(s_left, s_right, s_left_count, s_right_count,
+ mid, right, mid_count, right_count,
+ s_compaction_list, num_threads_active,
+ is_active_second);
+ }
+
+ __syncthreads();
+
+ // update state variables
+ if (0 == tid)
+ {
+
+ // update number of active threads with result of reduction
+ num_threads_active += s_compaction_list[num_threads_active];
+ num_threads_compaction = ceilPow2(num_threads_active);
+
+ compact_second_chunk = 0;
+ all_threads_converged = 1;
+ }
+
+ __syncthreads();
+
+ if (num_threads_compaction > blockDim.x)
+ {
+ break;
+ }
+
+ }
+
+ __syncthreads();
+
+ // generate two lists of intervals; one with intervals that contain one
+ // eigenvalue (or are converged), and one with intervals that need further
+ // subdivision
+
+ // perform two scans in parallel
+
+ unsigned int left_count_2;
+ unsigned int right_count_2;
+
+ unsigned int tid_2 = tid + blockDim.x;
+
+ // cache in per thread registers so that s_left_count and s_right_count
+ // can be used for scans
+ left_count = s_left_count[tid];
+ right_count = s_right_count[tid];
+
+ // some threads have to cache data for two intervals
+ if (tid_2 < num_threads_active)
+ {
+ left_count_2 = s_left_count[tid_2];
+ right_count_2 = s_right_count[tid_2];
+ }
+
+ // compaction list for intervals containing one and multiple eigenvalues
+ // do not affect first element for exclusive scan
+ unsigned short *s_cl_one = s_left_count + 1;
+ unsigned short *s_cl_mult = s_right_count + 1;
+
+ // compaction list for generating blocks of intervals containing multiple
+ // eigenvalues
+ unsigned short *s_cl_blocking = s_compaction_list_exc;
+ // helper compaction list for generating blocks of intervals
+ __shared__ unsigned short s_cl_helper[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
+
+ if (0 == tid)
+ {
+ // set to 0 for exclusive scan
+ s_left_count[0] = 0;
+ s_right_count[0] = 0;
+
+ }
+
+ __syncthreads();
+
+ // flag if interval contains one or multiple eigenvalues
+ unsigned int is_one_lambda = 0;
+ unsigned int is_one_lambda_2 = 0;
+
+ // number of eigenvalues in the interval
+ unsigned int multiplicity = right_count - left_count;
+ is_one_lambda = (1 == multiplicity);
+
+ s_cl_one[tid] = is_one_lambda;
+ s_cl_mult[tid] = (! is_one_lambda);
+
+ // (note: s_cl_blocking is non-zero only where s_cl_mult[] is non-zero)
+ s_cl_blocking[tid] = (1 == is_one_lambda) ? 0 : multiplicity;
+ s_cl_helper[tid] = 0;
+
+ if (tid_2 < num_threads_active)
+ {
+
+ unsigned int multiplicity = right_count_2 - left_count_2;
+ is_one_lambda_2 = (1 == multiplicity);
+
+ s_cl_one[tid_2] = is_one_lambda_2;
+ s_cl_mult[tid_2] = (! is_one_lambda_2);
+
+ // (note: s_cl_blocking is non-zero only where s_cl_mult[] is non-zero)
+ s_cl_blocking[tid_2] = (1 == is_one_lambda_2) ? 0 : multiplicity;
+ s_cl_helper[tid_2] = 0;
+ }
+ else if (tid_2 < (2 * (n > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2) + 1))
+ {
+
+ // clear
+ s_cl_blocking[tid_2] = 0;
+ s_cl_helper[tid_2] = 0;
+ }
+
+
+ scanInitial(tid, tid_2, n, num_threads_active, num_threads_compaction,
+ s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper);
+
+ __syncthreads();
+
+ scanSumBlocks(tid, tid_2, num_threads_active,
+ num_threads_compaction, s_cl_blocking, s_cl_helper);
+
+ // end down sweep of scan
+ __syncthreads();
+
+ unsigned int c_block_iend = 0;
+ unsigned int c_block_iend_2 = 0;
+ unsigned int c_sum_block = 0;
+ unsigned int c_sum_block_2 = 0;
+
+  // for each thread / interval that corresponds to the root node of an
+  // interval block, store the start address of the block and the total number
+  // of eigenvalues in all preceding blocks (which particular thread does this
+  // is irrelevant; the constraint is that exactly one thread handles each
+  // interval block)
+ if (1 == s_cl_helper[tid])
+ {
+
+ c_block_iend = s_cl_mult[tid] + 1;
+ c_sum_block = s_cl_blocking[tid];
+ }
+
+ if (1 == s_cl_helper[tid_2])
+ {
+
+ c_block_iend_2 = s_cl_mult[tid_2] + 1;
+ c_sum_block_2 = s_cl_blocking[tid_2];
+ }
+
+ scanCompactBlocksStartAddress(tid, tid_2, num_threads_compaction,
+ s_cl_blocking, s_cl_helper);
+
+
+ // finished second scan for s_cl_blocking
+ __syncthreads();
+
+ // determine the global results
+ __shared__ unsigned int num_blocks_mult;
+ __shared__ unsigned int num_mult;
+ __shared__ unsigned int offset_mult_lambda;
+
+ if (0 == tid)
+ {
+
+ num_blocks_mult = s_cl_blocking[num_threads_active - 1];
+ offset_mult_lambda = s_cl_one[num_threads_active - 1];
+ num_mult = s_cl_mult[num_threads_active - 1];
+
+ *g_num_one = offset_mult_lambda;
+ *g_num_blocks_mult = num_blocks_mult;
+ }
+
+ __syncthreads();
+
+ NumericT left_2, right_2;
+ --s_cl_one;
+ --s_cl_mult;
+ --s_cl_blocking;
+
+ __syncthreads();
+ compactStreamsFinal(tid, tid_2, num_threads_active, offset_mult_lambda,
+ s_left, s_right, s_left_count, s_right_count,
+ s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper,
+ is_one_lambda, is_one_lambda_2,
+ left, right, left_2, right_2,
+ left_count, right_count, left_count_2, right_count_2,
+ c_block_iend, c_sum_block, c_block_iend_2, c_sum_block_2
+ );
+
+ __syncthreads();
+
+ // final adjustment before writing out data to global memory
+ if (0 == tid)
+ {
+ s_cl_blocking[num_blocks_mult] = num_mult;
+ s_cl_helper[0] = 0;
+ }
+
+ __syncthreads();
+
+ // write to global memory
+ writeToGmem(tid, tid_2, num_threads_active, num_blocks_mult,
+ g_left_one, g_right_one, g_pos_one,
+ g_left_mult, g_right_mult, g_left_count_mult, g_right_count_mult,
+ s_left, s_right, s_left_count, s_right_count,
+ g_blocks_mult, g_blocks_mult_sum,
+ s_compaction_list, s_cl_helper, offset_mult_lambda);
+
+}
+}
+}
+}
+#endif // #ifndef _BISECT_KERNEL_LARGE_H_
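A minimal serial CPU sketch of the stream-compaction pattern used above by
createIndicesCompaction() and compactIntervals(): an exclusive prefix sum over
per-thread 0/1 flags gives each flagged element the slot it is scattered to.
All names below are illustrative and not part of ViennaCL.

#include <cstddef>
#include <vector>

// Scatter the flagged elements of 'values' into a dense prefix of 'out';
// returns the number of surviving elements (cf. s_compaction_list[num_threads_active]).
static std::size_t compact_by_flag(std::vector<float> const & values,
                                   std::vector<int>   const & flags,   // 0 or 1 per element
                                   std::vector<float>       & out)
{
  std::vector<std::size_t> idx(flags.size(), 0);
  for (std::size_t i = 1; i < flags.size(); ++i)        // exclusive prefix sum over the flags
    idx[i] = idx[i - 1] + static_cast<std::size_t>(flags[i - 1]);

  std::size_t count = flags.empty() ? 0
                    : idx.back() + static_cast<std::size_t>(flags.back());
  out.resize(count);
  for (std::size_t i = 0; i < flags.size(); ++i)        // scatter step
    if (flags[i])
      out[idx[i]] = values[i];
  return count;
}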
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_multi.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_multi.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_multi.hpp
new file mode 100755
index 0000000..a670256
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_multi.hpp
@@ -0,0 +1,277 @@
+#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_MULTI_HPP_
+#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_MULTI_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/cuda/bisect_kernel_large_multi.hpp
+ @brief Second step of the bisection algorithm for the computation of eigenvalues for large matrices.
+
+ Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+ the creation of derivative works is allowed by including the following statement:
+ "This software contains source code provided by NVIDIA Corporation."
+*/
+
+/* Perform the second step of the bisection algorithm for large matrices, for
+ * intervals that still contained more than one eigenvalue after the first step
+ */
+
+// includes, project
+#include "viennacl/linalg/detail/bisect/config.hpp"
+#include "viennacl/linalg/detail/bisect/util.hpp"
+// additional kernel
+#include "viennacl/linalg/cuda/bisect_util.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+////////////////////////////////////////////////////////////////////////////////
+//! Perform the second step of the bisection algorithm for large matrices, for
+//! intervals that still contained more than one eigenvalue after the first step
+//! @param g_d diagonal elements of symmetric, tridiagonal matrix
+//! @param g_s superdiagonal elements of symmetric, tridiagonal matrix
+//! @param n matrix size
+//! @param blocks_mult start addresses of blocks of intervals that are
+//! processed by one block of threads, each of the
+//! intervals contains more than one eigenvalue
+//! @param blocks_mult_sum total number of eigenvalues / singleton intervals
+//! in one block of intervals
+//! @param g_left left limits of intervals
+//! @param g_right right limits of intervals
+//! @param g_left_count number of eigenvalues less than left limits
+//! @param g_right_count number of eigenvalues less than right limits
+//! @param g_lambda final eigenvalue
+//! @param g_pos index of eigenvalue (in ascending order)
+//! @param precision desired precision of eigenvalues
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+__global__
+void
+bisectKernelLarge_MultIntervals(const NumericT *g_d, const NumericT *g_s, const unsigned int n,
+ unsigned int *blocks_mult,
+ unsigned int *blocks_mult_sum,
+ NumericT *g_left, NumericT *g_right,
+ unsigned int *g_left_count,
+ unsigned int *g_right_count,
+ NumericT *g_lambda, unsigned int *g_pos,
+ NumericT precision
+ )
+{
+ const unsigned int tid = threadIdx.x;
+
+ // left and right limits of interval
+ __shared__ NumericT s_left[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];
+ __shared__ NumericT s_right[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];
+
+ // number of eigenvalues smaller than interval limits
+ __shared__ unsigned int s_left_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];
+ __shared__ unsigned int s_right_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];
+
+  // helper array for the compaction of the second chunk of intervals
+ __shared__ unsigned int s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
+ // compaction list helper for exclusive scan
+ unsigned int *s_compaction_list_exc = s_compaction_list + 1;
+
+ // flag if all threads are converged
+ __shared__ unsigned int all_threads_converged;
+ // number of active threads
+ __shared__ unsigned int num_threads_active;
+ // number of threads to employ for compaction
+ __shared__ unsigned int num_threads_compaction;
+ // flag if second chunk has to be compacted
+ __shared__ unsigned int compact_second_chunk;
+
+ // parameters of block of intervals processed by this block of threads
+ __shared__ unsigned int c_block_start;
+ __shared__ unsigned int c_block_end;
+ __shared__ unsigned int c_block_offset_output;
+
+ // midpoint of currently active interval of the thread
+ NumericT mid = 0.0f;
+ // number of eigenvalues smaller than \a mid
+ unsigned int mid_count = 0;
+ // current interval parameter
+ NumericT left = 0.0f;
+ NumericT right = 0.0f;
+ unsigned int left_count = 0;
+ unsigned int right_count = 0;
+  // helper for compaction: keeps track of which threads have a second child
+ unsigned int is_active_second = 0;
+
+
+ __syncthreads();
+ // initialize common start conditions
+ if (0 == tid)
+ {
+
+ c_block_start = blocks_mult[blockIdx.x];
+ c_block_end = blocks_mult[blockIdx.x + 1];
+ c_block_offset_output = blocks_mult_sum[blockIdx.x];
+
+
+ num_threads_active = c_block_end - c_block_start;
+ s_compaction_list[0] = 0;
+ num_threads_compaction = ceilPow2(num_threads_active);
+
+ all_threads_converged = 1;
+ compact_second_chunk = 0;
+ }
+
+ s_left_count [tid] = 42;
+ s_right_count[tid] = 42;
+ s_left_count [tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;
+ s_right_count[tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;
+
+ __syncthreads();
+
+
+ // read data into shared memory
+ if (tid < num_threads_active)
+ {
+ s_left[tid] = g_left[c_block_start + tid];
+ s_right[tid] = g_right[c_block_start + tid];
+ s_left_count[tid] = g_left_count[c_block_start + tid];
+ s_right_count[tid] = g_right_count[c_block_start + tid];
+ }
+
+ __syncthreads();
+ unsigned int iter = 0;
+ // do until all threads converged
+ while (true)
+ {
+ iter++;
+ //for (int iter=0; iter < 0; iter++) {
+ s_compaction_list[threadIdx.x] = 0;
+ s_compaction_list[threadIdx.x + blockDim.x] = 0;
+ s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;
+
+ // subdivide interval if currently active and not already converged
+ subdivideActiveIntervalMulti(tid, s_left, s_right,
+ s_left_count, s_right_count,
+ num_threads_active,
+ left, right, left_count, right_count,
+ mid, all_threads_converged);
+ __syncthreads();
+
+ // stop if all eigenvalues have been found
+ if (1 == all_threads_converged)
+ {
+
+ break;
+ }
+
+    // compute the number of eigenvalues smaller than mid for all active,
+    // not yet converged intervals; use all threads for loading data from gmem,
+    // with s_left and s_right serving as scratch space for the data loaded
+    // from gmem into shared memory
+ mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n,
+ mid, tid, num_threads_active,
+ s_left, s_right,
+ (left == right));
+
+ __syncthreads();
+
+ if (tid < num_threads_active)
+ {
+
+ // store intervals
+ if (left != right)
+ {
+
+ storeNonEmptyIntervals(tid, num_threads_active,
+ s_left, s_right, s_left_count, s_right_count,
+ left, mid, right,
+ left_count, mid_count, right_count,
+ precision, compact_second_chunk,
+ s_compaction_list_exc,
+ is_active_second);
+
+ }
+ else
+ {
+
+ storeIntervalConverged(s_left, s_right, s_left_count, s_right_count,
+ left, mid, right,
+ left_count, mid_count, right_count,
+ s_compaction_list_exc, compact_second_chunk,
+ num_threads_active,
+ is_active_second);
+
+ }
+ }
+
+ __syncthreads();
+
+ // compact second chunk of intervals if any of the threads generated
+ // two child intervals
+ if (1 == compact_second_chunk)
+ {
+
+ createIndicesCompaction(s_compaction_list_exc, num_threads_compaction);
+ compactIntervals(s_left, s_right, s_left_count, s_right_count,
+ mid, right, mid_count, right_count,
+ s_compaction_list, num_threads_active,
+ is_active_second);
+ }
+
+ __syncthreads();
+
+ // update state variables
+ if (0 == tid)
+ {
+ num_threads_active += s_compaction_list[num_threads_active];
+ num_threads_compaction = ceilPow2(num_threads_active);
+
+ compact_second_chunk = 0;
+ all_threads_converged = 1;
+ }
+
+ __syncthreads();
+
+ // clear
+ s_compaction_list_exc[threadIdx.x] = 0;
+ s_compaction_list_exc[threadIdx.x + blockDim.x] = 0;
+
+ if (num_threads_compaction > blockDim.x)
+ {
+ break;
+ }
+
+
+ __syncthreads();
+
+ } // end until all threads converged
+
+ // write data back to global memory
+ if (tid < num_threads_active)
+ {
+
+ unsigned int addr = c_block_offset_output + tid;
+
+ g_lambda[addr] = s_left[tid];
+ g_pos[addr] = s_right_count[tid];
+ }
+}
+} // namespace cuda
+} // namespace linalg
+} // namespace viennacl
+
+#endif // #ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_MULTI_HPP_
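One plausible host-side launch of the kernel above, assuming the first bisection
step has already produced the device buffers named in its parameter list; the
wrapper, its grid/block configuration and the variable names are illustrative
assumptions, not ViennaCL's actual host code.

#include <cuda_runtime.h>
#include "viennacl/linalg/cuda/bisect_kernel_large_multi.hpp"

template<typename NumericT>
void launch_mult_intervals_step(NumericT const * d_diag, NumericT const * d_superdiag,
                                unsigned int n, unsigned int num_blocks_mult,
                                unsigned int * d_blocks_mult, unsigned int * d_blocks_mult_sum,
                                NumericT * d_left, NumericT * d_right,
                                unsigned int * d_left_count, unsigned int * d_right_count,
                                NumericT * d_lambda, unsigned int * d_pos,
                                NumericT precision)
{
  dim3 grid(num_blocks_mult, 1, 1);                       // one CUDA block per block of intervals
  dim3 threads(VIENNACL_BISECT_MAX_THREADS_BLOCK, 1, 1);  // matches the shared-memory sizing above
  viennacl::linalg::cuda::bisectKernelLarge_MultIntervals<<<grid, threads>>>(
      d_diag, d_superdiag, n,
      d_blocks_mult, d_blocks_mult_sum,
      d_left, d_right, d_left_count, d_right_count,
      d_lambda, d_pos, precision);
  cudaDeviceSynchronize();                                // wait until the eigenvalues are written back
}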
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_row.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_row.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_row.hpp
new file mode 100644
index 0000000..45d6987
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_row.hpp
@@ -0,0 +1,1468 @@
+#ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_
+#define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/matrix_operations_row.hpp
+ @brief Implementations of row-major dense matrix related operations, including matrix-vector products, using CUDA.
+*/
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
+template<typename DestNumericT, typename SrcNumericT>
+__global__ void convert_row_kernel(
+ DestNumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const SrcNumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2];
+}
+
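The kernels in this file all address a row-major (sub-)matrix through the same
index expression; a small helper making that addressing explicit (illustrative,
not part of the commit):

// element (row, col) of the sub-matrix starting at (start1, start2) with
// strides (inc1, inc2), stored row-major with internal_size2 padded columns
inline unsigned int row_major_index(unsigned int row,    unsigned int col,
                                    unsigned int start1, unsigned int start2,
                                    unsigned int inc1,   unsigned int inc2,
                                    unsigned int internal_size2)
{
  return (row * inc1 + start1) * internal_size2 + (col * inc2 + start2);
}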
+//Matrix transpose kernel
+template<typename NumericT>
+__global__ void trans_kernel(
+ const NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_stride1, unsigned int A_stride2,
+
+ NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2,
+ unsigned int B_stride1, unsigned int B_stride2,
+ bool data_major)
+{
+ for(unsigned int row = blockIdx.x; row<A_size1; row+=gridDim.x)
+ {
+ for(unsigned int col = threadIdx.x; col<A_size2; col+=blockDim.x)
+ {
+ if(data_major)
+ B[(B_start1 + B_stride1 * col) * B_internal_size2 + (B_start2 + B_stride2 * row)] = A[(A_start1 + A_stride1 * row) * A_internal_size2 + (A_start2 + A_stride2 * col)];
+ else
+ B[(B_start1 + B_stride1 * col) + (B_start2 + B_stride2 * row) * B_internal_size1] = A[(A_start1 + A_stride1 * row) + (A_start2 + A_stride2 * col) * A_internal_size1];
+ }
+ }
+}
+
+//
+// am
+//
+
+// alpha on CPU
+template<typename NumericT>
+__global__ void am_row_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ NumericT fac2,
+ unsigned int options2,
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ NumericT alpha = fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ if (options2 & (1 << 1))
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha;
+ }
+ else
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha;
+ }
+}
+
+// alpha on GPU
+template<typename NumericT>
+__global__ void am_row_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * fac2,
+ unsigned int options2,
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ NumericT alpha = *fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ if (options2 & (1 << 1))
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha;
+ }
+ else
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha;
+ }
+}
+
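In the am/ambm kernels of this file, the options2/options3 words encode how a
scaling factor is applied: bit 0 set means the factor is negated, bit 1 set
means the matrix entries are divided by the factor instead of multiplied. A
host-side sketch of the same decoding (illustrative names only):

template<typename NumericT>
NumericT apply_scaled(NumericT value, NumericT factor, unsigned int options)
{
  NumericT f = (options & (1u << 0)) ? -factor : factor;   // bit 0: flip the sign of the factor
  return (options & (1u << 1)) ? value / f : value * f;    // bit 1: divide instead of multiply
}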
+
+//
+// ambm
+//
+
+// alpha and beta on CPU
+template<typename NumericT>
+__global__ void ambm_row_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ NumericT fac2,
+ unsigned int options2,
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2,
+
+ NumericT fac3,
+ unsigned int options3,
+ const NumericT * C,
+ unsigned int C_start1, unsigned int C_start2,
+ unsigned int C_inc1, unsigned int C_inc2,
+ unsigned int C_internal_size1, unsigned int C_internal_size2)
+{
+ NumericT alpha = fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+
+ NumericT beta = fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ if (options2 & (1 << 1))
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+ }
+ else
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+ }
+ }
+ else
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+ }
+ else
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+ }
+ }
+}
+
+
+// alpha on CPU, beta on GPU
+template<typename NumericT>
+__global__ void ambm_row_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ NumericT fac2,
+ unsigned int options2,
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2,
+
+ const NumericT * fac3,
+ unsigned int options3,
+ const NumericT * C,
+ unsigned int C_start1, unsigned int C_start2,
+ unsigned int C_inc1, unsigned int C_inc2,
+ unsigned int C_internal_size1, unsigned int C_internal_size2)
+{
+ NumericT alpha = fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+
+ NumericT beta = *fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ if (options2 & (1 << 1))
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+ }
+ else
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+ }
+ }
+ else
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+ }
+ else
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+ }
+ }
+}
+
+// alpha on GPU, beta on CPU
+template<typename NumericT>
+__global__ void ambm_row_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * fac2,
+ unsigned int options2,
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2,
+
+ NumericT fac3,
+ unsigned int options3,
+ const NumericT * C,
+ unsigned int C_start1, unsigned int C_start2,
+ unsigned int C_inc1, unsigned int C_inc2,
+ unsigned int C_internal_size1, unsigned int C_internal_size2)
+{
+ NumericT alpha = *fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+
+ NumericT beta = fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ if (options2 & (1 << 1))
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+ }
+ else
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+ }
+ }
+ else
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+ }
+ else
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+ }
+ }
+}
+
+
+// alpha and beta on GPU
+template<typename NumericT>
+__global__ void ambm_row_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * fac2,
+ unsigned int options2,
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2,
+
+ const NumericT * fac3,
+ unsigned int options3,
+ const NumericT * C,
+ unsigned int C_start1, unsigned int C_start2,
+ unsigned int C_inc1, unsigned int C_inc2,
+ unsigned int C_internal_size1, unsigned int C_internal_size2)
+{
+ NumericT alpha = *fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+
+ NumericT beta = *fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ if (options2 & (1 << 1))
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+ }
+ else
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+ }
+ }
+ else
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+ }
+ else
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+ }
+ }
+}
+
+
+//
+// ambm_m
+//
+
+// alpha and beta on CPU
+template<typename NumericT>
+__global__ void ambm_m_row_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ NumericT fac2,
+ unsigned int options2,
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2,
+
+ NumericT fac3,
+ unsigned int options3,
+ const NumericT * C,
+ unsigned int C_start1, unsigned int C_start2,
+ unsigned int C_inc1, unsigned int C_inc2,
+ unsigned int C_internal_size1, unsigned int C_internal_size2)
+{
+ NumericT alpha = fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+
+ NumericT beta = fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ if (options2 & (1 << 1))
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+ }
+ else
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+ }
+ }
+ else
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+ }
+ else
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+ }
+ }
+}
+
+
+// alpha on CPU, beta on GPU
+template<typename NumericT>
+__global__ void ambm_m_row_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ NumericT fac2,
+ unsigned int options2,
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2,
+
+ const NumericT * fac3,
+ unsigned int options3,
+ const NumericT * C,
+ unsigned int C_start1, unsigned int C_start2,
+ unsigned int C_inc1, unsigned int C_inc2,
+ unsigned int C_internal_size1, unsigned int C_internal_size2)
+{
+ NumericT alpha = fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+
+ NumericT beta = *fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ if (options2 & (1 << 1))
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+ }
+ else
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+ }
+ }
+ else
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+ }
+ else
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+ }
+ }
+}
+
+// alpha on GPU, beta on CPU
+template<typename NumericT>
+__global__ void ambm_m_row_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * fac2,
+ unsigned int options2,
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2,
+
+ NumericT fac3,
+ unsigned int options3,
+ const NumericT * C,
+ unsigned int C_start1, unsigned int C_start2,
+ unsigned int C_inc1, unsigned int C_inc2,
+ unsigned int C_internal_size1, unsigned int C_internal_size2)
+{
+ NumericT alpha = *fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+
+ NumericT beta = fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ if (options2 & (1 << 1))
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+ }
+ else
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+ }
+ }
+ else
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+ }
+ else
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+ }
+ }
+}
+
+
+// alpha and beta on GPU
+template<typename NumericT>
+__global__ void ambm_m_row_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * fac2,
+ unsigned int options2,
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2,
+
+ const NumericT * fac3,
+ unsigned int options3,
+ const NumericT * C,
+ unsigned int C_start1, unsigned int C_start2,
+ unsigned int C_inc1, unsigned int C_inc2,
+ unsigned int C_internal_size1, unsigned int C_internal_size2)
+{
+ NumericT alpha = *fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+
+ NumericT beta = *fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ if (options2 & (1 << 1))
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+ }
+ else
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+ }
+ }
+ else
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
+ }
+ else
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
+ + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
+ }
+ }
+}
+
+//
+// assignments
+//
+
+template<typename NumericT>
+__global__ void matrix_row_assign_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+ NumericT alpha)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = alpha;
+}
+
+
+template<typename NumericT>
+__global__ void matrix_row_diagonal_assign_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+ NumericT alpha)
+{
+ unsigned int gid = (blockIdx.x * blockDim.x + threadIdx.x);
+
+ for (unsigned int row = gid; row < A_size1; row += blockDim.x * gridDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + row * A_inc2 + A_start2] = alpha;
+}
+
+//
+// binary element-wise operations
+//
+
+template<typename NumericT>
+__global__ void element_op_row_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2,
+
+ const NumericT * C,
+ unsigned int C_start1, unsigned int C_start2,
+ unsigned int C_inc1, unsigned int C_inc2,
+ unsigned int C_internal_size1, unsigned int C_internal_size2,
+
+ unsigned int op_type) //0: product, 1: division, 2: pow
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ if (op_type == 2)
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ = pow(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2],
+ C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2]);
+ }
+ else if (op_type == 1)
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
+ / C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
+ }
+ else if (op_type == 0)
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
+ * C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
+ }
+}
+
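The op_type argument selects the element-wise operation applied to B and C
(0: product, 1: division, 2: pow); the integer variant below omits pow. A
host-side sketch of the same dispatch, with illustrative names:

#include <cmath>

enum class ElementOp : unsigned int { product = 0, division = 1, power = 2 };

template<typename NumericT>
NumericT apply_element_op(NumericT b, NumericT c, ElementOp op)
{
  switch (op)
  {
    case ElementOp::division: return b / c;
    case ElementOp::power:    return std::pow(b, c);
    default:                  return b * c;   // ElementOp::product
  }
}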
+template<typename NumericT>
+__global__ void element_op_int_row_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2,
+
+ const NumericT * C,
+ unsigned int C_start1, unsigned int C_start2,
+ unsigned int C_inc1, unsigned int C_inc2,
+ unsigned int C_internal_size1, unsigned int C_internal_size2,
+
+ unsigned int op_type) //0: product, 1: division, 2: pow
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ if (op_type == 1)
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
+ / C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
+ }
+ else if (op_type == 0)
+ {
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
+ = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
+ * C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
+ }
+}
+
+//
+// unary element-wise operations
+//
+
+// abs
+template<typename NumericT>
+__global__ void matrix_row_element_abs_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = abs(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// acos
+template<typename NumericT>
+__global__ void matrix_row_element_acos_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = acos(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// asin
+template<typename NumericT>
+__global__ void matrix_row_element_asin_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = asin(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// atan
+template<typename NumericT>
+__global__ void matrix_row_element_atan_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = atan(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// ceil
+template<typename NumericT>
+__global__ void matrix_row_element_ceil_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = ceil(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// cos
+template<typename NumericT>
+__global__ void matrix_row_element_cos_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = cos(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// cosh
+template<typename NumericT>
+__global__ void matrix_row_element_cosh_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = cosh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// exp
+template<typename NumericT>
+__global__ void matrix_row_element_exp_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = exp(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// fabs
+template<typename NumericT>
+__global__ void matrix_row_element_fabs_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = fabs(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// floor
+template<typename NumericT>
+__global__ void matrix_row_element_floor_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = floor(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// log
+template<typename NumericT>
+__global__ void matrix_row_element_log_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = log(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// log10
+template<typename NumericT>
+__global__ void matrix_row_element_log10_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = log10(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// sin
+template<typename NumericT>
+__global__ void matrix_row_element_sin_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = sin(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// sinh
+template<typename NumericT>
+__global__ void matrix_row_element_sinh_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = sinh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// sqrt
+template<typename NumericT>
+__global__ void matrix_row_element_sqrt_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = sqrt(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// tan
+template<typename NumericT>
+__global__ void matrix_row_element_tan_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = tan(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+// tanh
+template<typename NumericT>
+__global__ void matrix_row_element_tanh_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2)
+{
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = tanh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
+}
+
+
+
+//
+// matrix-vector product
+//
+
+template<typename NumericT>
+__global__ void vec_mul_row_kernel(
+ const NumericT * A,
+ unsigned int A_row_start,
+ unsigned int A_col_start,
+ unsigned int A_row_inc,
+ unsigned int A_col_inc,
+ unsigned int A_row_size,
+ unsigned int A_col_size,
+ unsigned int A_internal_rows,
+ unsigned int A_internal_cols,
+ const NumericT * v,
+ unsigned int v_start,
+ unsigned int v_inc,
+ unsigned int v_size,
+ NumericT * result,
+ unsigned int result_start,
+ unsigned int result_inc,
+ unsigned int result_size)
+{
+ __shared__ NumericT work[128];
+
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+ unsigned int lid = threadIdx.x;
+
+ for (unsigned int row = row_gid; row < A_row_size; row += gridDim.x)
+ {
+ NumericT dot_prod = 0;
+ for (unsigned int col = col_gid; col < A_col_size; col += blockDim.x)
+ dot_prod += A[(row * A_row_inc + A_row_start) * A_internal_cols + col * A_col_inc + A_col_start] * v[v_start + v_inc * col];
+ work[lid] = dot_prod;
+
+ for (unsigned int stride = blockDim.x/2; stride>0; stride>>=1){
+ __syncthreads();
+ if (lid < stride)
+ work[lid] += work[lid+stride];
+ }
+
+ if (lid == 0)
+ result[row * result_inc + result_start] = work[0];
+ }
+}
+
+
+template<typename NumericT>
+__global__ void trans_vec_mul_row_kernel(
+ const NumericT * A,
+ unsigned int A_row_start,
+ unsigned int A_col_start,
+ unsigned int A_row_inc,
+ unsigned int A_col_inc,
+ unsigned int A_row_size,
+ unsigned int A_col_size,
+ unsigned int A_internal_rows,
+ unsigned int A_internal_cols,
+ const NumericT * v,
+ unsigned int v_start,
+ unsigned int v_inc,
+ unsigned int v_size,
+ NumericT * result,
+ unsigned int result_start,
+ unsigned int result_inc,
+ unsigned int result_size)
+{
+ for (unsigned int row = blockIdx.x * blockDim.x + threadIdx.x; row < A_col_size; row += gridDim.x * blockDim.x)
+ {
+ NumericT dot_prod = 0;
+ for (unsigned int col = 0; col < A_row_size; ++col)
+ dot_prod += A[(row * A_col_inc + A_col_start) + (col * A_row_inc + A_row_start) * A_internal_cols] * v[v_start + v_inc * col];
+ result[row * result_inc + result_start] = dot_prod;
+ }
+}
+
+
+//
+// matrix-matrix products
+//
+
+
+
+
+//
+// scaled rank-1-update
+//
+
+// alpha on CPU
+template<typename NumericT>
+__global__ void scaled_rank1_update_row_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ NumericT val,
+ unsigned int options2,
+
+ const NumericT * vec1,
+ unsigned int start1,
+ unsigned int inc1,
+ unsigned int size1,
+
+ const NumericT * vec2,
+ unsigned int start2,
+ unsigned int inc2,
+ unsigned int size2)
+{
+ NumericT alpha = val;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+ if (options2 & (1 << 1))
+ alpha = NumericT(1) / alpha;
+
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ {
+ NumericT tmp = alpha * vec1[row * inc1 + start1];
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2];
+ }
+}
+
+
+// alpha on GPU
+template<typename NumericT>
+__global__ void scaled_rank1_update_row_kernel(
+ NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+
+ const NumericT * val,
+ unsigned int options2,
+
+ const NumericT * vec1,
+ unsigned int start1,
+ unsigned int inc1,
+ unsigned int size1,
+
+ const NumericT * vec2,
+ unsigned int start2,
+ unsigned int inc2,
+ unsigned int size2)
+{
+ NumericT alpha = *val;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+ if (options2 & (1 << 1))
+ alpha = NumericT(1) / alpha;
+
+ unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
+ unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
+
+ for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
+ {
+ NumericT tmp = alpha * vec1[row * inc1 + start1];
+ for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2];
+ }
+}
+
+
+
+} // namespace cuda
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
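A brief orientation for the kernels in the file above: all of them address A and B as padded row-major buffers, and the (start, inc, internal_size) triples describe a strided sub-matrix view. The helper below is an illustrative, host-side restatement of that address computation (the function name is hypothetical; only the formula mirrors the kernel code):

    // Linear index of element (row, col) of a strided view into a padded
    // row-major buffer, as used by the element-wise kernels above.
    inline unsigned int row_major_index(unsigned int row, unsigned int col,
                                        unsigned int start1, unsigned int start2,
                                        unsigned int inc1,   unsigned int inc2,
                                        unsigned int internal_size2)
    {
      return (row * inc1 + start1) * internal_size2 + (col * inc2 + start2);
    }

The matrix-vector kernel vec_mul_row_kernel additionally performs a shared-memory tree reduction over work[128]; this is only correct when blockDim.x is a power of two no larger than 128, which the launch sites in this backend are expected to respect. The scaled_rank1_update_row_kernel variants compute A += alpha * vec1 * vec2^T, with alpha optionally negated or inverted according to the options bits.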
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/misc_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/misc_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/misc_operations.hpp
new file mode 100644
index 0000000..4821f5b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/misc_operations.hpp
@@ -0,0 +1,91 @@
+#ifndef VIENNACL_LINALG_CUDA_MISC_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_MISC_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/misc_operations.hpp
+ @brief Implementations of miscellaneous operations using CUDA
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/cuda/common.hpp"
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+namespace detail
+{
+
+template<typename NumericT>
+__global__ void level_scheduling_substitute_kernel(
+ const unsigned int * row_index_array,
+ const unsigned int * row_indices,
+ const unsigned int * column_indices,
+ const NumericT * elements,
+ NumericT * vec,
+ unsigned int size)
+{
+ for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
+ row < size;
+ row += gridDim.x * blockDim.x)
+ {
+ unsigned int eq_row = row_index_array[row];
+ NumericT vec_entry = vec[eq_row];
+ unsigned int row_end = row_indices[row+1];
+
+ for (unsigned int j = row_indices[row]; j < row_end; ++j)
+ vec_entry -= vec[column_indices[j]] * elements[j];
+
+ vec[eq_row] = vec_entry;
+ }
+}
+
+
+
+template<typename NumericT>
+void level_scheduling_substitute(vector<NumericT> & vec,
+ viennacl::backend::mem_handle const & row_index_array,
+ viennacl::backend::mem_handle const & row_buffer,
+ viennacl::backend::mem_handle const & col_buffer,
+ viennacl::backend::mem_handle const & element_buffer,
+ vcl_size_t num_rows
+ )
+{
+ level_scheduling_substitute_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(row_index_array),
+ viennacl::cuda_arg<unsigned int>(row_buffer),
+ viennacl::cuda_arg<unsigned int>(col_buffer),
+ viennacl::cuda_arg<NumericT>(element_buffer),
+ viennacl::cuda_arg(vec),
+ static_cast<unsigned int>(num_rows)
+ );
+}
+
+} //namespace detail
+} //namespace cuda
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
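For context, level_scheduling_substitute_kernel performs one level of a level-scheduled sparse triangular substitution: all rows handled in a single launch are mutually independent, so they can be updated in parallel. Restating the kernel's inner loop as a formula, with e(r) the equation index taken from row_index_array and the CSR-style arrays describing the eliminated entries:

    x_{e(r)} \leftarrow x_{e(r)} - \sum_{j=\mathrm{row\_indices}[r]}^{\mathrm{row\_indices}[r+1]-1} \mathrm{elements}[j]\, x_{\mathrm{column\_indices}[j]}

The fixed <<<128, 128>>> launch in level_scheduling_substitute simply grid-strides over all rows of the current level.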
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/nmf_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/nmf_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/nmf_operations.hpp
new file mode 100644
index 0000000..109f74f
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/nmf_operations.hpp
@@ -0,0 +1,152 @@
+#ifndef VIENNACL_LINALG_CUDA_NMF_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_NMF_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/nmf_operations.hpp
+ @brief Implementations of NMF operations using CUDA
+ */
+
+#include "viennacl/linalg/host_based/nmf_operations.hpp"
+
+#include "viennacl/linalg/cuda/common.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
+/** @brief CUDA kernel for the element-wise multiply-divide update used in nonnegative matrix factorization of dense matrices. */
+template<typename NumericT>
+__global__ void el_wise_mul_div(NumericT * matrix1,
+ NumericT const * matrix2,
+ NumericT const * matrix3,
+ unsigned int size)
+{
+ for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i +=gridDim.x * blockDim.x)
+ {
+ NumericT val = matrix1[i] * matrix2[i];
+ NumericT divisor = matrix3[i];
+ matrix1[i] = (divisor > (NumericT) 0.00001) ? (val / divisor) : NumericT(0);
+ }
+}
+
+/** @brief The nonnegative matrix factorization (approximation) algorithm as suggested by Lee and Seung. Factorizes a matrix V with nonnegative entries into matrices W and H such that ||V - W*H|| is minimized.
+ *
+ * @param V Input matrix
+ * @param W First factor
+ * @param H Second factor
+ * @param conf A configuration object holding tolerances and the like
+ */
+template<typename NumericT>
+void nmf(viennacl::matrix_base<NumericT> const & V,
+ viennacl::matrix_base<NumericT> & W,
+ viennacl::matrix_base<NumericT> & H,
+ viennacl::linalg::nmf_config const & conf)
+{
+ vcl_size_t k = W.size2();
+ conf.iters_ = 0;
+
+  if (viennacl::linalg::norm_frobenius(W) <= 0)
+ W = viennacl::scalar_matrix<NumericT>(W.size1(), W.size2(), NumericT(1.0));
+
+  if (viennacl::linalg::norm_frobenius(H) <= 0)
+ H = viennacl::scalar_matrix<NumericT>(H.size1(), H.size2(), NumericT(1.0));
+
+ viennacl::matrix_base<NumericT> wn(V.size1(), k, W.row_major());
+ viennacl::matrix_base<NumericT> wd(V.size1(), k, W.row_major());
+ viennacl::matrix_base<NumericT> wtmp(V.size1(), V.size2(), W.row_major());
+
+ viennacl::matrix_base<NumericT> hn(k, V.size2(), H.row_major());
+ viennacl::matrix_base<NumericT> hd(k, V.size2(), H.row_major());
+ viennacl::matrix_base<NumericT> htmp(k, k, H.row_major());
+
+ viennacl::matrix_base<NumericT> appr(V.size1(), V.size2(), V.row_major());
+
+ viennacl::vector<NumericT> diff(V.size1() * V.size2());
+
+ NumericT last_diff = 0;
+ NumericT diff_init = 0;
+ bool stagnation_flag = false;
+
+ for (vcl_size_t i = 0; i < conf.max_iterations(); i++)
+ {
+ conf.iters_ = i + 1;
+
+ hn = viennacl::linalg::prod(trans(W), V);
+ htmp = viennacl::linalg::prod(trans(W), W);
+ hd = viennacl::linalg::prod(htmp, H);
+
+ el_wise_mul_div<<<128, 128>>>(viennacl::cuda_arg<NumericT>(H),
+ viennacl::cuda_arg<NumericT>(hn),
+ viennacl::cuda_arg<NumericT>(hd),
+ static_cast<unsigned int>(H.internal_size1() * H.internal_size2()));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("el_wise_mul_div");
+
+ wn = viennacl::linalg::prod(V, trans(H));
+ wtmp = viennacl::linalg::prod(W, H);
+ wd = viennacl::linalg::prod(wtmp, trans(H));
+
+ el_wise_mul_div<<<128, 128>>>(viennacl::cuda_arg<NumericT>(W),
+ viennacl::cuda_arg<NumericT>(wn),
+ viennacl::cuda_arg<NumericT>(wd),
+ static_cast<unsigned int>( W.internal_size1() * W.internal_size2()));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("el_wise_mul_div");
+
+ if (i % conf.check_after_steps() == 0) //check for convergence
+ {
+ appr = viennacl::linalg::prod(W, H);
+
+ appr -= V;
+ NumericT diff_val = viennacl::linalg::norm_frobenius(appr);
+
+ if (i == 0)
+ diff_init = diff_val;
+
+ if (conf.print_relative_error())
+ std::cout << diff_val / diff_init << std::endl;
+
+ // Approximation check
+ if (diff_val / diff_init < conf.tolerance())
+ break;
+
+ // Stagnation check
+ if (std::fabs(diff_val - last_diff) / (diff_val * conf.check_after_steps()) < conf.stagnation_tolerance()) //avoid situations where convergence stagnates
+ {
+ if (stagnation_flag) // iteration stagnates (two iterates with no notable progress)
+ break;
+ else
+ // record stagnation in this iteration
+ stagnation_flag = true;
+ } else
+ // good progress in this iteration, so unset stagnation flag
+ stagnation_flag = false;
+
+ // prepare for next iterate:
+ last_diff = diff_val;
+ }
+ }
+}
+
+} //namespace cuda
+} //namespace linalg
+} //namespace viennacl
+
+#endif /* VIENNACL_LINALG_CUDA_NMF_OPERATIONS_HPP_ */
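Written out, the iteration above implements the standard Lee-Seung multiplicative updates: hn/hd and wn/wd hold the element-wise numerators and denominators, and el_wise_mul_div applies the ratio in place. With the product and division taken element-wise:

    H \leftarrow H \circ \frac{W^{T} V}{W^{T} W\, H}, \qquad
    W \leftarrow W \circ \frac{V H^{T}}{W H\, H^{T}}

The 1e-5 guard in el_wise_mul_div leaves an entry at zero whenever its denominator is numerically zero, which keeps the iterates nonnegative and finite.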
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/scalar_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/scalar_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/scalar_operations.hpp
new file mode 100644
index 0000000..3adaca2
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/scalar_operations.hpp
@@ -0,0 +1,375 @@
+#ifndef VIENNACL_LINALG_CUDA_SCALAR_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_SCALAR_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/scalar_operations.hpp
+ @brief Implementations of scalar operations using CUDA
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/stride.hpp"
+#include "viennacl/linalg/cuda/common.hpp"
+
+// includes CUDA
+#include <cuda_runtime.h>
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
+/////////////////// as /////////////////////////////
+
+template<typename NumericT>
+__global__ void as_kernel(NumericT * s1, const NumericT * fac2, unsigned int options2, const NumericT * s2)
+{
+ NumericT alpha = *fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+ if (options2 & (1 << 1))
+ alpha = NumericT(1) / alpha;
+
+ *s1 = *s2 * alpha;
+}
+
+template<typename NumericT>
+__global__ void as_kernel(NumericT * s1, NumericT fac2, unsigned int options2, const NumericT * s2)
+{
+ NumericT alpha = fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+ if (options2 & (1 << 1))
+ alpha = NumericT(1) / alpha;
+
+ *s1 = *s2 * alpha;
+}
+
+template<typename ScalarT1,
+ typename ScalarT2, typename NumericT>
+typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
+ && viennacl::is_scalar<ScalarT2>::value
+ && viennacl::is_any_scalar<NumericT>::value
+ >::type
+as(ScalarT1 & s1,
+ ScalarT2 const & s2, NumericT const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
+{
+ typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type value_type;
+
+ unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+ value_type temporary_alpha = 0;
+ if (viennacl::is_cpu_scalar<NumericT>::value)
+ temporary_alpha = alpha;
+
+ as_kernel<<<1, 1>>>(viennacl::cuda_arg(s1),
+ viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+ options_alpha,
+ viennacl::cuda_arg(s2));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("as_kernel");
+}
+
+//////////////////// asbs ////////////////////////////
+
+// alpha and beta on GPU
+template<typename NumericT>
+__global__ void asbs_kernel(NumericT * s1,
+ const NumericT * fac2, unsigned int options2, const NumericT * s2,
+ const NumericT * fac3, unsigned int options3, const NumericT * s3)
+{
+ NumericT alpha = *fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+ if (options2 & (1 << 1))
+ alpha = NumericT(1) / alpha;
+
+ NumericT beta = *fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+ if (options3 & (1 << 1))
+ beta = NumericT(1) / beta;
+
+ *s1 = *s2 * alpha + *s3 * beta;
+}
+
+// alpha on CPU, beta on GPU
+template<typename NumericT>
+__global__ void asbs_kernel(NumericT * s1,
+ NumericT fac2, unsigned int options2, const NumericT * s2,
+ NumericT const * fac3, unsigned int options3, const NumericT * s3)
+{
+ NumericT alpha = fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+ if (options2 & (1 << 1))
+ alpha = NumericT(1) / alpha;
+
+ NumericT beta = *fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+ if (options3 & (1 << 1))
+ beta = NumericT(1) / beta;
+
+ *s1 = *s2 * alpha + *s3 * beta;
+}
+
+// alpha on GPU, beta on CPU
+template<typename NumericT>
+__global__ void asbs_kernel(NumericT * s1,
+ NumericT const * fac2, unsigned int options2, const NumericT * s2,
+ NumericT fac3, unsigned int options3, const NumericT * s3)
+{
+ NumericT alpha = *fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+ if (options2 & (1 << 1))
+ alpha = NumericT(1) / alpha;
+
+ NumericT beta = fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+ if (options3 & (1 << 1))
+ beta = NumericT(1) / beta;
+
+ *s1 = *s2 * alpha + *s3 * beta;
+}
+
+// alpha and beta on CPU
+template<typename NumericT>
+__global__ void asbs_kernel(NumericT * s1,
+ NumericT fac2, unsigned int options2, const NumericT * s2,
+ NumericT fac3, unsigned int options3, const NumericT * s3)
+{
+ NumericT alpha = fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+ if (options2 & (1 << 1))
+ alpha = NumericT(1) / alpha;
+
+ NumericT beta = fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+ if (options3 & (1 << 1))
+ beta = NumericT(1) / beta;
+
+ *s1 = *s2 * alpha + *s3 * beta;
+}
+
+
+template<typename ScalarT1,
+ typename ScalarT2, typename NumericT1,
+ typename ScalarT3, typename NumericT2>
+typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
+ && viennacl::is_scalar<ScalarT2>::value
+ && viennacl::is_scalar<ScalarT3>::value
+ && viennacl::is_any_scalar<NumericT1>::value
+ && viennacl::is_any_scalar<NumericT2>::value
+ >::type
+asbs(ScalarT1 & s1,
+ ScalarT2 const & s2, NumericT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+ ScalarT3 const & s3, NumericT2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
+{
+ typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type value_type;
+
+ unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+ unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);
+
+ value_type temporary_alpha = 0;
+ if (viennacl::is_cpu_scalar<NumericT1>::value)
+ temporary_alpha = alpha;
+
+ value_type temporary_beta = 0;
+ if (viennacl::is_cpu_scalar<NumericT2>::value)
+ temporary_beta = beta;
+
+ asbs_kernel<<<1, 1>>>(viennacl::cuda_arg(s1),
+ viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+ options_alpha,
+ viennacl::cuda_arg(s2),
+ viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+ options_beta,
+ viennacl::cuda_arg(s3) );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("asbs_kernel");
+}
+
+//////////////////// asbs_s ////////////////////
+
+// alpha and beta on GPU
+template<typename NumericT>
+__global__ void asbs_s_kernel(NumericT * s1,
+ const NumericT * fac2, unsigned int options2, const NumericT * s2,
+ const NumericT * fac3, unsigned int options3, const NumericT * s3)
+{
+ NumericT alpha = *fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+ if (options2 & (1 << 1))
+ alpha = NumericT(1) / alpha;
+
+ NumericT beta = *fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+ if (options3 & (1 << 1))
+ beta = NumericT(1) / beta;
+
+ *s1 += *s2 * alpha + *s3 * beta;
+}
+
+// alpha on CPU, beta on GPU
+template<typename NumericT>
+__global__ void asbs_s_kernel(NumericT * s1,
+ NumericT fac2, unsigned int options2, const NumericT * s2,
+ NumericT const * fac3, unsigned int options3, const NumericT * s3)
+{
+ NumericT alpha = fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+ if (options2 & (1 << 1))
+ alpha = NumericT(1) / alpha;
+
+ NumericT beta = *fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+ if (options3 & (1 << 1))
+ beta = NumericT(1) / beta;
+
+ *s1 += *s2 * alpha + *s3 * beta;
+}
+
+// alpha on GPU, beta on CPU
+template<typename NumericT>
+__global__ void asbs_s_kernel(NumericT * s1,
+ NumericT const * fac2, unsigned int options2, const NumericT * s2,
+ NumericT fac3, unsigned int options3, const NumericT * s3)
+{
+ NumericT alpha = *fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+ if (options2 & (1 << 1))
+ alpha = NumericT(1) / alpha;
+
+ NumericT beta = fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+ if (options3 & (1 << 1))
+ beta = NumericT(1) / beta;
+
+ *s1 += *s2 * alpha + *s3 * beta;
+}
+
+// alpha and beta on CPU
+template<typename NumericT>
+__global__ void asbs_s_kernel(NumericT * s1,
+ NumericT fac2, unsigned int options2, const NumericT * s2,
+ NumericT fac3, unsigned int options3, const NumericT * s3)
+{
+ NumericT alpha = fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+ if (options2 & (1 << 1))
+ alpha = NumericT(1) / alpha;
+
+ NumericT beta = fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+ if (options3 & (1 << 1))
+ beta = NumericT(1) / beta;
+
+ *s1 += *s2 * alpha + *s3 * beta;
+}
+
+
+template<typename ScalarT1,
+ typename ScalarT2, typename NumericT1,
+ typename ScalarT3, typename NumericT2>
+typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
+ && viennacl::is_scalar<ScalarT2>::value
+ && viennacl::is_scalar<ScalarT3>::value
+ && viennacl::is_any_scalar<NumericT1>::value
+ && viennacl::is_any_scalar<NumericT2>::value
+ >::type
+asbs_s(ScalarT1 & s1,
+ ScalarT2 const & s2, NumericT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+ ScalarT3 const & s3, NumericT2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
+{
+ typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type value_type;
+
+ unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+ unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);
+
+ value_type temporary_alpha = 0;
+ if (viennacl::is_cpu_scalar<NumericT1>::value)
+ temporary_alpha = alpha;
+
+ value_type temporary_beta = 0;
+ if (viennacl::is_cpu_scalar<NumericT2>::value)
+ temporary_beta = beta;
+
+ std::cout << "Launching asbs_s_kernel..." << std::endl;
+ asbs_s_kernel<<<1, 1>>>(viennacl::cuda_arg(s1),
+ viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+ options_alpha,
+ viennacl::cuda_arg(s2),
+ viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+ options_beta,
+ viennacl::cuda_arg(s3) );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("asbs_s_kernel");
+}
+
+///////////////// swap //////////////////
+
+template<typename NumericT>
+__global__ void scalar_swap_kernel(NumericT * s1, NumericT * s2)
+{
+ NumericT tmp = *s2;
+ *s2 = *s1;
+ *s1 = tmp;
+}
+
+/** @brief Swaps the contents of two scalars; the data is copied.
+*
+* @param s1 The first scalar
+* @param s2 The second scalar
+*/
+template<typename ScalarT1, typename ScalarT2>
+typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
+ && viennacl::is_scalar<ScalarT2>::value
+ >::type
+swap(ScalarT1 & s1, ScalarT2 & s2)
+{
+ typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type value_type;
+
+  scalar_swap_kernel<<<1, 1>>>(viennacl::cuda_arg(s1), viennacl::cuda_arg(s2));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("scalar_swap_kernel");
+}
+
+
+
+} //namespace cuda
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
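The options2/options3 words decoded at the top of each kernel in this file pack per-coefficient flags: bit 0 requests a sign flip and bit 1 requests the reciprocal of the coefficient. The encoder used by the host wrappers is detail::make_options (defined elsewhere in the library); the sketch below is only an illustration consistent with the decoding above and ignores the coefficient-length argument that the real helper also takes:

    // Illustrative encoder matching the bit tests in the kernels above
    // (bit 0: flip sign, bit 1: take reciprocal). Not the library's helper.
    inline unsigned int make_scalar_options(bool reciprocal, bool flip_sign)
    {
      unsigned int options = 0;
      if (flip_sign)  options |= (1u << 0);
      if (reciprocal) options |= (1u << 1);
      return options;
    }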
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/misc_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/misc_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/misc_operations.hpp
new file mode 100644
index 0000000..11061d9
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/misc_operations.hpp
@@ -0,0 +1,80 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_MISC_OPERATIONS_HPP_
+#define VIENNACL_LINALG_HOST_BASED_MISC_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/misc_operations.hpp
+ @brief Implementations of miscellaneous operations on the CPU using a single thread or OpenMP.
+*/
+
+#include <list>
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/host_based/common.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+namespace detail
+{
+ template<typename NumericT>
+ void level_scheduling_substitute(vector<NumericT> & vec,
+ viennacl::backend::mem_handle const & row_index_array,
+ viennacl::backend::mem_handle const & row_buffer,
+ viennacl::backend::mem_handle const & col_buffer,
+ viennacl::backend::mem_handle const & element_buffer,
+ vcl_size_t num_rows
+ )
+ {
+ NumericT * vec_buf = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(vec.handle());
+
+ unsigned int const * elim_row_index = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(row_index_array);
+ unsigned int const * elim_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(row_buffer);
+ unsigned int const * elim_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(col_buffer);
+ NumericT const * elim_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(element_buffer);
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long row=0; row < static_cast<long>(num_rows); ++row)
+ {
+ unsigned int eq_row = elim_row_index[row];
+ unsigned int row_end = elim_row_buffer[row+1];
+ NumericT vec_entry = vec_buf[eq_row];
+
+ for (vcl_size_t j = elim_row_buffer[row]; j < row_end; ++j)
+ vec_entry -= vec_buf[elim_col_buffer[j]] * elim_elements[j];
+
+ vec_buf[eq_row] = vec_entry;
+ }
+
+ }
+}
+
+} // namespace host_based
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/nmf_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/nmf_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/nmf_operations.hpp
new file mode 100644
index 0000000..bb6557f
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/nmf_operations.hpp
@@ -0,0 +1,247 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_NMF_OPERATIONS_HPP_
+#define VIENNACL_LINALG_HOST_BASED_NMF_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+
+/** @file viennacl/linalg/host_based/nmf_operations.hpp
+ @brief Implementations of NMF operations using a plain single-threaded or OpenMP-enabled execution on CPU
+ */
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_frobenius.hpp"
+
+#include "viennacl/linalg/host_based/common.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief Configuration class for the nonnegative-matrix-factorization algorithm. Specify tolerances, maximum iteration counts, etc., here. */
+class nmf_config
+{
+public:
+ nmf_config(double val_epsilon = 1e-4, double val_epsilon_stagnation = 1e-5,
+ vcl_size_t num_max_iters = 10000, vcl_size_t num_check_iters = 100) :
+ eps_(val_epsilon), stagnation_eps_(val_epsilon_stagnation), max_iters_(num_max_iters), check_after_steps_(
+ (num_check_iters > 0) ? num_check_iters : 1), print_relative_error_(false), iters_(0)
+ {
+ }
+
+ /** @brief Returns the relative tolerance for convergence */
+ double tolerance() const
+ {
+ return eps_;
+ }
+
+ /** @brief Sets the relative tolerance for convergence, i.e. norm(V - W * H) / norm(V - W_init * H_init) */
+ void tolerance(double e)
+ {
+ eps_ = e;
+ }
+
+  /** @brief Returns the relative tolerance for the stagnation check */
+ double stagnation_tolerance() const
+ {
+ return stagnation_eps_;
+ }
+
+ /** @brief Sets the tolerance for the stagnation check (i.e. the minimum required relative change of the residual between two iterations) */
+ void stagnation_tolerance(double e)
+ {
+ stagnation_eps_ = e;
+ }
+
+ /** @brief Returns the maximum number of iterations for the NMF algorithm */
+ vcl_size_t max_iterations() const
+ {
+ return max_iters_;
+ }
+ /** @brief Sets the maximum number of iterations for the NMF algorithm */
+ void max_iterations(vcl_size_t m)
+ {
+ max_iters_ = m;
+ }
+
+ /** @brief Returns the number of iterations of the last NMF run using this configuration object */
+ vcl_size_t iters() const
+ {
+ return iters_;
+ }
+
+  /** @brief Returns the number of steps after which the convergence of NMF should be checked (again) */
+ vcl_size_t check_after_steps() const
+ {
+ return check_after_steps_;
+ }
+ /** @brief Set the number of steps after which the convergence of NMF should be checked (again) */
+ void check_after_steps(vcl_size_t c)
+ {
+ if (c > 0)
+ check_after_steps_ = c;
+ }
+
+ /** @brief Returns the flag specifying whether the relative tolerance should be printed in each iteration */
+ bool print_relative_error() const
+ {
+ return print_relative_error_;
+ }
+ /** @brief Specify whether the relative error should be printed at each convergence check after 'num_check_iters' steps */
+ void print_relative_error(bool b)
+ {
+ print_relative_error_ = b;
+ }
+
+ template<typename ScalarType>
+ friend void nmf(viennacl::matrix_base<ScalarType> const & V,
+ viennacl::matrix_base<ScalarType> & W, viennacl::matrix_base<ScalarType> & H,
+ nmf_config const & conf);
+
+private:
+ double eps_;
+ double stagnation_eps_;
+ vcl_size_t max_iters_;
+ vcl_size_t check_after_steps_;
+ bool print_relative_error_;
+public:
+ mutable vcl_size_t iters_;
+};
+
+namespace host_based
+{
+  /** @brief Host-based kernel (single-threaded or OpenMP) for the element-wise multiply-divide update used in nonnegative matrix factorization of dense matrices. */
+ template<typename NumericT>
+ void el_wise_mul_div(NumericT * matrix1,
+ NumericT const * matrix2,
+ NumericT const * matrix3, vcl_size_t size)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+#pragma omp parallel for
+#endif
+ for (long i2 = 0; i2 < long(size); i2++)
+ {
+ vcl_size_t i = vcl_size_t(i2);
+ NumericT val = matrix1[i] * matrix2[i];
+ NumericT divisor = matrix3[i];
+ matrix1[i] = (divisor > (NumericT) 0.00001) ? (val / divisor) : (NumericT) 0;
+ }
+ }
+
+ /** @brief The nonnegative matrix factorization (approximation) algorithm as suggested by Lee and Seung. Factorizes a matrix V with nonnegative entries into matrices W and H such that ||V - W*H|| is minimized.
+ *
+ * @param V Input matrix
+ * @param W First factor
+ * @param H Second factor
+ * @param conf A configuration object holding tolerances and the like
+ */
+ template<typename NumericT>
+ void nmf(viennacl::matrix_base<NumericT> const & V,
+ viennacl::matrix_base<NumericT> & W,
+ viennacl::matrix_base<NumericT> & H,
+ viennacl::linalg::nmf_config const & conf)
+ {
+ vcl_size_t k = W.size2();
+ conf.iters_ = 0;
+
+ if (viennacl::linalg::norm_frobenius(W) <= 0)
+ W = viennacl::scalar_matrix<NumericT>(W.size1(), W.size2(), NumericT(1.0));
+
+ if (viennacl::linalg::norm_frobenius(H) <= 0)
+ H = viennacl::scalar_matrix<NumericT>(H.size1(), H.size2(), NumericT(1.0));
+
+ viennacl::matrix_base<NumericT> wn(V.size1(), k, W.row_major());
+ viennacl::matrix_base<NumericT> wd(V.size1(), k, W.row_major());
+ viennacl::matrix_base<NumericT> wtmp(V.size1(), V.size2(), W.row_major());
+
+ viennacl::matrix_base<NumericT> hn(k, V.size2(), H.row_major());
+ viennacl::matrix_base<NumericT> hd(k, V.size2(), H.row_major());
+ viennacl::matrix_base<NumericT> htmp(k, k, H.row_major());
+
+ viennacl::matrix_base<NumericT> appr(V.size1(), V.size2(), V.row_major());
+
+ NumericT last_diff = 0;
+ NumericT diff_init = 0;
+ bool stagnation_flag = false;
+
+ for (vcl_size_t i = 0; i < conf.max_iterations(); i++)
+ {
+ conf.iters_ = i + 1;
+
+ hn = viennacl::linalg::prod(trans(W), V);
+ htmp = viennacl::linalg::prod(trans(W), W);
+ hd = viennacl::linalg::prod(htmp, H);
+
+ NumericT * data_H = detail::extract_raw_pointer<NumericT>(H);
+ NumericT * data_hn = detail::extract_raw_pointer<NumericT>(hn);
+ NumericT * data_hd = detail::extract_raw_pointer<NumericT>(hd);
+
+ viennacl::linalg::host_based::el_wise_mul_div(data_H, data_hn, data_hd, H.internal_size1() * H.internal_size2());
+
+ wn = viennacl::linalg::prod(V, trans(H));
+ wtmp = viennacl::linalg::prod(W, H);
+ wd = viennacl::linalg::prod(wtmp, trans(H));
+
+ NumericT * data_W = detail::extract_raw_pointer<NumericT>(W);
+ NumericT * data_wn = detail::extract_raw_pointer<NumericT>(wn);
+ NumericT * data_wd = detail::extract_raw_pointer<NumericT>(wd);
+
+ viennacl::linalg::host_based::el_wise_mul_div(data_W, data_wn, data_wd, W.internal_size1() * W.internal_size2());
+
+ if (i % conf.check_after_steps() == 0) //check for convergence
+ {
+ appr = viennacl::linalg::prod(W, H);
+
+ appr -= V;
+ NumericT diff_val = viennacl::linalg::norm_frobenius(appr);
+
+ if (i == 0)
+ diff_init = diff_val;
+
+ if (conf.print_relative_error())
+ std::cout << diff_val / diff_init << std::endl;
+
+ // Approximation check
+ if (diff_val / diff_init < conf.tolerance())
+ break;
+
+ // Stagnation check
+ if (std::fabs(diff_val - last_diff) / (diff_val * NumericT(conf.check_after_steps())) < conf.stagnation_tolerance()) //avoid situations where convergence stagnates
+ {
+ if (stagnation_flag) // iteration stagnates (two iterates with no notable progress)
+ break;
+ else
+ // record stagnation in this iteration
+ stagnation_flag = true;
+ } else
+ // good progress in this iteration, so unset stagnation flag
+ stagnation_flag = false;
+
+ // prepare for next iterate:
+ last_diff = diff_val;
+ }
+ }
+ }
+
+} //namespace host_based
+} //namespace linalg
+} //namespace viennacl
+
+#endif /* VIENNACL_LINALG_HOST_BASED_NMF_OPERATIONS_HPP_ */
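For orientation, here is a minimal usage sketch of nmf_config together with the host-based factorization defined above. Dimensions and tolerances are placeholders; in a full build the dispatching front end viennacl::linalg::nmf would normally be called rather than the backend function directly:

    // Sketch: factorize a nonnegative 7x6 matrix V into W (7x3) * H (3x6).
    viennacl::matrix<float> V(7, 6), W(7, 3), H(3, 6);
    // ... fill V with nonnegative data and W, H with positive initial guesses ...

    viennacl::linalg::nmf_config conf(1e-4  /* relative tolerance         */,
                                      1e-5  /* stagnation tolerance       */,
                                      5000  /* maximum iterations         */,
                                      50    /* convergence check interval */);

    viennacl::linalg::host_based::nmf(V, W, H, conf);
    std::cout << "NMF finished after " << conf.iters() << " iterations" << std::endl;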
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/scalar_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/scalar_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/scalar_operations.hpp
new file mode 100644
index 0000000..f8a1f3b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/scalar_operations.hpp
@@ -0,0 +1,162 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_SCALAR_OPERATIONS_HPP_
+#define VIENNACL_LINALG_HOST_BASED_SCALAR_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/scalar_operations.hpp
+ @brief Implementations of scalar operations using a plain single-threaded or OpenMP-enabled execution on CPU
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/stride.hpp"
+#include "viennacl/linalg/host_based/common.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+template<typename ScalarT1,
+ typename ScalarT2, typename FactorT>
+typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
+ && viennacl::is_scalar<ScalarT2>::value
+ && viennacl::is_any_scalar<FactorT>::value
+ >::type
+as(ScalarT1 & s1,
+ ScalarT2 const & s2, FactorT const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha)
+{
+ typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type value_type;
+
+ value_type * data_s1 = detail::extract_raw_pointer<value_type>(s1);
+ value_type const * data_s2 = detail::extract_raw_pointer<value_type>(s2);
+
+ value_type data_alpha = alpha;
+ if (flip_sign_alpha)
+ data_alpha = -data_alpha;
+ if (reciprocal_alpha)
+ data_alpha = static_cast<value_type>(1) / data_alpha;
+
+ *data_s1 = *data_s2 * data_alpha;
+}
+
+
+template<typename ScalarT1,
+ typename ScalarT2, typename FactorT2,
+ typename ScalarT3, typename FactorT3>
+typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
+ && viennacl::is_scalar<ScalarT2>::value
+ && viennacl::is_scalar<ScalarT3>::value
+ && viennacl::is_any_scalar<FactorT2>::value
+ && viennacl::is_any_scalar<FactorT3>::value
+ >::type
+asbs(ScalarT1 & s1,
+ ScalarT2 const & s2, FactorT2 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha,
+ ScalarT3 const & s3, FactorT3 const & beta, vcl_size_t /*len_beta*/, bool reciprocal_beta, bool flip_sign_beta)
+{
+ typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type value_type;
+
+ value_type * data_s1 = detail::extract_raw_pointer<value_type>(s1);
+ value_type const * data_s2 = detail::extract_raw_pointer<value_type>(s2);
+ value_type const * data_s3 = detail::extract_raw_pointer<value_type>(s3);
+
+ value_type data_alpha = alpha;
+ if (flip_sign_alpha)
+ data_alpha = -data_alpha;
+ if (reciprocal_alpha)
+ data_alpha = static_cast<value_type>(1) / data_alpha;
+
+ value_type data_beta = beta;
+ if (flip_sign_beta)
+ data_beta = -data_beta;
+ if (reciprocal_beta)
+ data_beta = static_cast<value_type>(1) / data_beta;
+
+ *data_s1 = *data_s2 * data_alpha + *data_s3 * data_beta;
+}
+
+
+template<typename ScalarT1,
+ typename ScalarT2, typename FactorT2,
+ typename ScalarT3, typename FactorT3>
+typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
+ && viennacl::is_scalar<ScalarT2>::value
+ && viennacl::is_scalar<ScalarT3>::value
+ && viennacl::is_any_scalar<FactorT2>::value
+ && viennacl::is_any_scalar<FactorT3>::value
+ >::type
+asbs_s(ScalarT1 & s1,
+ ScalarT2 const & s2, FactorT2 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha,
+ ScalarT3 const & s3, FactorT3 const & beta, vcl_size_t /*len_beta*/, bool reciprocal_beta, bool flip_sign_beta)
+{
+ typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type value_type;
+
+ value_type * data_s1 = detail::extract_raw_pointer<value_type>(s1);
+ value_type const * data_s2 = detail::extract_raw_pointer<value_type>(s2);
+ value_type const * data_s3 = detail::extract_raw_pointer<value_type>(s3);
+
+ value_type data_alpha = alpha;
+ if (flip_sign_alpha)
+ data_alpha = -data_alpha;
+ if (reciprocal_alpha)
+ data_alpha = static_cast<value_type>(1) / data_alpha;
+
+ value_type data_beta = beta;
+ if (flip_sign_beta)
+ data_beta = -data_beta;
+ if (reciprocal_beta)
+ data_beta = static_cast<value_type>(1) / data_beta;
+
+ *data_s1 += *data_s2 * data_alpha + *data_s3 * data_beta;
+}
+
+
+/** @brief Swaps the contents of two scalars; the data is copied.
+*
+* @param s1 The first scalar
+* @param s2 The second scalar
+*/
+template<typename ScalarT1, typename ScalarT2>
+typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
+ && viennacl::is_scalar<ScalarT2>::value
+ >::type
+swap(ScalarT1 & s1, ScalarT2 & s2)
+{
+ typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type value_type;
+
+ value_type * data_s1 = detail::extract_raw_pointer<value_type>(s1);
+ value_type * data_s2 = detail::extract_raw_pointer<value_type>(s2);
+
+ value_type temp = *data_s2;
+ *data_s2 = *data_s1;
+ *data_s1 = temp;
+}
+
+
+
+} //namespace host_based
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/init_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/init_matrix.hpp b/native-viennaCL/src/main/cpp/libviennacl/src/init_matrix.hpp
new file mode 100644
index 0000000..e463e88
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/init_matrix.hpp
@@ -0,0 +1,101 @@
+/* =========================================================================
+ Copyright (c) 2010-2014, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl.hpp"
+#include "viennacl/backend/mem_handle.hpp"
+
+
+
+static ViennaCLStatus init_cuda_matrix(viennacl::backend::mem_handle & h, ViennaCLMatrix A)
+{
+#ifdef VIENNACL_WITH_CUDA
+ h.switch_active_handle_id(viennacl::CUDA_MEMORY);
+ h.cuda_handle().reset(A->cuda_mem);
+ h.cuda_handle().inc();
+ if (A->precision == ViennaCLFloat)
+ h.raw_size(static_cast<viennacl::vcl_size_t>(A->internal_size1) * static_cast<viennacl::vcl_size_t>(A->internal_size2) * sizeof(float)); // not necessary, but still set for conciseness
+ else if (A->precision == ViennaCLDouble)
+ h.raw_size(static_cast<viennacl::vcl_size_t>(A->internal_size1) * static_cast<viennacl::vcl_size_t>(A->internal_size2) * sizeof(double)); // not necessary, but still set for conciseness
+ else
+ return ViennaCLGenericFailure;
+
+ return ViennaCLSuccess;
+#else
+ (void)h;
+ (void)A;
+ return ViennaCLGenericFailure;
+#endif
+}
+
+static ViennaCLStatus init_opencl_matrix(viennacl::backend::mem_handle & h, ViennaCLMatrix A)
+{
+#ifdef VIENNACL_WITH_OPENCL
+ h.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+ h.opencl_handle() = A->opencl_mem;
+ h.opencl_handle().inc();
+ if (A->precision == ViennaCLFloat)
+ h.raw_size(static_cast<viennacl::vcl_size_t>(A->internal_size1) * static_cast<viennacl::vcl_size_t>(A->internal_size2) * sizeof(float)); // not necessary, but still set for conciseness
+ else if (A->precision == ViennaCLDouble)
+ h.raw_size(static_cast<viennacl::vcl_size_t>(A->internal_size1) * static_cast<viennacl::vcl_size_t>(A->internal_size2) * sizeof(double)); // not necessary, but still set for conciseness
+ else
+ return ViennaCLGenericFailure;
+
+ return ViennaCLSuccess;
+#else
+ (void)h;
+ (void)A;
+ return ViennaCLGenericFailure;
+#endif
+}
+
+
+static ViennaCLStatus init_host_matrix(viennacl::backend::mem_handle & h, ViennaCLMatrix A)
+{
+ h.switch_active_handle_id(viennacl::MAIN_MEMORY);
+ h.ram_handle().reset(A->host_mem);
+ h.ram_handle().inc();
+ if (A->precision == ViennaCLFloat)
+ h.raw_size(static_cast<viennacl::vcl_size_t>(A->internal_size1) * static_cast<viennacl::vcl_size_t>(A->internal_size2) * sizeof(float)); // not necessary, but still set for conciseness
+ else if (A->precision == ViennaCLDouble)
+ h.raw_size(static_cast<viennacl::vcl_size_t>(A->internal_size1) * static_cast<viennacl::vcl_size_t>(A->internal_size2) * sizeof(double)); // not necessary, but still set for conciseness
+ else
+ return ViennaCLGenericFailure;
+
+ return ViennaCLSuccess;
+}
+
+
+static ViennaCLStatus init_matrix(viennacl::backend::mem_handle & h, ViennaCLMatrix A)
+{
+ switch (A->backend->backend_type)
+ {
+ case ViennaCLCUDA:
+ return init_cuda_matrix(h, A);
+
+ case ViennaCLOpenCL:
+ return init_opencl_matrix(h, A);
+
+ case ViennaCLHost:
+ return init_host_matrix(h, A);
+
+ default:
+ return ViennaCLGenericFailure;
+ }
+}
+
+
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/init_vector.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/init_vector.hpp b/native-viennaCL/src/main/cpp/libviennacl/src/init_vector.hpp
new file mode 100644
index 0000000..8be00d7
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/init_vector.hpp
@@ -0,0 +1,101 @@
+/* =========================================================================
+ Copyright (c) 2010-2014, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl.hpp"
+#include "viennacl/backend/mem_handle.hpp"
+
+
+
+static ViennaCLStatus init_cuda_vector(viennacl::backend::mem_handle & h, ViennaCLVector x)
+{
+#ifdef VIENNACL_WITH_CUDA
+ h.switch_active_handle_id(viennacl::CUDA_MEMORY);
+ h.cuda_handle().reset(x->cuda_mem);
+ h.cuda_handle().inc();
+ if (x->precision == ViennaCLFloat)
+ h.raw_size(static_cast<viennacl::vcl_size_t>(x->inc) * x->size * sizeof(float)); // not necessary, but still set for conciseness
+ else if (x->precision == ViennaCLDouble)
+ h.raw_size(static_cast<viennacl::vcl_size_t>(x->inc) * x->size * sizeof(double)); // not necessary, but still set for conciseness
+ else
+ return ViennaCLGenericFailure;
+
+ return ViennaCLSuccess;
+#else
+ (void)h;
+ (void)x;
+ return ViennaCLGenericFailure;
+#endif
+}
+
+static ViennaCLStatus init_opencl_vector(viennacl::backend::mem_handle & h, ViennaCLVector x)
+{
+#ifdef VIENNACL_WITH_OPENCL
+ h.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+ h.opencl_handle() = x->opencl_mem;
+ h.opencl_handle().inc();
+ if (x->precision == ViennaCLFloat)
+ h.raw_size(static_cast<viennacl::vcl_size_t>(x->inc) * static_cast<viennacl::vcl_size_t>(x->size) * sizeof(float)); // not necessary, but still set for conciseness
+ else if (x->precision == ViennaCLDouble)
+ h.raw_size(static_cast<viennacl::vcl_size_t>(x->inc) * static_cast<viennacl::vcl_size_t>(x->size) * sizeof(double)); // not necessary, but still set for conciseness
+ else
+ return ViennaCLGenericFailure;
+
+ return ViennaCLSuccess;
+#else
+ (void)h;
+ (void)x;
+ return ViennaCLGenericFailure;
+#endif
+}
+
+
+static ViennaCLStatus init_host_vector(viennacl::backend::mem_handle & h, ViennaCLVector x)
+{
+ h.switch_active_handle_id(viennacl::MAIN_MEMORY);
+ h.ram_handle().reset(x->host_mem);
+ h.ram_handle().inc();
+ if (x->precision == ViennaCLFloat)
+ h.raw_size(static_cast<viennacl::vcl_size_t>(x->inc) * static_cast<viennacl::vcl_size_t>(x->size) * sizeof(float)); // not necessary, but still set for conciseness
+ else if (x->precision == ViennaCLDouble)
+ h.raw_size(static_cast<viennacl::vcl_size_t>(x->inc) * static_cast<viennacl::vcl_size_t>(x->size) * sizeof(double)); // not necessary, but still set for conciseness
+ else
+ return ViennaCLGenericFailure;
+
+ return ViennaCLSuccess;
+}
+
+
+static ViennaCLStatus init_vector(viennacl::backend::mem_handle & h, ViennaCLVector x)
+{
+ switch (x->backend->backend_type)
+ {
+ case ViennaCLCUDA:
+ return init_cuda_vector(h, x);
+
+ case ViennaCLOpenCL:
+ return init_opencl_vector(h, x);
+
+ case ViennaCLHost:
+ return init_host_vector(h, x);
+
+ default:
+ return ViennaCLGenericFailure;
+ }
+}
+
+
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/viennacl_private.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/viennacl_private.hpp b/native-viennaCL/src/main/cpp/libviennacl/src/viennacl_private.hpp
new file mode 100644
index 0000000..c66c848
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/viennacl_private.hpp
@@ -0,0 +1,141 @@
+#ifndef VIENNACL_VIENNACL_PRIVATE_HPP
+#define VIENNACL_VIENNACL_PRIVATE_HPP
+
+
+/* =========================================================================
+ Copyright (c) 2010-2014, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include <stdlib.h>
+
+#ifdef VIENNACL_WITH_OPENCL
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+#endif
+
+#include "viennacl.hpp"
+
+
+/************* Backend Management ******************/
+
+struct ViennaCLCUDABackend_impl
+{
+ //TODO: Add stream and/or device descriptors here
+};
+
+struct ViennaCLOpenCLBackend_impl
+{
+ ViennaCLInt context_id;
+};
+
+struct ViennaCLHostBackend_impl
+{
+ // Nothing to specify *at the moment*
+};
+
+
+/** @brief Generic backend for CUDA, OpenCL, host-based stuff */
+struct ViennaCLBackend_impl
+{
+ ViennaCLBackendTypes backend_type;
+
+ ViennaCLCUDABackend_impl cuda_backend;
+ ViennaCLOpenCLBackend_impl opencl_backend;
+ ViennaCLHostBackend_impl host_backend;
+};
+
+
+
+/******** User Types **********/
+
+struct ViennaCLHostScalar_impl
+{
+ ViennaCLPrecision precision;
+
+ union {
+ float value_float;
+ double value_double;
+ };
+};
+
+struct ViennaCLScalar_impl
+{
+ ViennaCLBackend backend;
+ ViennaCLPrecision precision;
+
+ // buffer:
+#ifdef VIENNACL_WITH_CUDA
+ char * cuda_mem;
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+ cl_mem opencl_mem;
+#endif
+ char * host_mem;
+
+ ViennaCLInt offset;
+};
+
+struct ViennaCLVector_impl
+{
+ ViennaCLBackend backend;
+ ViennaCLPrecision precision;
+
+ // buffer:
+#ifdef VIENNACL_WITH_CUDA
+ char * cuda_mem;
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+ cl_mem opencl_mem;
+#endif
+ char * host_mem;
+
+ ViennaCLInt offset;
+ ViennaCLInt inc;
+ ViennaCLInt size;
+};
+
+struct ViennaCLMatrix_impl
+{
+ ViennaCLBackend backend;
+ ViennaCLPrecision precision;
+ ViennaCLOrder order;
+ ViennaCLTranspose trans;
+
+ // buffer:
+#ifdef VIENNACL_WITH_CUDA
+ char * cuda_mem;
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+ cl_mem opencl_mem;
+#endif
+ char * host_mem;
+
+ ViennaCLInt size1;
+ ViennaCLInt start1;
+ ViennaCLInt stride1;
+ ViennaCLInt internal_size1;
+
+ ViennaCLInt size2;
+ ViennaCLInt start2;
+ ViennaCLInt stride2;
+ ViennaCLInt internal_size2;
+};
+
+
+#endif
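To make the matrix descriptor fields concrete, here is a small indexing sketch. The leading-dimension convention used below (internal_size2 for a row-major buffer) is an assumption for illustration and is not spelled out in this header.

    // Assumed convention: a row-major buffer uses internal_size2 as its leading
    // dimension (a column-major buffer would use internal_size1 analogously).
    #include <cstddef>

    inline std::size_t row_major_element_index(ViennaCLMatrix_impl const & A,
                                               std::size_t i, std::size_t j)
    {
      std::size_t row = std::size_t(A.start1) + i * std::size_t(A.stride1);
      std::size_t col = std::size_t(A.start2) + j * std::size_t(A.stride2);
      return row * std::size_t(A.internal_size2) + col;   // element index, not byte offset
    }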
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/backend/cpu_ram.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/backend/cpu_ram.hpp b/native-viennaCL/src/main/cpp/viennacl/backend/cpu_ram.hpp
new file mode 100644
index 0000000..ccfd035
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/backend/cpu_ram.hpp
@@ -0,0 +1,171 @@
+#ifndef VIENNACL_BACKEND_CPU_RAM_HPP_
+#define VIENNACL_BACKEND_CPU_RAM_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/backend/cpu_ram.hpp
+ @brief Implementations for the main (CPU) RAM backend functionality
+*/
+
+#include <cassert>
+#include <vector>
+#ifdef VIENNACL_WITH_AVX2
+#include <stdlib.h>
+#endif
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/shared_ptr.hpp"
+
+namespace viennacl
+{
+namespace backend
+{
+namespace cpu_ram
+{
+typedef viennacl::tools::shared_ptr<char> handle_type;
+// Requirements for backend:
+
+// * memory_create(size, host_ptr)
+// * memory_copy(src, dest, offset_src, offset_dest, size)
+// * memory_write_from_main_memory(src, offset, size,
+// dest, offset, size)
+// * memory_read_to_main_memory(src, offset, size
+// dest, offset, size)
+// *
+//
+
+namespace detail
+{
+ /** @brief Helper struct for deleting a pointer to an array */
+ template<class U>
+ struct array_deleter
+ {
+#ifdef VIENNACL_WITH_AVX2
+ void operator()(U* p) const { free(p); }
+#else
+ void operator()(U* p) const { delete[] p; }
+#endif
+ };
+
+}
+
+/** @brief Creates an array of the specified size in main RAM. If the second argument is provided, the buffer is initialized with data from that pointer.
+ *
+ * @param size_in_bytes Number of bytes to allocate
+ * @param host_ptr Pointer to data which will be copied to the new array. Must point to at least 'size_in_bytes' bytes of data.
+ *
+ */
+inline handle_type memory_create(vcl_size_t size_in_bytes, const void * host_ptr = NULL)
+{
+#ifdef VIENNACL_WITH_AVX2
+ // Note: aligned_alloc not available on all compilers. Consider platform-specific alternatives such as posix_memalign()
+ if (!host_ptr)
+ return handle_type(reinterpret_cast<char*>(aligned_alloc(32, size_in_bytes)), detail::array_deleter<char>());
+
+ handle_type new_handle(reinterpret_cast<char*>(aligned_alloc(32, size_in_bytes)), detail::array_deleter<char>());
+#else
+ if (!host_ptr)
+ return handle_type(new char[size_in_bytes], detail::array_deleter<char>());
+
+ handle_type new_handle(new char[size_in_bytes], detail::array_deleter<char>());
+#endif
+
+ // copy data:
+ char * raw_ptr = new_handle.get();
+ const char * data_ptr = static_cast<const char *>(host_ptr);
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i=0; i<long(size_in_bytes); ++i)
+ raw_ptr[i] = data_ptr[i];
+
+ return new_handle;
+}
+
+/** @brief Copies 'bytes_to_copy' bytes from address 'src_buffer + src_offset' to memory starting at address 'dst_buffer + dst_offset'.
+ *
+ * @param src_buffer A smart pointer to the beginning of the allocated source buffer
+ * @param dst_buffer A smart pointer to the beginning of the allocated destination buffer
+ * @param src_offset Offset of the first byte to be written from the address given by 'src_buffer' (in bytes)
+ * @param dst_offset Offset of the first byte to be written to the address given by 'dst_buffer' (in bytes)
+ * @param bytes_to_copy Number of bytes to be copied
+ */
+inline void memory_copy(handle_type const & src_buffer,
+ handle_type & dst_buffer,
+ vcl_size_t src_offset,
+ vcl_size_t dst_offset,
+ vcl_size_t bytes_to_copy)
+{
+ assert( (dst_buffer.get() != NULL) && bool("Memory not initialized!"));
+ assert( (src_buffer.get() != NULL) && bool("Memory not initialized!"));
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i=0; i<long(bytes_to_copy); ++i)
+ dst_buffer.get()[vcl_size_t(i)+dst_offset] = src_buffer.get()[vcl_size_t(i) + src_offset];
+}
+
+/** @brief Writes data from main RAM identified by 'ptr' to the buffer identified by 'dst_buffer'
+ *
+ * @param dst_buffer A smart pointer to the beginning of an allocated buffer
+ * @param dst_offset Offset of the first written byte from the beginning of 'dst_buffer' (in bytes)
+ * @param bytes_to_copy Number of bytes to be copied
+ * @param ptr Pointer to the first byte to be written
+ */
+inline void memory_write(handle_type & dst_buffer,
+ vcl_size_t dst_offset,
+ vcl_size_t bytes_to_copy,
+ const void * ptr,
+ bool /*async*/)
+{
+ assert( (dst_buffer.get() != NULL) && bool("Memory not initialized!"));
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i=0; i<long(bytes_to_copy); ++i)
+ dst_buffer.get()[vcl_size_t(i)+dst_offset] = static_cast<const char *>(ptr)[i];
+}
+
+/** @brief Reads data from a buffer back to main RAM.
+ *
+ * @param src_buffer A smart pointer to the beginning of an allocated source buffer
+ * @param src_offset Offset of the first byte to be read from the beginning of src_buffer (in bytes)
+ * @param bytes_to_copy Number of bytes to be read
+ * @param ptr Location in main RAM where the read data should be written
+ */
+inline void memory_read(handle_type const & src_buffer,
+ vcl_size_t src_offset,
+ vcl_size_t bytes_to_copy,
+ void * ptr,
+ bool /*async*/)
+{
+ assert( (src_buffer.get() != NULL) && bool("Memory not initialized!"));
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i=0; i<long(bytes_to_copy); ++i)
+ static_cast<char *>(ptr)[i] = src_buffer.get()[vcl_size_t(i)+src_offset];
+}
+
+}
+} //backend
+} //viennacl
+#endif
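A short, self-contained usage sketch for the host backend above; it relies only on the memory_create()/memory_read() signatures declared in this file.

    #include <vector>
    #include "viennacl/backend/cpu_ram.hpp"

    int main()
    {
      std::vector<float> src(8, 3.14f), dst(8, 0.0f);

      // allocate a host buffer initialized from 'src'
      viennacl::backend::cpu_ram::handle_type buf =
          viennacl::backend::cpu_ram::memory_create(src.size() * sizeof(float), &src[0]);

      // copy the bytes back into 'dst' (the async flag is ignored by this backend)
      viennacl::backend::cpu_ram::memory_read(buf, 0, dst.size() * sizeof(float), &dst[0], false);

      return dst[3] == src[3] ? 0 : 1;
    }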
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/backend/cuda.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/backend/cuda.hpp b/native-viennaCL/src/main/cpp/viennacl/backend/cuda.hpp
new file mode 100644
index 0000000..641bfea
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/backend/cuda.hpp
@@ -0,0 +1,206 @@
+#ifndef VIENNACL_BACKEND_CUDA_HPP_
+#define VIENNACL_BACKEND_CUDA_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/backend/cuda.hpp
+ @brief Implementations for the CUDA backend functionality
+*/
+
+
+#include <iostream>
+#include <vector>
+#include <cassert>
+#include <stdexcept>
+#include <sstream>
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/shared_ptr.hpp"
+
+// includes CUDA
+#include <cuda_runtime.h>
+
+#define VIENNACL_CUDA_ERROR_CHECK(err) detail::cuda_error_check (err, __FILE__, __LINE__)
+
+namespace viennacl
+{
+namespace backend
+{
+namespace cuda
+{
+
+typedef viennacl::tools::shared_ptr<char> handle_type;
+// Requirements for backend:
+
+// * memory_create(size, host_ptr)
+// * memory_copy(src, dest, offset_src, offset_dest, size)
+// * memory_write_from_main_memory(src, offset, size,
+// dest, offset, size)
+// * memory_read_to_main_memory(src, offset, size
+// dest, offset, size)
+// *
+//
+
+class cuda_exception : public std::runtime_error
+{
+public:
+ cuda_exception(std::string const & what_arg, cudaError_t err_code) : std::runtime_error(what_arg), error_code_(err_code) {}
+
+ cudaError_t error_code() const { return error_code_; }
+
+private:
+ cudaError_t error_code_;
+};
+
+namespace detail
+{
+
+ inline void cuda_error_check(cudaError error_code, const char *file, const int line )
+ {
+ if (cudaSuccess != error_code)
+ {
+ std::stringstream ss;
+ ss << file << "(" << line << "): CUDA Runtime API error " << error_code << ": " << cudaGetErrorString( error_code ) << std::endl;
+ throw viennacl::backend::cuda::cuda_exception(ss.str(), error_code);
+ }
+ }
+
+
+ /** @brief Functor for deleting a CUDA handle. Used within the smart pointer class. */
+ template<typename U>
+ struct cuda_deleter
+ {
+ void operator()(U * p) const
+ {
+ //std::cout << "Freeing handle " << reinterpret_cast<void *>(p) << std::endl;
+ cudaFree(p);
+ }
+ };
+
+}
+
+/** @brief Creates an array of the specified size on the CUDA device. If the second argument is provided, the buffer is initialized with data from that pointer.
+ *
+ * @param size_in_bytes Number of bytes to allocate
+ * @param host_ptr Pointer to data which will be copied to the new array. Must point to at least 'size_in_bytes' bytes of data.
+ *
+ */
+inline handle_type memory_create(vcl_size_t size_in_bytes, const void * host_ptr = NULL)
+{
+ void * dev_ptr = NULL;
+ VIENNACL_CUDA_ERROR_CHECK( cudaMalloc(&dev_ptr, size_in_bytes) );
+ //std::cout << "Allocated new dev_ptr " << dev_ptr << " of size " << size_in_bytes << std::endl;
+
+ if (!host_ptr)
+ return handle_type(reinterpret_cast<char *>(dev_ptr), detail::cuda_deleter<char>());
+
+ handle_type new_handle(reinterpret_cast<char*>(dev_ptr), detail::cuda_deleter<char>());
+
+ // copy data:
+ //std::cout << "Filling new handle from host_ptr " << host_ptr << std::endl;
+ cudaMemcpy(new_handle.get(), host_ptr, size_in_bytes, cudaMemcpyHostToDevice);
+
+ return new_handle;
+}
+
+
+/** @brief Copies 'bytes_to_copy' bytes from address 'src_buffer + src_offset' on the CUDA device to memory starting at address 'dst_buffer + dst_offset' on the same CUDA device.
+ *
+ * @param src_buffer A smart pointer to the beginning of the allocated source CUDA buffer
+ * @param dst_buffer A smart pointer to the beginning of the allocated destination CUDA buffer
+ * @param src_offset Offset of the first byte to be written from the address given by 'src_buffer' (in bytes)
+ * @param dst_offset Offset of the first byte to be written to the address given by 'dst_buffer' (in bytes)
+ * @param bytes_to_copy Number of bytes to be copied
+ */
+inline void memory_copy(handle_type const & src_buffer,
+ handle_type & dst_buffer,
+ vcl_size_t src_offset,
+ vcl_size_t dst_offset,
+ vcl_size_t bytes_to_copy)
+{
+ assert( (dst_buffer.get() != NULL) && bool("Memory not initialized!"));
+ assert( (src_buffer.get() != NULL) && bool("Memory not initialized!"));
+
+ cudaMemcpy(reinterpret_cast<void *>(dst_buffer.get() + dst_offset),
+ reinterpret_cast<void *>(src_buffer.get() + src_offset),
+ bytes_to_copy,
+ cudaMemcpyDeviceToDevice);
+}
+
+
+/** @brief Writes data from main RAM identified by 'ptr' to the CUDA buffer identified by 'dst_buffer'
+ *
+ * @param dst_buffer A smart pointer to the beginning of an allocated CUDA buffer
+ * @param dst_offset Offset of the first written byte from the beginning of 'dst_buffer' (in bytes)
+ * @param bytes_to_copy Number of bytes to be copied
+ * @param ptr Pointer to the first byte to be written
+ * @param async Whether the operation should be asynchronous
+ */
+inline void memory_write(handle_type & dst_buffer,
+ vcl_size_t dst_offset,
+ vcl_size_t bytes_to_copy,
+ const void * ptr,
+ bool async = false)
+{
+ assert( (dst_buffer.get() != NULL) && bool("Memory not initialized!"));
+
+ if (async)
+ cudaMemcpyAsync(reinterpret_cast<char *>(dst_buffer.get()) + dst_offset,
+ reinterpret_cast<const char *>(ptr),
+ bytes_to_copy,
+ cudaMemcpyHostToDevice);
+ else
+ cudaMemcpy(reinterpret_cast<char *>(dst_buffer.get()) + dst_offset,
+ reinterpret_cast<const char *>(ptr),
+ bytes_to_copy,
+ cudaMemcpyHostToDevice);
+}
+
+
+/** @brief Reads data from a CUDA buffer back to main RAM.
+ *
+ * @param src_buffer A smart pointer to the beginning of an allocated CUDA source buffer
+ * @param src_offset Offset of the first byte to be read from the beginning of src_buffer (in bytes)
+ * @param bytes_to_copy Number of bytes to be read
+ * @param ptr Location in main RAM where the read data should be written
+ * @param async Whether the operation should be asynchronous
+ */
+inline void memory_read(handle_type const & src_buffer,
+ vcl_size_t src_offset,
+ vcl_size_t bytes_to_copy,
+ void * ptr,
+ bool async = false)
+{
+ assert( (src_buffer.get() != NULL) && bool("Memory not initialized!"));
+
+ if (async)
+ cudaMemcpyAsync(reinterpret_cast<char *>(ptr),
+ reinterpret_cast<char *>(src_buffer.get()) + src_offset,
+ bytes_to_copy,
+ cudaMemcpyDeviceToHost);
+ else
+ cudaMemcpy(reinterpret_cast<char *>(ptr),
+ reinterpret_cast<char *>(src_buffer.get()) + src_offset,
+ bytes_to_copy,
+ cudaMemcpyDeviceToHost);
+}
+
+} //cuda
+} //backend
+} //viennacl
+#endif
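An analogous round-trip sketch for the CUDA backend; it assumes the translation unit is built with VIENNACL_WITH_CUDA and linked against the CUDA runtime.

    #include <vector>
    #include "viennacl/backend/cuda.hpp"   // requires VIENNACL_WITH_CUDA

    int main()
    {
      std::vector<double> host_in(4, 2.0), host_out(4, 0.0);

      // device allocation initialized from host_in (host -> device copy)
      viennacl::backend::cuda::handle_type dev =
          viennacl::backend::cuda::memory_create(host_in.size() * sizeof(double), &host_in[0]);

      // blocking device -> host read
      viennacl::backend::cuda::memory_read(dev, 0, host_out.size() * sizeof(double), &host_out[0]);

      return host_out[0] == host_in[0] ? 0 : 1;
    }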
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/backend/mem_handle.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/backend/mem_handle.hpp b/native-viennaCL/src/main/cpp/viennacl/backend/mem_handle.hpp
new file mode 100644
index 0000000..37c680b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/backend/mem_handle.hpp
@@ -0,0 +1,250 @@
+#ifndef VIENNACL_BACKEND_MEM_HANDLE_HPP
+#define VIENNACL_BACKEND_MEM_HANDLE_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/backend/mem_handle.hpp
+ @brief Implements the multi-memory-domain handle
+*/
+
+#include <vector>
+#include <cassert>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/shared_ptr.hpp"
+#include "viennacl/backend/cpu_ram.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+#include "viennacl/backend/opencl.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+#include "viennacl/backend/cuda.hpp"
+#endif
+
+
+namespace viennacl
+{
+namespace backend
+{
+
+namespace detail
+{
+ /** @brief Singleton for managing the default memory type.
+ *
+ * @param new_mem_type If NULL, returns the current memory type. Otherwise, sets the memory type to the provided value.
+ */
+ inline memory_types get_set_default_memory_type(memory_types * new_mem_type)
+ {
+ // if a user compiles with CUDA, it is reasonable to expect that CUDA should be the default
+#ifdef VIENNACL_WITH_CUDA
+ static memory_types mem_type = CUDA_MEMORY;
+#elif defined(VIENNACL_WITH_OPENCL)
+ static memory_types mem_type = OPENCL_MEMORY;
+#else
+ static memory_types mem_type = MAIN_MEMORY;
+#endif
+
+ if (new_mem_type)
+ mem_type = *new_mem_type;
+
+ return mem_type;
+ }
+}
+
+/** @brief Returns the default memory type for the given configuration.
+ *
+ * CUDA has precedence over OpenCL, which has precedence over main memory. Depends on which VIENNACL_WITH_{CUDA/OPENCL/OPENMP} macros are defined.
+ */
+inline memory_types default_memory_type() { return detail::get_set_default_memory_type(NULL); }
+
+/** @brief Sets the default memory type for the given configuration.
+ *
+ * Make sure the respective new memory type is enabled.
+ * For example, passing CUDA_MEMORY if no CUDA backend is selected will result in exceptions being thrown as soon as you try to allocate buffers.
+ */
+inline memory_types default_memory_type(memory_types new_memory_type) { return detail::get_set_default_memory_type(&new_memory_type); }
+
+
+/** @brief Main abstraction class for multiple memory domains. Represents a buffer in either main RAM, an OpenCL context, or a CUDA device.
+ *
+ * The idea is to wrap all possible handle types inside this class so that higher-level code does not need to be cluttered with preprocessor switches.
+ * Instead, this class collects all the necessary conditional compilations.
+ *
+ */
+class mem_handle
+{
+public:
+ typedef viennacl::tools::shared_ptr<char> ram_handle_type;
+ typedef viennacl::tools::shared_ptr<char> cuda_handle_type;
+
+ /** @brief Default CTOR. No memory is allocated */
+ mem_handle() : active_handle_(MEMORY_NOT_INITIALIZED), size_in_bytes_(0) {}
+
+ /** @brief Returns the handle to a buffer in CPU RAM. NULL is returned if no such buffer has been allocated. */
+ ram_handle_type & ram_handle() { return ram_handle_; }
+ /** @brief Returns the handle to a buffer in CPU RAM. NULL is returned if no such buffer has been allocated. */
+ ram_handle_type const & ram_handle() const { return ram_handle_; }
+
+#ifdef VIENNACL_WITH_OPENCL
+ /** @brief Returns the handle to an OpenCL buffer. The handle contains NULL if no such buffer has been allocated. */
+ viennacl::ocl::handle<cl_mem> & opencl_handle() { return opencl_handle_; }
+ /** @brief Returns the handle to an OpenCL buffer. The handle contains NULL if no such buffer has been allocated. */
+ viennacl::ocl::handle<cl_mem> const & opencl_handle() const { return opencl_handle_; }
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+ /** @brief Returns the handle to a CUDA buffer. The handle contains NULL if no such buffer has been allocated. */
+ cuda_handle_type & cuda_handle() { return cuda_handle_; }
+ /** @brief Returns the handle to a CUDA buffer. The handle contains NULL if no such buffer has been allocated. */
+ cuda_handle_type const & cuda_handle() const { return cuda_handle_; }
+#endif
+
+ /** @brief Returns an ID for the currently active memory buffer. Other memory buffers might contain old or no data. */
+ memory_types get_active_handle_id() const { return active_handle_; }
+
+ /** @brief Switches the currently active handle. If no support for that backend is provided, an exception is thrown. */
+ void switch_active_handle_id(memory_types new_id)
+ {
+ if (new_id != active_handle_)
+ {
+ if (active_handle_ == MEMORY_NOT_INITIALIZED)
+ active_handle_ = new_id;
+ else if (active_handle_ == MAIN_MEMORY)
+ {
+ active_handle_ = new_id;
+ }
+ else if (active_handle_ == OPENCL_MEMORY)
+ {
+#ifdef VIENNACL_WITH_OPENCL
+ active_handle_ = new_id;
+#else
+ throw memory_exception("compiled without OpenCL suppport!");
+#endif
+ }
+ else if (active_handle_ == CUDA_MEMORY)
+ {
+#ifdef VIENNACL_WITH_CUDA
+ active_handle_ = new_id;
+#else
+ throw memory_exception("compiled without CUDA suppport!");
+#endif
+ }
+ else
+ throw memory_exception("invalid new memory region!");
+ }
+ }
+
+ /** @brief Compares the two handles and returns true if the active memory handles in the two mem_handles point to the same buffer. */
+ bool operator==(mem_handle const & other) const
+ {
+ if (active_handle_ != other.active_handle_)
+ return false;
+
+ switch (active_handle_)
+ {
+ case MAIN_MEMORY:
+ return ram_handle_.get() == other.ram_handle_.get();
+#ifdef VIENNACL_WITH_OPENCL
+ case OPENCL_MEMORY:
+ return opencl_handle_.get() == other.opencl_handle_.get();
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case CUDA_MEMORY:
+ return cuda_handle_.get() == other.cuda_handle_.get();
+#endif
+ default: break;
+ }
+
+ return false;
+ }
+
+ /** @brief Compares the two handles and returns true if the active memory handle of this object points to a buffer with a smaller address than that of 'other'.
+ * Useful for storing handles in a map, since this yields a natural ordering.
+ */
+ bool operator<(mem_handle const & other) const
+ {
+ if (active_handle_ != other.active_handle_)
+ return false;
+
+ switch (active_handle_)
+ {
+ case MAIN_MEMORY:
+ return ram_handle_.get() < other.ram_handle_.get();
+#ifdef VIENNACL_WITH_OPENCL
+ case OPENCL_MEMORY:
+ return opencl_handle_.get() < other.opencl_handle_.get();
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case CUDA_MEMORY:
+ return cuda_handle_.get() < other.cuda_handle_.get();
+#endif
+ default: break;
+ }
+
+ return false;
+ }
+
+
+ bool operator!=(mem_handle const & other) const { return !(*this == other); }
+
+ /** @brief Implements a fast swapping method. No data is copied, only the handles are exchanged. */
+ void swap(mem_handle & other)
+ {
+ // swap handle type:
+ memory_types active_handle_tmp = other.active_handle_;
+ other.active_handle_ = active_handle_;
+ active_handle_ = active_handle_tmp;
+
+ // swap ram handle:
+ ram_handle_type ram_handle_tmp = other.ram_handle_;
+ other.ram_handle_ = ram_handle_;
+ ram_handle_ = ram_handle_tmp;
+
+ // swap OpenCL handle:
+#ifdef VIENNACL_WITH_OPENCL
+ opencl_handle_.swap(other.opencl_handle_);
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ cuda_handle_type cuda_handle_tmp = other.cuda_handle_;
+ other.cuda_handle_ = cuda_handle_;
+ cuda_handle_ = cuda_handle_tmp;
+#endif
+ }
+
+ /** @brief Returns the number of bytes of the currently active buffer */
+ vcl_size_t raw_size() const { return size_in_bytes_; }
+
+ /** @brief Sets the size of the currently active buffer. Use with care! */
+ void raw_size(vcl_size_t new_size) { size_in_bytes_ = new_size; }
+
+private:
+ memory_types active_handle_;
+ ram_handle_type ram_handle_;
+#ifdef VIENNACL_WITH_OPENCL
+ viennacl::ocl::handle<cl_mem> opencl_handle_;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ cuda_handle_type cuda_handle_;
+#endif
+ vcl_size_t size_in_bytes_;
+};
+
+
+} //backend
+} //viennacl
+#endif
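A small sketch of how the handle class is typically driven from host code; it uses only members declared above plus cpu_ram::memory_create() from the previous file.

    #include "viennacl/backend/mem_handle.hpp"

    int main()
    {
      viennacl::backend::mem_handle h;
      h.switch_active_handle_id(viennacl::MAIN_MEMORY);               // select the host domain
      h.ram_handle() = viennacl::backend::cpu_ram::memory_create(64); // 64 uninitialized bytes
      h.raw_size(64);                                                 // bookkeeping only

      viennacl::backend::mem_handle h2 = h;   // copies the shared handle, not the buffer contents
      return (h == h2) ? 0 : 1;               // same active domain, same buffer address
    }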
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/backend/memory.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/backend/memory.hpp b/native-viennaCL/src/main/cpp/viennacl/backend/memory.hpp
new file mode 100644
index 0000000..d6f29a5
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/backend/memory.hpp
@@ -0,0 +1,628 @@
+#ifndef VIENNACL_BACKEND_MEMORY_HPP
+#define VIENNACL_BACKEND_MEMORY_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/backend/memory.hpp
+ @brief Main interface routines for memory management
+*/
+
+#include <vector>
+#include <cassert>
+#include "viennacl/forwards.h"
+#include "viennacl/backend/mem_handle.hpp"
+#include "viennacl/context.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/context.hpp"
+#include "viennacl/backend/util.hpp"
+
+#include "viennacl/backend/cpu_ram.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+#include "viennacl/backend/opencl.hpp"
+#include "viennacl/ocl/backend.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+#include "viennacl/backend/cuda.hpp"
+#endif
+
+
+namespace viennacl
+{
+namespace backend
+{
+
+
+ // if a user compiles with CUDA, it is reasonable to expect that CUDA should be the default
+ /** @brief Synchronizes the execution. finish() will only return after all compute kernels (CUDA, OpenCL) have completed. */
+ inline void finish()
+ {
+#ifdef VIENNACL_WITH_CUDA
+ cudaDeviceSynchronize();
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+ viennacl::ocl::get_queue().finish();
+#endif
+ }
+
+
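As an illustration of why finish() matters, a hedged helper that pairs an asynchronous memory_write() (declared further down in this file) with an explicit synchronization point; the helper name is made up for this sketch.

    #include <vector>
    #include "viennacl/backend/memory.hpp"

    // Hypothetical helper: overwrite the first 'n' bytes of an already allocated
    // buffer asynchronously, then block until all backends have finished.
    inline void zero_prefix_and_sync(viennacl::backend::mem_handle & h, viennacl::vcl_size_t n)
    {
      std::vector<char> staging(n, 0);
      viennacl::backend::memory_write(h, 0, n, &staging[0], true); // async on CUDA/OpenCL, synchronous on host
      viennacl::backend::finish();                                 // wait for queued device work
    }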
+
+
+ // Requirements for backend:
+
+ // ---- Memory ----
+ //
+ // * memory_create(size, host_ptr)
+ // * memory_copy(src, dest, offset_src, offset_dest, size)
+ // * memory_write(src, offset, size, ptr)
+ // * memory_read(src, offset, size, ptr)
+ //
+
+ /** @brief Creates an array of the specified size. If the 'host_ptr' argument is provided, the buffer is initialized with data from that pointer.
+ *
+ * This is the generic version for CPU RAM, CUDA, and OpenCL. Creates the memory in the currently active memory domain.
+ *
+ * @param handle The generic wrapper handle for multiple memory domains which will hold the new buffer.
+ * @param size_in_bytes Number of bytes to allocate
+ * @param ctx Optional context in which the buffer is created (one out of multiple OpenCL contexts, CUDA, host)
+ * @param host_ptr Pointer to data which will be copied to the new array. Must point to at least 'size_in_bytes' bytes of data.
+ *
+ */
+ inline void memory_create(mem_handle & handle, vcl_size_t size_in_bytes, viennacl::context const & ctx, const void * host_ptr = NULL)
+ {
+ if (size_in_bytes > 0)
+ {
+ if (handle.get_active_handle_id() == MEMORY_NOT_INITIALIZED)
+ handle.switch_active_handle_id(ctx.memory_type());
+
+ switch (handle.get_active_handle_id())
+ {
+ case MAIN_MEMORY:
+ handle.ram_handle() = cpu_ram::memory_create(size_in_bytes, host_ptr);
+ handle.raw_size(size_in_bytes);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case OPENCL_MEMORY:
+ handle.opencl_handle().context(ctx.opencl_context());
+ handle.opencl_handle() = opencl::memory_create(handle.opencl_handle().context(), size_in_bytes, host_ptr);
+ handle.raw_size(size_in_bytes);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case CUDA_MEMORY:
+ handle.cuda_handle() = cuda::memory_create(size_in_bytes, host_ptr);
+ handle.raw_size(size_in_bytes);
+ break;
+#endif
+ case MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("unknown memory handle!");
+ }
+ }
+ }
+
+ /*
+ inline void memory_create(mem_handle & handle, vcl_size_t size_in_bytes, const void * host_ptr = NULL)
+ {
+ viennacl::context ctx(default_memory_type());
+ memory_create(handle, size_in_bytes, ctx, host_ptr);
+ }*/
+
+
+ /** @brief Copies 'bytes_to_copy' bytes from address 'src_buffer + src_offset' to memory starting at address 'dst_buffer + dst_offset'.
+ *
+ * This is the generic version for CPU RAM, CUDA, and OpenCL. Copies the memory in the currently active memory domain.
+ *
+ *
+ * @param src_buffer A smart pointer to the beginning of the allocated source buffer
+ * @param dst_buffer A smart pointer to the beginning of the allocated destination buffer
+ * @param src_offset Offset of the first byte to be written from the address given by 'src_buffer' (in bytes)
+ * @param dst_offset Offset of the first byte to be written to the address given by 'dst_buffer' (in bytes)
+ * @param bytes_to_copy Number of bytes to be copied
+ */
+ inline void memory_copy(mem_handle const & src_buffer,
+ mem_handle & dst_buffer,
+ vcl_size_t src_offset,
+ vcl_size_t dst_offset,
+ vcl_size_t bytes_to_copy)
+ {
+ assert( src_buffer.get_active_handle_id() == dst_buffer.get_active_handle_id() && bool("memory_copy() must be called on buffers from the same domain") );
+
+ if (bytes_to_copy > 0)
+ {
+ switch (src_buffer.get_active_handle_id())
+ {
+ case MAIN_MEMORY:
+ cpu_ram::memory_copy(src_buffer.ram_handle(), dst_buffer.ram_handle(), src_offset, dst_offset, bytes_to_copy);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case OPENCL_MEMORY:
+ opencl::memory_copy(src_buffer.opencl_handle(), dst_buffer.opencl_handle(), src_offset, dst_offset, bytes_to_copy);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case CUDA_MEMORY:
+ cuda::memory_copy(src_buffer.cuda_handle(), dst_buffer.cuda_handle(), src_offset, dst_offset, bytes_to_copy);
+ break;
+#endif
+ case MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("unknown memory handle!");
+ }
+ }
+ }
+
+ // TODO: Refine this concept. Maybe move to constructor?
+ /** @brief A 'shallow' copy operation from an initialized buffer to an uninitialized buffer.
+ * Only the raw handle is copied to the uninitialized buffer; no buffer data is duplicated.
+ */
+ inline void memory_shallow_copy(mem_handle const & src_buffer,
+ mem_handle & dst_buffer)
+ {
+ assert( (dst_buffer.get_active_handle_id() == MEMORY_NOT_INITIALIZED) && bool("Shallow copy on already initialized memory not supported!"));
+
+ switch (src_buffer.get_active_handle_id())
+ {
+ case MAIN_MEMORY:
+ dst_buffer.switch_active_handle_id(src_buffer.get_active_handle_id());
+ dst_buffer.ram_handle() = src_buffer.ram_handle();
+ dst_buffer.raw_size(src_buffer.raw_size());
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case OPENCL_MEMORY:
+ dst_buffer.switch_active_handle_id(src_buffer.get_active_handle_id());
+ dst_buffer.opencl_handle() = src_buffer.opencl_handle();
+ dst_buffer.raw_size(src_buffer.raw_size());
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case CUDA_MEMORY:
+ dst_buffer.switch_active_handle_id(src_buffer.get_active_handle_id());
+ dst_buffer.cuda_handle() = src_buffer.cuda_handle();
+ dst_buffer.raw_size(src_buffer.raw_size());
+ break;
+#endif
+ case MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("unknown memory handle!");
+ }
+ }
+
+ /** @brief Writes data from main RAM identified by 'ptr' to the buffer identified by 'dst_buffer'
+ *
+ * This is the generic version for CPU RAM, CUDA, and OpenCL. Writes the memory in the currently active memory domain.
+ *
+ * @param dst_buffer A smart pointer to the beginning of an allocated buffer
+ * @param dst_offset Offset of the first written byte from the beginning of 'dst_buffer' (in bytes)
+ * @param bytes_to_write Number of bytes to be written
+ * @param ptr Pointer to the first byte to be written
+ * @param async Whether the operation should be asynchronous
+ */
+ inline void memory_write(mem_handle & dst_buffer,
+ vcl_size_t dst_offset,
+ vcl_size_t bytes_to_write,
+ const void * ptr,
+ bool async = false)
+ {
+ if (bytes_to_write > 0)
+ {
+ switch (dst_buffer.get_active_handle_id())
+ {
+ case MAIN_MEMORY:
+ cpu_ram::memory_write(dst_buffer.ram_handle(), dst_offset, bytes_to_write, ptr, async);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case OPENCL_MEMORY:
+ opencl::memory_write(dst_buffer.opencl_handle(), dst_offset, bytes_to_write, ptr, async);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case CUDA_MEMORY:
+ cuda::memory_write(dst_buffer.cuda_handle(), dst_offset, bytes_to_write, ptr, async);
+ break;
+#endif
+ case MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("unknown memory handle!");
+ }
+ }
+ }
+
+ /** @brief Reads data from a buffer back to main RAM.
+ *
+ * This is the generic version for CPU RAM, CUDA, and OpenCL. Reads the memory from the currently active memory domain.
+ *
+ * @param src_buffer A smart pointer to the beginning of an allocated source buffer
+ * @param src_offset Offset of the first byte to be read from the beginning of src_buffer (in bytes)
+ * @param bytes_to_read Number of bytes to be read
+ * @param ptr Location in main RAM where the read data should be written
+ * @param async Whether the operation should be asynchronous
+ */
+ inline void memory_read(mem_handle const & src_buffer,
+ vcl_size_t src_offset,
+ vcl_size_t bytes_to_read,
+ void * ptr,
+ bool async = false)
+ {
+ //finish(); //Fixes some issues with AMD APP SDK. However, it might sacrifice a few percent of performance in some cases.
+
+ if (bytes_to_read > 0)
+ {
+ switch (src_buffer.get_active_handle_id())
+ {
+ case MAIN_MEMORY:
+ cpu_ram::memory_read(src_buffer.ram_handle(), src_offset, bytes_to_read, ptr, async);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case OPENCL_MEMORY:
+ opencl::memory_read(src_buffer.opencl_handle(), src_offset, bytes_to_read, ptr, async);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case CUDA_MEMORY:
+ cuda::memory_read(src_buffer.cuda_handle(), src_offset, bytes_to_read, ptr, async);
+ break;
+#endif
+ case MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("unknown memory handle!");
+ }
+ }
+ }
+
+
+
+ namespace detail
+ {
+ template<typename T>
+ vcl_size_t element_size(memory_types /* mem_type */)
+ {
+ return sizeof(T);
+ }
+
+
+ template<>
+ inline vcl_size_t element_size<unsigned long>(memory_types
+ #ifdef VIENNACL_WITH_OPENCL
+ mem_type //in order to compile cleanly at -Wextra in GCC
+ #endif
+ )
+ {
+#ifdef VIENNACL_WITH_OPENCL
+ if (mem_type == OPENCL_MEMORY)
+ return sizeof(cl_ulong);
+#endif
+ return sizeof(unsigned long);
+ }
+
+ template<>
+ inline vcl_size_t element_size<long>(memory_types
+ #ifdef VIENNACL_WITH_OPENCL
+ mem_type //in order to compile cleanly at -Wextra in GCC
+ #endif
+ )
+ {
+#ifdef VIENNACL_WITH_OPENCL
+ if (mem_type == OPENCL_MEMORY)
+ return sizeof(cl_long);
+#endif
+ return sizeof(long);
+ }
+
+
+ template<>
+ inline vcl_size_t element_size<unsigned int>(memory_types
+ #ifdef VIENNACL_WITH_OPENCL
+ mem_type //in order to compile cleanly at -Wextra in GCC
+ #endif
+ )
+ {
+#ifdef VIENNACL_WITH_OPENCL
+ if (mem_type == OPENCL_MEMORY)
+ return sizeof(cl_uint);
+#endif
+ return sizeof(unsigned int);
+ }
+
+ template<>
+ inline vcl_size_t element_size<int>(memory_types
+ #ifdef VIENNACL_WITH_OPENCL
+ mem_type //in order to compile cleanly at -Wextra in GCC
+ #endif
+ )
+ {
+#ifdef VIENNACL_WITH_OPENCL
+ if (mem_type == OPENCL_MEMORY)
+ return sizeof(cl_int);
+#endif
+ return sizeof(int);
+ }
+
+
+ }
+
+
+ /** @brief Switches the active memory domain within a memory handle. Data is copied if the new active domain differs from the old one. Memory in the source handle is not freed. */
+ template<typename DataType>
+ void switch_memory_context(mem_handle & handle, viennacl::context new_ctx)
+ {
+ if (handle.get_active_handle_id() == new_ctx.memory_type())
+ return;
+
+ if (handle.get_active_handle_id() == viennacl::MEMORY_NOT_INITIALIZED || handle.raw_size() == 0)
+ {
+ handle.switch_active_handle_id(new_ctx.memory_type());
+#ifdef VIENNACL_WITH_OPENCL
+ if (new_ctx.memory_type() == OPENCL_MEMORY)
+ handle.opencl_handle().context(new_ctx.opencl_context());
+#endif
+ return;
+ }
+
+ vcl_size_t size_dst = detail::element_size<DataType>(handle.get_active_handle_id());
+ vcl_size_t size_src = detail::element_size<DataType>(new_ctx.memory_type());
+
+ if (size_dst != size_src) // OpenCL data element size not the same as host data element size
+ {
+ throw memory_exception("Heterogeneous data element sizes not yet supported!");
+ }
+ else //no data conversion required
+ {
+ if (handle.get_active_handle_id() == MAIN_MEMORY) //we can access the existing data directly
+ {
+ switch (new_ctx.memory_type())
+ {
+#ifdef VIENNACL_WITH_OPENCL
+ case OPENCL_MEMORY:
+ handle.opencl_handle().context(new_ctx.opencl_context());
+ handle.opencl_handle() = opencl::memory_create(handle.opencl_handle().context(), handle.raw_size(), handle.ram_handle().get());
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case CUDA_MEMORY:
+ handle.cuda_handle() = cuda::memory_create(handle.raw_size(), handle.ram_handle().get());
+ break;
+#endif
+ case MAIN_MEMORY:
+ default:
+ throw memory_exception("Invalid destination domain");
+ }
+ }
+#ifdef VIENNACL_WITH_OPENCL
+ else if (handle.get_active_handle_id() == OPENCL_MEMORY) // data can be dumped into destination directly
+ {
+ std::vector<DataType> buffer;
+
+ switch (new_ctx.memory_type())
+ {
+ case MAIN_MEMORY:
+ handle.ram_handle() = cpu_ram::memory_create(handle.raw_size());
+ opencl::memory_read(handle.opencl_handle(), 0, handle.raw_size(), handle.ram_handle().get());
+ break;
+#ifdef VIENNACL_WITH_CUDA
+ case CUDA_MEMORY:
+ buffer.resize(handle.raw_size() / sizeof(DataType));
+ opencl::memory_read(handle.opencl_handle(), 0, handle.raw_size(), &(buffer[0]));
+ handle.cuda_handle() = cuda::memory_create(handle.raw_size(), &(buffer[0]));
+ break;
+#endif
+ default:
+ throw memory_exception("Invalid destination domain");
+ }
+ }
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ else //CUDA_MEMORY
+ {
+ std::vector<DataType> buffer;
+
+ // write
+ switch (new_ctx.memory_type())
+ {
+ case MAIN_MEMORY:
+ handle.ram_handle() = cpu_ram::memory_create(handle.raw_size());
+ cuda::memory_read(handle.cuda_handle(), 0, handle.raw_size(), handle.ram_handle().get());
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case OPENCL_MEMORY:
+ buffer.resize(handle.raw_size() / sizeof(DataType));
+ cuda::memory_read(handle.cuda_handle(), 0, handle.raw_size(), &(buffer[0]));
+ handle.opencl_handle().context(new_ctx.opencl_context());
+ handle.opencl_handle() = opencl::memory_create(handle.opencl_handle().context(), handle.raw_size(), &(buffer[0]));
+ break;
+#endif
+ default:
+ throw memory_exception("Unsupported source memory domain");
+ }
+ }
+#endif
+
+ // everything succeeded so far, now switch to new domain:
+ handle.switch_active_handle_id(new_ctx.memory_type());
+
+ } // no data conversion
+ }
+
+
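A brief sketch of migrating an existing handle to another domain via the routine above; it assumes the library is built with VIENNACL_WITH_CUDA (otherwise the switch throws) and that viennacl::context can be constructed directly from a memory type, as in viennacl/context.hpp.

    #include "viennacl/backend/memory.hpp"
    #include "viennacl/context.hpp"

    // Moves the data behind 'h' (interpreted as floats) from its current domain
    // to CUDA device memory.
    inline void migrate_to_cuda(viennacl::backend::mem_handle & h)
    {
      viennacl::context cuda_ctx(viennacl::CUDA_MEMORY);
      viennacl::backend::switch_memory_context<float>(h, cuda_ctx);
    }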
+
+ /** @brief Copies data of the provided 'DataType' from 'handle_src' to 'handle_dst' and converts the data if the binary representation of 'DataType' among the memory domains differs. */
+ template<typename DataType>
+ void typesafe_memory_copy(mem_handle const & handle_src, mem_handle & handle_dst)
+ {
+ if (handle_dst.get_active_handle_id() == MEMORY_NOT_INITIALIZED)
+ handle_dst.switch_active_handle_id(default_memory_type());
+
+ vcl_size_t element_size_src = detail::element_size<DataType>(handle_src.get_active_handle_id());
+ vcl_size_t element_size_dst = detail::element_size<DataType>(handle_dst.get_active_handle_id());
+
+ if (element_size_src != element_size_dst)
+ {
+ // Data needs to be converted.
+
+ typesafe_host_array<DataType> buffer_src(handle_src);
+ typesafe_host_array<DataType> buffer_dst(handle_dst, handle_src.raw_size() / element_size_src);
+
+ //
+ // Step 1: Fill buffer_dst depending on where the data resides:
+ //
+ DataType const * src_data;
+ switch (handle_src.get_active_handle_id())
+ {
+ case MAIN_MEMORY:
+ src_data = reinterpret_cast<DataType const *>(handle_src.ram_handle().get());
+ for (vcl_size_t i=0; i<buffer_dst.size(); ++i)
+ buffer_dst.set(i, src_data[i]);
+ break;
+
+#ifdef VIENNACL_WITH_OPENCL
+ case OPENCL_MEMORY:
+ buffer_src.resize(handle_src, handle_src.raw_size() / element_size_src);
+ opencl::memory_read(handle_src.opencl_handle(), 0, buffer_src.raw_size(), buffer_src.get());
+ for (vcl_size_t i=0; i<buffer_dst.size(); ++i)
+ buffer_dst.set(i, buffer_src[i]);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case CUDA_MEMORY:
+ buffer_src.resize(handle_src, handle_src.raw_size() / element_size_src);
+ cuda::memory_read(handle_src.cuda_handle(), 0, buffer_src.raw_size(), buffer_src.get());
+ for (vcl_size_t i=0; i<buffer_dst.size(); ++i)
+ buffer_dst.set(i, buffer_src[i]);
+ break;
+#endif
+
+ default:
+ throw memory_exception("unsupported memory domain");
+ }
+
+ //
+ // Step 2: Write to destination
+ //
+ if (handle_dst.raw_size() == buffer_dst.raw_size())
+ viennacl::backend::memory_write(handle_dst, 0, buffer_dst.raw_size(), buffer_dst.get());
+ else
+ viennacl::backend::memory_create(handle_dst, buffer_dst.raw_size(), viennacl::traits::context(handle_dst), buffer_dst.get());
+
+ }
+ else
+ {
+ // No data conversion required.
+ typesafe_host_array<DataType> buffer(handle_src);
+
+ switch (handle_src.get_active_handle_id())
+ {
+ case MAIN_MEMORY:
+ switch (handle_dst.get_active_handle_id())
+ {
+ case MAIN_MEMORY:
+ case OPENCL_MEMORY:
+ case CUDA_MEMORY:
+ if (handle_dst.raw_size() == handle_src.raw_size())
+ viennacl::backend::memory_write(handle_dst, 0, handle_src.raw_size(), handle_src.ram_handle().get());
+ else
+ viennacl::backend::memory_create(handle_dst, handle_src.raw_size(), viennacl::traits::context(handle_dst), handle_src.ram_handle().get());
+ break;
+
+ default:
+ throw memory_exception("unsupported destination memory domain");
+ }
+ break;
+
+ case OPENCL_MEMORY:
+ switch (handle_dst.get_active_handle_id())
+ {
+ case MAIN_MEMORY:
+ if (handle_dst.raw_size() != handle_src.raw_size())
+ viennacl::backend::memory_create(handle_dst, handle_src.raw_size(), viennacl::traits::context(handle_dst));
+ viennacl::backend::memory_read(handle_src, 0, handle_src.raw_size(), handle_dst.ram_handle().get());
+ break;
+
+ case OPENCL_MEMORY:
+ if (handle_dst.raw_size() != handle_src.raw_size())
+ viennacl::backend::memory_create(handle_dst, handle_src.raw_size(), viennacl::traits::context(handle_dst));
+ viennacl::backend::memory_copy(handle_src, handle_dst, 0, 0, handle_src.raw_size());
+ break;
+
+ case CUDA_MEMORY:
+ if (handle_dst.raw_size() != handle_src.raw_size())
+ viennacl::backend::memory_create(handle_dst, handle_src.raw_size(), viennacl::traits::context(handle_dst));
+ buffer.resize(handle_src, handle_src.raw_size() / element_size_src);
+ viennacl::backend::memory_read(handle_src, 0, handle_src.raw_size(), buffer.get());
+ viennacl::backend::memory_write(handle_dst, 0, handle_src.raw_size(), buffer.get());
+ break;
+
+ default:
+ throw memory_exception("unsupported destination memory domain");
+ }
+ break;
+
+ case CUDA_MEMORY:
+ switch (handle_dst.get_active_handle_id())
+ {
+ case MAIN_MEMORY:
+ if (handle_dst.raw_size() != handle_src.raw_size())
+ viennacl::backend::memory_create(handle_dst, handle_src.raw_size(), viennacl::traits::context(handle_dst));
+ viennacl::backend::memory_read(handle_src, 0, handle_src.raw_size(), handle_dst.ram_handle().get());
+ break;
+
+ case OPENCL_MEMORY:
+ if (handle_dst.raw_size() != handle_src.raw_size())
+ viennacl::backend::memory_create(handle_dst, handle_src.raw_size(), viennacl::traits::context(handle_dst));
+ buffer.resize(handle_src, handle_src.raw_size() / element_size_src);
+ viennacl::backend::memory_read(handle_src, 0, handle_src.raw_size(), buffer.get());
+ viennacl::backend::memory_write(handle_dst, 0, handle_src.raw_size(), buffer.get());
+ break;
+
+ case CUDA_MEMORY:
+ if (handle_dst.raw_size() != handle_src.raw_size())
+ viennacl::backend::memory_create(handle_dst, handle_src.raw_size(), viennacl::traits::context(handle_dst));
+ viennacl::backend::memory_copy(handle_src, handle_dst, 0, 0, handle_src.raw_size());
+ break;
+
+ default:
+ throw memory_exception("unsupported destination memory domain");
+ }
+ break;
+
+ default:
+ throw memory_exception("unsupported source memory domain");
+ }
+
+ }
+ }
+
+
+} //backend
+
+//
+// Convenience layer:
+//
+/** @brief Generic convenience routine for migrating data of an object to a new memory domain */
+template<typename T>
+void switch_memory_context(T & obj, viennacl::context new_ctx)
+{
+ obj.switch_memory_context(new_ctx);
+}
+
+} //viennacl
+#endif
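Putting the generic layer together, a minimal host-context round trip; the same two calls work unchanged for CUDA or OpenCL contexts when the corresponding backend is compiled in (the viennacl::context constructor taking a memory type is assumed from viennacl/context.hpp).

    #include <vector>
    #include "viennacl/context.hpp"
    #include "viennacl/backend/memory.hpp"

    int main()
    {
      std::vector<int> in(10, 7), out(10, 0);

      viennacl::backend::mem_handle h;
      viennacl::context ctx(viennacl::MAIN_MEMORY);   // host context for this sketch

      viennacl::backend::memory_create(h, in.size() * sizeof(int), ctx, &in[0]);
      viennacl::backend::memory_read(h, 0, out.size() * sizeof(int), &out[0]);

      return (out[5] == 7) ? 0 : 1;
    }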
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/backend/opencl.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/backend/opencl.hpp b/native-viennaCL/src/main/cpp/viennacl/backend/opencl.hpp
new file mode 100644
index 0000000..a8be55a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/backend/opencl.hpp
@@ -0,0 +1,151 @@
+#ifndef VIENNACL_BACKEND_OPENCL_HPP_
+#define VIENNACL_BACKEND_OPENCL_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/backend/opencl.hpp
+ @brief Implementations for the OpenCL backend functionality
+*/
+
+
+#include <vector>
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/ocl/backend.hpp"
+
+namespace viennacl
+{
+namespace backend
+{
+namespace opencl
+{
+
+// Requirements for backend:
+
+// * memory_create(size, host_ptr)
+// * memory_copy(src, dest, offset_src, offset_dest, size)
+// * memory_write_from_main_memory(src, offset, size,
+// dest, offset, size)
+// * memory_read_to_main_memory(src, offset, size
+// dest, offset, size)
+// *
+//
+
+/** @brief Creates an array of the specified size in the given OpenCL context. If the 'host_ptr' argument is provided, the buffer is initialized with data from that pointer.
+ *
+ * @param ctx The OpenCL context in which the buffer is created
+ * @param size_in_bytes Number of bytes to allocate
+ * @param host_ptr Pointer to data which will be copied to the new array. Must point to at least 'size_in_bytes' bytes of data.
+ *
+ */
+inline cl_mem memory_create(viennacl::ocl::context const & ctx, vcl_size_t size_in_bytes, const void * host_ptr = NULL)
+{
+ //std::cout << "Creating buffer (" << size_in_bytes << " bytes) host buffer " << host_ptr << " in context " << &ctx << std::endl;
+ return ctx.create_memory_without_smart_handle(CL_MEM_READ_WRITE, static_cast<unsigned int>(size_in_bytes), const_cast<void *>(host_ptr));
+}
+
+/** @brief Copies 'bytes_to_copy' bytes from address 'src_buffer + src_offset' in the OpenCL context to memory starting at address 'dst_buffer + dst_offset' in the same OpenCL context.
+ *
+ * @param src_buffer A smart pointer to the beginning of the allocated source OpenCL buffer
+ * @param dst_buffer A smart pointer to the beginning of the allocated destination OpenCL buffer
+ * @param src_offset Offset of the first byte to be written from the address given by 'src_buffer' (in bytes)
+ * @param dst_offset Offset of the first byte to be written to the address given by 'dst_buffer' (in bytes)
+ * @param bytes_to_copy Number of bytes to be copied
+ */
+inline void memory_copy(viennacl::ocl::handle<cl_mem> const & src_buffer,
+ viennacl::ocl::handle<cl_mem> & dst_buffer,
+ vcl_size_t src_offset,
+ vcl_size_t dst_offset,
+ vcl_size_t bytes_to_copy)
+{
+ assert( &src_buffer.context() == &dst_buffer.context() && bool("Transfer between memory buffers in different contexts not supported yet!"));
+
+ viennacl::ocl::context & memory_context = const_cast<viennacl::ocl::context &>(src_buffer.context());
+ cl_int err = clEnqueueCopyBuffer(memory_context.get_queue().handle().get(),
+ src_buffer.get(),
+ dst_buffer.get(),
+ src_offset,
+ dst_offset,
+ bytes_to_copy,
+ 0, NULL, NULL); //events
+ VIENNACL_ERR_CHECK(err);
+}
+
+
+/** @brief Writes data from main RAM identified by 'ptr' to the OpenCL buffer identified by 'dst_buffer'
+ *
+ * @param dst_buffer A smart pointer to the beginning of an allocated OpenCL buffer
+ * @param dst_offset Offset of the first written byte from the beginning of 'dst_buffer' (in bytes)
+ * @param bytes_to_copy Number of bytes to be copied
+ * @param ptr Pointer to the first byte to be written
+ * @param async Whether the operation should be asynchronous
+ */
+inline void memory_write(viennacl::ocl::handle<cl_mem> & dst_buffer,
+ vcl_size_t dst_offset,
+ vcl_size_t bytes_to_copy,
+ const void * ptr,
+ bool async = false)
+{
+
+ viennacl::ocl::context & memory_context = const_cast<viennacl::ocl::context &>(dst_buffer.context());
+
+#if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_DEVICE)
+ std::cout << "Writing data (" << bytes_to_copy << " bytes, offset " << dst_offset << ") to OpenCL buffer " << dst_buffer.get() << " with queue " << memory_context.get_queue().handle().get() << " from " << ptr << std::endl;
+#endif
+
+ cl_int err = clEnqueueWriteBuffer(memory_context.get_queue().handle().get(),
+ dst_buffer.get(),
+ async ? CL_FALSE : CL_TRUE, //blocking
+ dst_offset,
+ bytes_to_copy,
+ ptr,
+ 0, NULL, NULL); //events
+ VIENNACL_ERR_CHECK(err);
+}
+
+
+/** @brief Reads data from an OpenCL buffer back to main RAM.
+ *
+ * @param src_buffer A smart pointer to the beginning of an allocated OpenCL source buffer
+ * @param src_offset Offset of the first byte to be read from the beginning of src_buffer (in bytes)
+ * @param bytes_to_copy Number of bytes to be read
+ * @param ptr Location in main RAM where the read data should be written
+ * @param async Whether the operation should be asynchronous
+ */
+inline void memory_read(viennacl::ocl::handle<cl_mem> const & src_buffer,
+ vcl_size_t src_offset,
+ vcl_size_t bytes_to_copy,
+ void * ptr,
+ bool async = false)
+{
+ //std::cout << "Reading data (" << bytes_to_copy << " bytes, offset " << src_offset << ") from OpenCL buffer " << src_buffer.get() << " to " << ptr << std::endl;
+ viennacl::ocl::context & memory_context = const_cast<viennacl::ocl::context &>(src_buffer.context());
+ cl_int err = clEnqueueReadBuffer(memory_context.get_queue().handle().get(),
+ src_buffer.get(),
+ async ? CL_FALSE : CL_TRUE, //blocking
+ src_offset,
+ bytes_to_copy,
+ ptr,
+ 0, NULL, NULL); //events
+ VIENNACL_ERR_CHECK(err);
+}
+
+
+}
+} //backend
+} //viennacl
+#endif
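A hedged OpenCL round-trip sketch; it assumes viennacl::ocl::current_context() and the viennacl::ocl::handle&lt;cl_mem&gt; constructor taking a raw cl_mem plus a context, both of which live in headers not shown here.

    #include <vector>
    #include "viennacl/ocl/backend.hpp"
    #include "viennacl/backend/opencl.hpp"   // requires VIENNACL_WITH_OPENCL

    int main()
    {
      std::vector<float> in(32, 1.5f), out(32, 0.0f);

      viennacl::ocl::context & ctx = viennacl::ocl::current_context();

      // wrap the raw cl_mem in a reference-counted handle bound to the same context
      viennacl::ocl::handle<cl_mem> buf(
          viennacl::backend::opencl::memory_create(ctx, in.size() * sizeof(float), &in[0]), ctx);

      // blocking read back to host memory
      viennacl::backend::opencl::memory_read(buf, 0, out.size() * sizeof(float), &out[0]);

      return (out[0] == in[0]) ? 0 : 1;
    }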
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/backend/util.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/backend/util.hpp b/native-viennaCL/src/main/cpp/viennacl/backend/util.hpp
new file mode 100644
index 0000000..9aaeb2e
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/backend/util.hpp
@@ -0,0 +1,268 @@
+#ifndef VIENNACL_BACKEND_UTIL_HPP
+#define VIENNACL_BACKEND_UTIL_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/backend/util.hpp
+ @brief Helper functionality for working with different memory domains
+*/
+
+#include <vector>
+#include <cassert>
+
+#include "viennacl/forwards.h"
+#include "viennacl/backend/mem_handle.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+#include "viennacl/backend/opencl.hpp"
+#endif
+
+
+namespace viennacl
+{
+namespace backend
+{
+namespace detail
+{
+
+ /** @brief Helper struct for converting a type to its OpenCL counterpart. */
+ template<typename T>
+ struct convert_to_opencl
+ {
+ typedef T type;
+ enum { special = 0 };
+ };
+
+#ifdef VIENNACL_WITH_OPENCL
+ template<>
+ struct convert_to_opencl<unsigned int>
+ {
+ typedef cl_uint type;
+ //enum { special = (sizeof(unsigned int) != sizeof(cl_uint)) };
+ enum { special = 1 };
+ };
+
+ template<>
+ struct convert_to_opencl<int>
+ {
+ typedef cl_int type;
+ //enum { special = (sizeof(int) != sizeof(cl_int)) };
+ enum { special = 1 };
+ };
+
+
+ template<>
+ struct convert_to_opencl<unsigned long>
+ {
+ typedef cl_ulong type;
+ //enum { special = (sizeof(unsigned long) != sizeof(cl_ulong)) };
+ enum { special = 1 };
+ };
+
+ template<>
+ struct convert_to_opencl<long>
+ {
+ typedef cl_long type;
+ //enum { special = (sizeof(long) != sizeof(cl_long)) };
+ enum { special = 1 };
+ };
+#endif
+
+
+} //namespace detail
+
+
+/** @brief Helper class implementing an array on the host. Default case: No conversion necessary */
+template<typename T, bool special = detail::convert_to_opencl<T>::special>
+class typesafe_host_array
+{
+ typedef T cpu_type;
+ typedef typename detail::convert_to_opencl<T>::type target_type;
+
+public:
+ explicit typesafe_host_array() : bytes_buffer_(NULL), buffer_size_(0) {}
+
+ explicit typesafe_host_array(mem_handle const & handle, vcl_size_t num = 0) : bytes_buffer_(NULL), buffer_size_(sizeof(cpu_type) * num)
+ {
+ resize(handle, num);
+ }
+
+ ~typesafe_host_array() { delete[] bytes_buffer_; }
+
+ //
+ // Setter and Getter
+ //
+ void * get() { return reinterpret_cast<void *>(bytes_buffer_); }
+ vcl_size_t raw_size() const { return buffer_size_; }
+ vcl_size_t element_size() const { return sizeof(cpu_type); }
+ vcl_size_t size() const { return buffer_size_ / element_size(); }
+ template<typename U>
+ void set(vcl_size_t index, U value)
+ {
+ reinterpret_cast<cpu_type *>(bytes_buffer_)[index] = static_cast<cpu_type>(value);
+ }
+
+ //
+ // Resize functionality
+ //
+
+ /** @brief Resize without initializing the new memory */
+ void raw_resize(mem_handle const & /*handle*/, vcl_size_t num)
+ {
+ buffer_size_ = sizeof(cpu_type) * num;
+
+ if (num > 0)
+ {
+ delete[] bytes_buffer_;
+
+ bytes_buffer_ = new char[buffer_size_];
+ }
+ }
+
+ /** @brief Resize including initialization of new memory (cf. std::vector<>) */
+ void resize(mem_handle const & handle, vcl_size_t num)
+ {
+ raw_resize(handle, num);
+
+ if (num > 0)
+ {
+ for (vcl_size_t i=0; i<buffer_size_; ++i)
+ bytes_buffer_[i] = 0;
+ }
+ }
+
+ cpu_type operator[](vcl_size_t index) const
+ {
+ assert(index < size() && bool("index out of bounds"));
+
+ return reinterpret_cast<cpu_type *>(bytes_buffer_)[index];
+ }
+
+private:
+ char * bytes_buffer_;
+ vcl_size_t buffer_size_;
+};
+
+
+
+
+/** @brief Special host array type for conversion between OpenCL types and pure CPU types */
+template<typename T>
+class typesafe_host_array<T, true>
+{
+ typedef T cpu_type;
+ typedef typename detail::convert_to_opencl<T>::type target_type;
+
+public:
+ explicit typesafe_host_array() : convert_to_opencl_( (default_memory_type() == OPENCL_MEMORY) ? true : false), bytes_buffer_(NULL), buffer_size_(0) {}
+
+ explicit typesafe_host_array(mem_handle const & handle, vcl_size_t num = 0) : convert_to_opencl_(false), bytes_buffer_(NULL), buffer_size_(sizeof(cpu_type) * num)
+ {
+ resize(handle, num);
+ }
+
+ ~typesafe_host_array() { delete[] bytes_buffer_; }
+
+ //
+ // Setter and Getter
+ //
+
+ template<typename U>
+ void set(vcl_size_t index, U value)
+ {
+#ifdef VIENNACL_WITH_OPENCL
+ if (convert_to_opencl_)
+ reinterpret_cast<target_type *>(bytes_buffer_)[index] = static_cast<target_type>(value);
+ else
+#endif
+ reinterpret_cast<cpu_type *>(bytes_buffer_)[index] = static_cast<cpu_type>(value);
+ }
+
+ void * get() { return reinterpret_cast<void *>(bytes_buffer_); }
+ cpu_type operator[](vcl_size_t index) const
+ {
+ assert(index < size() && bool("index out of bounds"));
+#ifdef VIENNACL_WITH_OPENCL
+ if (convert_to_opencl_)
+ return static_cast<cpu_type>(reinterpret_cast<target_type *>(bytes_buffer_)[index]);
+#endif
+ return reinterpret_cast<cpu_type *>(bytes_buffer_)[index];
+ }
+
+ vcl_size_t raw_size() const { return buffer_size_; }
+ vcl_size_t element_size() const
+ {
+#ifdef VIENNACL_WITH_OPENCL
+ if (convert_to_opencl_)
+ return sizeof(target_type);
+#endif
+ return sizeof(cpu_type);
+ }
+ vcl_size_t size() const { return buffer_size_ / element_size(); }
+
+ //
+ // Resize functionality
+ //
+
+ /** @brief Resize without initializing the new memory */
+ void raw_resize(mem_handle const & handle, vcl_size_t num)
+ {
+ buffer_size_ = sizeof(cpu_type) * num;
+ (void)handle; //silence unused variable warning if compiled without OpenCL support
+
+#ifdef VIENNACL_WITH_OPENCL
+ memory_types mem_type = handle.get_active_handle_id();
+ if (mem_type == MEMORY_NOT_INITIALIZED)
+ mem_type = default_memory_type();
+
+ if (mem_type == OPENCL_MEMORY)
+ {
+ convert_to_opencl_ = true;
+ buffer_size_ = sizeof(target_type) * num;
+ }
+#endif
+
+ if (num > 0)
+ {
+ delete[] bytes_buffer_;
+
+ bytes_buffer_ = new char[buffer_size_];
+ }
+ }
+
+ /** @brief Resize including initialization of new memory (cf. std::vector<>) */
+ void resize(mem_handle const & handle, vcl_size_t num)
+ {
+ raw_resize(handle, num);
+
+ if (num > 0)
+ {
+ for (vcl_size_t i=0; i<buffer_size_; ++i)
+ bytes_buffer_[i] = 0;
+ }
+ }
+
+private:
+ bool convert_to_opencl_;
+ char * bytes_buffer_;
+ vcl_size_t buffer_size_;
+};
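+
+// Usage sketch (illustrative only): a typesafe_host_array is typically filled on the host and then
+// written to a device buffer, e.g. when packing index arrays for a sparse matrix. This assumes 'h'
+// is an already initialized viennacl::backend::mem_handle:
+//
+//   viennacl::backend::typesafe_host_array<unsigned int> host_ids(h, 4);  // room for 4 entries
+//   for (vcl_size_t i = 0; i < host_ids.size(); ++i)
+//     host_ids.set(i, i);                             // converted to cl_uint if the target is OpenCL
+//   viennacl::backend::memory_write(h, 0, host_ids.raw_size(), host_ids.get());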
+
+} //backend
+} //viennacl
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/circulant_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/circulant_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/circulant_matrix.hpp
new file mode 100644
index 0000000..1ee13d5
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/circulant_matrix.hpp
@@ -0,0 +1,359 @@
+#ifndef VIENNACL_CIRCULANT_MATRIX_HPP
+#define VIENNACL_CIRCULANT_MATRIX_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file circulant_matrix.hpp
+ @brief Implementation of the circulant_matrix class for efficient manipulation of circulant matrices. Experimental.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/vector.hpp"
+#include "viennacl/ocl/backend.hpp"
+
+#include "viennacl/linalg/circulant_matrix_operations.hpp"
+
+#include "viennacl/fft.hpp"
+
+namespace viennacl
+{
+/** @brief A Circulant matrix class
+ *
+ * @tparam NumericT The underlying scalar type (either float or double)
+ * @tparam AlignmentV The internal memory size is given by (size()/AlignmentV + 1) * AlignmentV. AlignmentV must be a power of two. Best values are usually 4, 8 or 16; higher values are usually a waste of memory.
+ */
+template<class NumericT, unsigned int AlignmentV>
+class circulant_matrix
+{
+public:
+ typedef viennacl::backend::mem_handle handle_type;
+ typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<NumericT>::ResultType> value_type;
+
+ /**
+ * @brief The default constructor. Does not allocate any memory.
+ *
+ */
+ explicit circulant_matrix() {}
+
+ /**
+ * @brief Creates the matrix with the given size
+ *
+ * @param rows Number of rows of the matrix
+ * @param cols Number of columns of the matrix
+ */
+ explicit circulant_matrix(vcl_size_t rows, vcl_size_t cols) : elements_(rows)
+ {
+ assert(rows == cols && bool("Circulant matrix must be square!"));
+ (void)cols; // avoid 'unused parameter' warning in optimized builds
+ }
+
+ /** @brief Resizes the matrix.
+ * Existing entries can be preserved
+ *
+ * @param sz New size of matrix
+ * @param preserve If true, existing values are preserved.
+ */
+ void resize(vcl_size_t sz, bool preserve = true)
+ {
+ elements_.resize(sz, preserve);
+ }
+
+ /** @brief Returns the OpenCL handle
+ *
+ * @return OpenCL handle
+ */
+ handle_type const & handle() const { return elements_.handle(); }
+
+ /**
+   * @brief Returns the internal viennacl::vector representing the circulant matrix elements
+ *
+ */
+ viennacl::vector<NumericT, AlignmentV> & elements() { return elements_; }
+ viennacl::vector<NumericT, AlignmentV> const & elements() const { return elements_; }
+
+ /**
+ * @brief Returns the number of rows of the matrix
+ */
+ vcl_size_t size1() const { return elements_.size(); }
+
+ /**
+ * @brief Returns the number of columns of the matrix
+ */
+ vcl_size_t size2() const { return elements_.size(); }
+
+  /** @brief Returns the internal size of the matrix representation.
+ * Usually required for launching OpenCL kernels only
+ *
+ * @return Internal size of matrix representation
+ */
+ vcl_size_t internal_size() const { return elements_.internal_size(); }
+
+ /**
+ * @brief Read-write access to a single element of the matrix
+ *
+ * @param row_index Row index of accessed element
+ * @param col_index Column index of accessed element
+ * @return Proxy for matrix entry
+ */
+ entry_proxy<NumericT> operator()(vcl_size_t row_index, vcl_size_t col_index)
+ {
+ long index = static_cast<long>(row_index) - static_cast<long>(col_index);
+
+ assert(row_index < size1() && col_index < size2() && bool("Invalid access"));
+
+ while (index < 0)
+ index += static_cast<long>(size1());
+ return elements_[static_cast<vcl_size_t>(index)];
+ }
+
+ /**
+ * @brief += operation for circulant matrices
+ *
+ * @param that Matrix which will be added
+ * @return Result of addition
+ */
+ circulant_matrix<NumericT, AlignmentV>& operator +=(circulant_matrix<NumericT, AlignmentV>& that)
+ {
+ elements_ += that.elements();
+ return *this;
+ }
+
+private:
+ circulant_matrix(circulant_matrix const &) {}
+ circulant_matrix & operator=(circulant_matrix const & t);
+
+ viennacl::vector<NumericT, AlignmentV> elements_;
+};
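+
+// Usage sketch (illustrative only, assumes an OpenCL-enabled build): only one value per diagonal is
+// stored, and operator() maps the entry (i, j) to the stored element (i - j) mod size1():
+//
+//   viennacl::circulant_matrix<float> circ(4, 4);  // 4x4 circulant matrix, stores only 4 values
+//   circ(0, 0) = 1.0f;                             // sets the value shared by the whole main diagonal
+//   circ(1, 0) = 2.0f;                             // sets the first sub-diagonal (wrapping around)
+//   std::cout << circ << std::endl;                // boost::numeric::ublas-compatible output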
+
+/** @brief Copies a circulant matrix from the std::vector to the OpenCL device (either GPU or multi-core CPU)
+ *
+ *
+ * @param cpu_vec A std::vector on the host.
+ * @param gpu_mat A circulant_matrix from ViennaCL
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void copy(std::vector<NumericT>& cpu_vec, circulant_matrix<NumericT, AlignmentV>& gpu_mat)
+{
+ assert( (gpu_mat.size1() == 0 || cpu_vec.size() == gpu_mat.size1()) && bool("Size mismatch"));
+ copy(cpu_vec, gpu_mat.elements());
+}
+
+/** @brief Copies a circulant matrix from the OpenCL device (either GPU or multi-core CPU) to the std::vector
+ *
+ *
+ * @param gpu_mat A circulant_matrix from ViennaCL
+ * @param cpu_vec A std::vector on the host.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void copy(circulant_matrix<NumericT, AlignmentV>& gpu_mat, std::vector<NumericT>& cpu_vec)
+{
+ assert(cpu_vec.size() == gpu_mat.size1() && bool("Size mismatch"));
+ copy(gpu_mat.elements(), cpu_vec);
+}
+
+/** @brief Copies a circulant matrix from the OpenCL device (either GPU or multi-core CPU) to the matrix-like object
+ *
+ *
+ * @param circ_src A circulant_matrix from ViennaCL
+ * @param com_dst A matrix-like object
+ */
+template<typename NumericT, unsigned int AlignmentV, typename MatrixT>
+void copy(circulant_matrix<NumericT, AlignmentV>& circ_src, MatrixT& com_dst)
+{
+ vcl_size_t size = circ_src.size1();
+ assert(size == viennacl::traits::size1(com_dst) && bool("Size mismatch"));
+ assert(size == viennacl::traits::size2(com_dst) && bool("Size mismatch"));
+ std::vector<NumericT> tmp(size);
+ copy(circ_src, tmp);
+
+ for (vcl_size_t i = 0; i < size; i++)
+ {
+ for (vcl_size_t j = 0; j < size; j++)
+ {
+ long index = static_cast<long>(i) - static_cast<long>(j);
+ if (index < 0)
+ index += static_cast<long>(size);
+ com_dst(i, j) = tmp[static_cast<vcl_size_t>(index)];
+ }
+ }
+}
+
+/** @brief Copies a matrix-like object on the host to a circulant_matrix on the OpenCL device (either GPU or multi-core CPU)
+ *
+ *
+ * @param com_src A matrix-like object on the host
+ * @param circ_dst A circulant_matrix from ViennaCL
+ */
+template<typename NumericT, unsigned int AlignmentV, typename MatrixT>
+void copy(MatrixT& com_src, circulant_matrix<NumericT, AlignmentV>& circ_dst)
+{
+ assert( (circ_dst.size1() == 0 || circ_dst.size1() == viennacl::traits::size1(com_src)) && bool("Size mismatch"));
+ assert( (circ_dst.size2() == 0 || circ_dst.size2() == viennacl::traits::size2(com_src)) && bool("Size mismatch"));
+
+ vcl_size_t size = viennacl::traits::size1(com_src);
+
+ std::vector<NumericT> tmp(size);
+
+ for (vcl_size_t i = 0; i < size; i++) tmp[i] = com_src(i, 0);
+
+ copy(tmp, circ_dst);
+}
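+
+// Sketch of the copy interface (illustrative only): since the first column fully determines a
+// circulant matrix, a std::vector of length n suffices as the host representation:
+//
+//   std::vector<float> first_col(4, 1.0f);         // first column c_0, ..., c_3
+//   viennacl::circulant_matrix<float> circ(4, 4);
+//   viennacl::copy(first_col, circ);               // host -> device
+//   viennacl::copy(circ, first_col);               // device -> host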
+
+/*namespace linalg
+ {
+ template<typename NumericT, unsigned int AlignmentV, unsigned int VECTOR_AlignmentV>
+ void prod_impl(circulant_matrix<NumericT, AlignmentV> const & mat,
+ vector<NumericT, VECTOR_AlignmentV> const & vec,
+ vector<NumericT, VECTOR_AlignmentV>& result) {
+ viennacl::vector<NumericT, VECTOR_AlignmentV> circ(mat.elements().size() * 2);
+ fft::real_to_complex(mat.elements(), circ, mat.elements().size());
+
+ viennacl::vector<NumericT, VECTOR_AlignmentV> tmp(vec.size() * 2);
+ viennacl::vector<NumericT, VECTOR_AlignmentV> tmp2(vec.size() * 2);
+
+ fft::real_to_complex(vec, tmp, vec.size());
+ fft::convolve(circ, tmp, tmp2);
+ fft::complex_to_real(tmp2, result, vec.size());
+ }
+ }*/
+
+/** @brief Prints the matrix. Output is compatible with boost::numeric::ublas
+ *
+ * @param s STL output stream
+ * @param gpu_matrix A ViennaCL circulant matrix
+ */
+template<class NumericT, unsigned int AlignmentV>
+std::ostream & operator<<(std::ostream& s, circulant_matrix<NumericT, AlignmentV>& gpu_matrix)
+{
+ vcl_size_t size = gpu_matrix.size1();
+ std::vector<NumericT> tmp(size);
+ copy(gpu_matrix, tmp);
+ s << "[" << size << "," << size << "](";
+
+ for (vcl_size_t i = 0; i < size; i++)
+ {
+ s << "(";
+ for (vcl_size_t j = 0; j < size; j++)
+ {
+ long index = static_cast<long>(i) - static_cast<long>(j);
+ if (index < 0) index = static_cast<long>(size) + index;
+ s << tmp[vcl_size_t(index)];
+ //s << index;
+ if (j < (size - 1)) s << ",";
+ }
+ s << ")";
+ }
+ s << ")";
+ return s;
+}
+
+//
+// Specify available operations:
+//
+
+/** \cond */
+
+namespace linalg
+{
+namespace detail
+{
+ // x = A * y
+ template<typename T, unsigned int A>
+ struct op_executor<vector_base<T>, op_assign, vector_expression<const circulant_matrix<T, A>, const vector_base<T>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const circulant_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+ {
+ // check for the special case x = A * x
+ if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+ {
+ viennacl::vector<T> temp(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+ lhs = temp;
+ }
+ else
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs);
+ }
+ };
+
+ template<typename T, unsigned int A>
+ struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const circulant_matrix<T, A>, const vector_base<T>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const circulant_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+ {
+ viennacl::vector<T> temp(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+ lhs += temp;
+ }
+ };
+
+ template<typename T, unsigned int A>
+ struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const circulant_matrix<T, A>, const vector_base<T>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const circulant_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+ {
+ viennacl::vector<T> temp(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+ lhs -= temp;
+ }
+ };
+
+
+ // x = A * vec_op
+ template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+ struct op_executor<vector_base<T>, op_assign, vector_expression<const circulant_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const circulant_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+ {
+ viennacl::vector<T> temp(rhs.rhs());
+ viennacl::linalg::prod_impl(rhs.lhs(), temp, lhs);
+ }
+ };
+
+ // x = A * vec_op
+ template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+ struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const circulant_matrix<T, A>, vector_expression<const LHS, const RHS, OP>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const circulant_matrix<T, A>, vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+ {
+ viennacl::vector<T> temp(rhs.rhs());
+ viennacl::vector<T> temp_result(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+ lhs += temp_result;
+ }
+ };
+
+ // x = A * vec_op
+ template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+ struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const circulant_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const circulant_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+ {
+ viennacl::vector<T> temp(rhs.rhs());
+ viennacl::vector<T> temp_result(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+ lhs -= temp_result;
+ }
+ };
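+
+  // Illustrative note: these op_executor specializations are what make expressions such as
+  //
+  //   viennacl::vector<float> y = viennacl::linalg::prod(circ, x);   // circ: circulant_matrix, x: vector
+  //
+  // dispatch to prod_impl(), including the aliasing-safe temporary for the case y = prod(circ, y).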
+
+} // namespace detail
+} // namespace linalg
+
+/** \endcond */
+}
+
+#endif // VIENNACL_CIRCULANT_MATRIX_HPP
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/compressed_compressed_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/compressed_compressed_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/compressed_compressed_matrix.hpp
new file mode 100644
index 0000000..08e15a5
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/compressed_compressed_matrix.hpp
@@ -0,0 +1,110 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_COMPRESSED_COMPRESSED_MATRIX_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_COMPRESSED_COMPRESSED_MATRIX_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/compressed_compressed_matrix.hpp
+ * @brief OpenCL kernel file for compressed_compressed_matrix operations */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+//////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+template<typename StringT>
+void generate_vec_mul(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void vec_mul( \n");
+ source.append(" __global const unsigned int * row_jumper, \n");
+ source.append(" __global const unsigned int * row_indices, \n");
+ source.append(" __global const unsigned int * column_indices, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" uint nonzero_rows, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * x, \n");
+ source.append(" uint4 layout_x, \n");
+ source.append(" "); source.append(numeric_string); source.append(" alpha, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
+ source.append(" uint4 layout_result, \n");
+ source.append(" "); source.append(numeric_string); source.append(" beta) \n");
+ source.append("{ \n");
+ source.append(" for (unsigned int i = get_global_id(0); i < nonzero_rows; i += get_global_size(0)) \n");
+ source.append(" { \n");
+ source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
+ source.append(" unsigned int row_end = row_jumper[i+1]; \n");
+ source.append(" for (unsigned int j = row_jumper[i]; j < row_end; ++j) \n");
+ source.append(" dot_prod += elements[j] * x[column_indices[j] * layout_x.y + layout_x.x]; \n");
+
+ source.append(" if (beta != 0) result[row_indices[i] * layout_result.y + layout_result.x] += alpha * dot_prod; \n");
+ source.append(" else result[row_indices[i] * layout_result.y + layout_result.x] = alpha * dot_prod; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+}
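+
+// Illustrative note: for numeric_string == "float" the appends above assemble, in essence, the
+// following OpenCL kernel (whitespace compressed):
+//
+//   __kernel void vec_mul(__global const unsigned int * row_jumper,
+//                         __global const unsigned int * row_indices,
+//                         __global const unsigned int * column_indices,
+//                         __global const float * elements,
+//                         uint nonzero_rows,
+//                         __global const float * x,  uint4 layout_x,      float alpha,
+//                         __global float * result,   uint4 layout_result, float beta)
+//   {
+//     for (unsigned int i = get_global_id(0); i < nonzero_rows; i += get_global_size(0)) {
+//       float dot_prod = 0;
+//       for (unsigned int j = row_jumper[i]; j < row_jumper[i+1]; ++j)
+//         dot_prod += elements[j] * x[column_indices[j] * layout_x.y + layout_x.x];
+//       if (beta != 0) result[row_indices[i] * layout_result.y + layout_result.x] += alpha * dot_prod;
+//       else           result[row_indices[i] * layout_result.y + layout_result.x]  = alpha * dot_prod;
+//     }
+//   }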
+
+//////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+/** @brief Main kernel class for generating OpenCL kernels for compressed_compressed_matrix. */
+template<typename NumericT>
+struct compressed_compressed_matrix
+{
+ static std::string program_name()
+ {
+ return viennacl::ocl::type_to_string<NumericT>::apply() + "_compressed_compressed_matrix";
+ }
+
+ static void init(viennacl::ocl::context & ctx)
+ {
+ static std::map<cl_context, bool> init_done;
+ if (!init_done[ctx.handle().get()])
+ {
+ viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+ std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+ std::string source;
+ source.reserve(8192);
+
+ viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+ // fully parametrized kernels:
+ generate_vec_mul(source, numeric_string);
+
+ std::string prog_name = program_name();
+ #ifdef VIENNACL_BUILD_INFO
+ std::cout << "Creating program " << prog_name << std::endl;
+ #endif
+ ctx.add_program(source, prog_name);
+ init_done[ctx.handle().get()] = true;
+ } //if
+ } //init
+};
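+
+// Usage sketch (illustrative only, assumes an active OpenCL context):
+//
+//   viennacl::ocl::context & ctx = viennacl::ocl::current_context();
+//   viennacl::linalg::opencl::kernels::compressed_compressed_matrix<float>::init(ctx);  // compiled once per context
+//   viennacl::ocl::kernel & k = ctx.get_kernel(
+//     viennacl::linalg::opencl::kernels::compressed_compressed_matrix<float>::program_name(), "vec_mul");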
+
+} // namespace kernels
+} // namespace opencl
+} // namespace linalg
+} // namespace viennacl
+#endif
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/sparse_matrix_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/sparse_matrix_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/sparse_matrix_operations.hpp
new file mode 100644
index 0000000..9cc5d67
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/sparse_matrix_operations.hpp
@@ -0,0 +1,2081 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_SPARSE_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_HOST_BASED_SPARSE_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/sparse_matrix_operations.hpp
+ @brief Implementations of operations using sparse matrices on the CPU using a single thread or OpenMP.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/host_based/common.hpp"
+#include "viennacl/linalg/host_based/vector_operations.hpp"
+
+#include "viennacl/linalg/host_based/spgemm_vector.hpp"
+
+#include <vector>
+
+#ifdef VIENNACL_WITH_OPENMP
+#include <omp.h>
+#endif
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+//
+// Compressed matrix
+//
+
+namespace detail
+{
+ template<typename NumericT, unsigned int AlignmentV>
+ void row_info(compressed_matrix<NumericT, AlignmentV> const & mat,
+ vector_base<NumericT> & vec,
+ viennacl::linalg::detail::row_info_types info_selector)
+ {
+ NumericT * result_buf = detail::extract_raw_pointer<NumericT>(vec.handle());
+ NumericT const * elements = detail::extract_raw_pointer<NumericT>(mat.handle());
+ unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle1());
+ unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle2());
+
+ for (vcl_size_t row = 0; row < mat.size1(); ++row)
+ {
+ NumericT value = 0;
+ unsigned int row_end = row_buffer[row+1];
+
+ switch (info_selector)
+ {
+ case viennacl::linalg::detail::SPARSE_ROW_NORM_INF: //inf-norm
+ for (unsigned int i = row_buffer[row]; i < row_end; ++i)
+ value = std::max<NumericT>(value, std::fabs(elements[i]));
+ break;
+
+ case viennacl::linalg::detail::SPARSE_ROW_NORM_1: //1-norm
+ for (unsigned int i = row_buffer[row]; i < row_end; ++i)
+ value += std::fabs(elements[i]);
+ break;
+
+ case viennacl::linalg::detail::SPARSE_ROW_NORM_2: //2-norm
+ for (unsigned int i = row_buffer[row]; i < row_end; ++i)
+ value += elements[i] * elements[i];
+ value = std::sqrt(value);
+ break;
+
+ case viennacl::linalg::detail::SPARSE_ROW_DIAGONAL: //diagonal entry
+ for (unsigned int i = row_buffer[row]; i < row_end; ++i)
+ {
+ if (col_buffer[i] == row)
+ {
+ value = elements[i];
+ break;
+ }
+ }
+ break;
+ }
+ result_buf[row] = value;
+ }
+ }
+}
+
+
+/** @brief Carries out matrix-vector multiplication with a compressed_matrix
+*
+* Implementation of the convenience expression result = prod(mat, vec);
+*
+* @param mat The matrix
+* @param vec The vector
+* @param result The result vector
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & mat,
+ const viennacl::vector_base<NumericT> & vec,
+ NumericT alpha,
+ viennacl::vector_base<NumericT> & result,
+ NumericT beta)
+{
+ NumericT * result_buf = detail::extract_raw_pointer<NumericT>(result.handle());
+ NumericT const * vec_buf = detail::extract_raw_pointer<NumericT>(vec.handle());
+ NumericT const * elements = detail::extract_raw_pointer<NumericT>(mat.handle());
+ unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle1());
+ unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle2());
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long row = 0; row < static_cast<long>(mat.size1()); ++row)
+ {
+ NumericT dot_prod = 0;
+ vcl_size_t row_end = row_buffer[row+1];
+ for (vcl_size_t i = row_buffer[row]; i < row_end; ++i)
+ dot_prod += elements[i] * vec_buf[col_buffer[i] * vec.stride() + vec.start()];
+
+ if (beta < 0 || beta > 0)
+ {
+ vcl_size_t index = static_cast<vcl_size_t>(row) * result.stride() + result.start();
+ result_buf[index] = alpha * dot_prod + beta * result_buf[index];
+ }
+ else
+ result_buf[static_cast<vcl_size_t>(row) * result.stride() + result.start()] = alpha * dot_prod;
+ }
+
+}
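+
+// Worked example (illustrative only): with the CSR convention used above, the 2x2 matrix
+//
+//   [ 1 2 ]
+//   [ 0 3 ]
+//
+// is stored as row_buffer = {0, 2, 3}, col_buffer = {0, 1, 1}, elements = {1, 2, 3}; for
+// vec = (1, 1)^T, alpha = 1 and beta = 0 the loop yields result = (1*1 + 2*1, 3*1)^T = (3, 3)^T.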
+
+/** @brief Carries out sparse_matrix-matrix multiplication, the first matrix being compressed
+*
+* Implementation of the convenience expression result = prod(sp_mat, d_mat);
+*
+* @param sp_mat The sparse matrix
+* @param d_mat The dense matrix
+* @param result The result matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_mat,
+ const viennacl::matrix_base<NumericT> & d_mat,
+ viennacl::matrix_base<NumericT> & result) {
+
+ NumericT const * sp_mat_elements = detail::extract_raw_pointer<NumericT>(sp_mat.handle());
+ unsigned int const * sp_mat_row_buffer = detail::extract_raw_pointer<unsigned int>(sp_mat.handle1());
+ unsigned int const * sp_mat_col_buffer = detail::extract_raw_pointer<unsigned int>(sp_mat.handle2());
+
+ NumericT const * d_mat_data = detail::extract_raw_pointer<NumericT>(d_mat);
+ NumericT * result_data = detail::extract_raw_pointer<NumericT>(result);
+
+ vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat);
+ vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat);
+ vcl_size_t d_mat_inc1 = viennacl::traits::stride1(d_mat);
+ vcl_size_t d_mat_inc2 = viennacl::traits::stride2(d_mat);
+ vcl_size_t d_mat_internal_size1 = viennacl::traits::internal_size1(d_mat);
+ vcl_size_t d_mat_internal_size2 = viennacl::traits::internal_size2(d_mat);
+
+ vcl_size_t result_start1 = viennacl::traits::start1(result);
+ vcl_size_t result_start2 = viennacl::traits::start2(result);
+ vcl_size_t result_inc1 = viennacl::traits::stride1(result);
+ vcl_size_t result_inc2 = viennacl::traits::stride2(result);
+ vcl_size_t result_internal_size1 = viennacl::traits::internal_size1(result);
+ vcl_size_t result_internal_size2 = viennacl::traits::internal_size2(result);
+
+ detail::matrix_array_wrapper<NumericT const, row_major, false>
+ d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+ detail::matrix_array_wrapper<NumericT const, column_major, false>
+ d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+
+ detail::matrix_array_wrapper<NumericT, row_major, false>
+ result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+ detail::matrix_array_wrapper<NumericT, column_major, false>
+ result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+
+ if ( d_mat.row_major() ) {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row) {
+ vcl_size_t row_start = sp_mat_row_buffer[row];
+ vcl_size_t row_end = sp_mat_row_buffer[row+1];
+ for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
+ NumericT temp = 0;
+ for (vcl_size_t k = row_start; k < row_end; ++k) {
+ temp += sp_mat_elements[k] * d_mat_wrapper_row(static_cast<vcl_size_t>(sp_mat_col_buffer[k]), col);
+ }
+ if (result.row_major())
+ result_wrapper_row(row, col) = temp;
+ else
+ result_wrapper_col(row, col) = temp;
+ }
+ }
+ }
+ else {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col) {
+ for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row) {
+ vcl_size_t row_start = sp_mat_row_buffer[row];
+ vcl_size_t row_end = sp_mat_row_buffer[row+1];
+ NumericT temp = 0;
+ for (vcl_size_t k = row_start; k < row_end; ++k) {
+ temp += sp_mat_elements[k] * d_mat_wrapper_col(static_cast<vcl_size_t>(sp_mat_col_buffer[k]), static_cast<vcl_size_t>(col));
+ }
+ if (result.row_major())
+ result_wrapper_row(row, col) = temp;
+ else
+ result_wrapper_col(row, col) = temp;
+ }
+ }
+ }
+
+}
+
+/** @brief Carries out matrix-trans(matrix) multiplication, the first matrix being compressed
+* and the second transposed
+*
+* Implementation of the convenience expression result = prod(sp_mat, trans(d_mat));
+*
+* @param sp_mat The sparse matrix
+* @param d_mat The transposed dense matrix
+* @param result The result matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_mat,
+ const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
+ const viennacl::matrix_base<NumericT>,
+ viennacl::op_trans > & d_mat,
+ viennacl::matrix_base<NumericT> & result) {
+
+ NumericT const * sp_mat_elements = detail::extract_raw_pointer<NumericT>(sp_mat.handle());
+ unsigned int const * sp_mat_row_buffer = detail::extract_raw_pointer<unsigned int>(sp_mat.handle1());
+ unsigned int const * sp_mat_col_buffer = detail::extract_raw_pointer<unsigned int>(sp_mat.handle2());
+
+ NumericT const * d_mat_data = detail::extract_raw_pointer<NumericT>(d_mat.lhs());
+ NumericT * result_data = detail::extract_raw_pointer<NumericT>(result);
+
+ vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat.lhs());
+ vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat.lhs());
+ vcl_size_t d_mat_inc1 = viennacl::traits::stride1(d_mat.lhs());
+ vcl_size_t d_mat_inc2 = viennacl::traits::stride2(d_mat.lhs());
+ vcl_size_t d_mat_internal_size1 = viennacl::traits::internal_size1(d_mat.lhs());
+ vcl_size_t d_mat_internal_size2 = viennacl::traits::internal_size2(d_mat.lhs());
+
+ vcl_size_t result_start1 = viennacl::traits::start1(result);
+ vcl_size_t result_start2 = viennacl::traits::start2(result);
+ vcl_size_t result_inc1 = viennacl::traits::stride1(result);
+ vcl_size_t result_inc2 = viennacl::traits::stride2(result);
+ vcl_size_t result_internal_size1 = viennacl::traits::internal_size1(result);
+ vcl_size_t result_internal_size2 = viennacl::traits::internal_size2(result);
+
+ detail::matrix_array_wrapper<NumericT const, row_major, false>
+ d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+ detail::matrix_array_wrapper<NumericT const, column_major, false>
+ d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+
+ detail::matrix_array_wrapper<NumericT, row_major, false>
+ result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+ detail::matrix_array_wrapper<NumericT, column_major, false>
+ result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+
+ if ( d_mat.lhs().row_major() ) {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row) {
+ vcl_size_t row_start = sp_mat_row_buffer[row];
+ vcl_size_t row_end = sp_mat_row_buffer[row+1];
+ for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
+ NumericT temp = 0;
+ for (vcl_size_t k = row_start; k < row_end; ++k) {
+ temp += sp_mat_elements[k] * d_mat_wrapper_row(col, static_cast<vcl_size_t>(sp_mat_col_buffer[k]));
+ }
+ if (result.row_major())
+ result_wrapper_row(row, col) = temp;
+ else
+ result_wrapper_col(row, col) = temp;
+ }
+ }
+ }
+ else {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col) {
+ for (vcl_size_t row = 0; row < sp_mat.size1(); ++row) {
+ vcl_size_t row_start = sp_mat_row_buffer[row];
+ vcl_size_t row_end = sp_mat_row_buffer[row+1];
+ NumericT temp = 0;
+ for (vcl_size_t k = row_start; k < row_end; ++k) {
+ temp += sp_mat_elements[k] * d_mat_wrapper_col(col, static_cast<vcl_size_t>(sp_mat_col_buffer[k]));
+ }
+ if (result.row_major())
+ result_wrapper_row(row, col) = temp;
+ else
+ result_wrapper_col(row, col) = temp;
+ }
+ }
+ }
+
+}
+
+
+/** @brief Carries out sparse_matrix-sparse_matrix multiplication for CSR matrices
+*
+* Implementation of the convenience expression C = prod(A, B);
+* Based on computing C(i, :) = A(i, :) * B via merging the respective rows of B
+*
+* @param A Left factor
+* @param B Right factor
+* @param C Result matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
+ viennacl::compressed_matrix<NumericT, AlignmentV> const & B,
+ viennacl::compressed_matrix<NumericT, AlignmentV> & C)
+{
+
+ NumericT const * A_elements = detail::extract_raw_pointer<NumericT>(A.handle());
+ unsigned int const * A_row_buffer = detail::extract_raw_pointer<unsigned int>(A.handle1());
+ unsigned int const * A_col_buffer = detail::extract_raw_pointer<unsigned int>(A.handle2());
+
+ NumericT const * B_elements = detail::extract_raw_pointer<NumericT>(B.handle());
+ unsigned int const * B_row_buffer = detail::extract_raw_pointer<unsigned int>(B.handle1());
+ unsigned int const * B_col_buffer = detail::extract_raw_pointer<unsigned int>(B.handle2());
+
+ C.resize(A.size1(), B.size2(), false);
+ unsigned int * C_row_buffer = detail::extract_raw_pointer<unsigned int>(C.handle1());
+
+#if defined(VIENNACL_WITH_OPENMP)
+ unsigned int block_factor = 10;
+ unsigned int max_threads = omp_get_max_threads();
+ long chunk_size = long(A.size1()) / long(block_factor * max_threads) + 1;
+#else
+ unsigned int max_threads = 1;
+#endif
+ std::vector<unsigned int> max_length_row_C(max_threads);
+ std::vector<unsigned int *> row_C_temp_index_buffers(max_threads);
+ std::vector<NumericT *> row_C_temp_value_buffers(max_threads);
+
+
+ /*
+ * Stage 1: Determine maximum length of work buffers:
+ */
+
+#if defined(VIENNACL_WITH_OPENMP)
+ #pragma omp parallel for schedule(dynamic, chunk_size)
+#endif
+ for (long i=0; i<long(A.size1()); ++i)
+ {
+ unsigned int row_start_A = A_row_buffer[i];
+ unsigned int row_end_A = A_row_buffer[i+1];
+
+ unsigned int row_C_upper_bound_row = 0;
+ for (unsigned int j = row_start_A; j<row_end_A; ++j)
+ {
+ unsigned int row_B = A_col_buffer[j];
+
+ unsigned int entries_in_row = B_row_buffer[row_B+1] - B_row_buffer[row_B];
+ row_C_upper_bound_row += entries_in_row;
+ }
+
+#ifdef VIENNACL_WITH_OPENMP
+ unsigned int thread_id = omp_get_thread_num();
+#else
+ unsigned int thread_id = 0;
+#endif
+
+ max_length_row_C[thread_id] = std::max(max_length_row_C[thread_id], std::min(row_C_upper_bound_row, static_cast<unsigned int>(B.size2())));
+ }
+
+ // determine global maximum row length
+ for (std::size_t i=1; i<max_length_row_C.size(); ++i)
+ max_length_row_C[0] = std::max(max_length_row_C[0], max_length_row_C[i]);
+
+ // allocate work vectors:
+ for (unsigned int i=0; i<max_threads; ++i)
+ row_C_temp_index_buffers[i] = (unsigned int *)malloc(sizeof(unsigned int)*3*max_length_row_C[0]);
+
+
+ /*
+ * Stage 2: Determine sparsity pattern of C
+ */
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for schedule(dynamic, chunk_size)
+#endif
+ for (long i=0; i<long(A.size1()); ++i)
+ {
+ unsigned int thread_id = 0;
+ #ifdef VIENNACL_WITH_OPENMP
+ thread_id = omp_get_thread_num();
+ #endif
+ unsigned int buffer_len = max_length_row_C[0];
+
+ unsigned int *row_C_vector_1 = row_C_temp_index_buffers[thread_id];
+ unsigned int *row_C_vector_2 = row_C_vector_1 + buffer_len;
+ unsigned int *row_C_vector_3 = row_C_vector_2 + buffer_len;
+
+ unsigned int row_start_A = A_row_buffer[i];
+ unsigned int row_end_A = A_row_buffer[i+1];
+
+ C_row_buffer[i] = row_C_scan_symbolic_vector(row_start_A, row_end_A, A_col_buffer,
+ B_row_buffer, B_col_buffer, static_cast<unsigned int>(B.size2()),
+ row_C_vector_1, row_C_vector_2, row_C_vector_3);
+ }
+
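+  // Illustrative note: at this point C_row_buffer holds per-row nonzero counts; the exclusive scan
+  // below turns e.g. counts {2, 1, 3} into offsets {0, 2, 3} with current_offset = 6, which is
+  // exactly the number of nonzeros reserved for C.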
+ // exclusive scan to obtain row start indices:
+ unsigned int current_offset = 0;
+ for (std::size_t i=0; i<C.size1(); ++i)
+ {
+ unsigned int tmp = C_row_buffer[i];
+ C_row_buffer[i] = current_offset;
+ current_offset += tmp;
+ }
+ C_row_buffer[C.size1()] = current_offset;
+ C.reserve(current_offset, false);
+
+ // allocate work vectors:
+ for (unsigned int i=0; i<max_threads; ++i)
+ row_C_temp_value_buffers[i] = (NumericT *)malloc(sizeof(NumericT)*3*max_length_row_C[0]);
+
+ /*
+ * Stage 3: Compute product (code similar, maybe pull out into a separate function to avoid code duplication?)
+ */
+ NumericT * C_elements = detail::extract_raw_pointer<NumericT>(C.handle());
+ unsigned int * C_col_buffer = detail::extract_raw_pointer<unsigned int>(C.handle2());
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for schedule(dynamic, chunk_size)
+#endif
+ for (long i = 0; i < long(A.size1()); ++i)
+ {
+ unsigned int row_start_A = A_row_buffer[i];
+ unsigned int row_end_A = A_row_buffer[i+1];
+
+ unsigned int row_C_buffer_start = C_row_buffer[i];
+ unsigned int row_C_buffer_end = C_row_buffer[i+1];
+
+#ifdef VIENNACL_WITH_OPENMP
+ unsigned int thread_id = omp_get_thread_num();
+#else
+ unsigned int thread_id = 0;
+#endif
+
+ unsigned int *row_C_vector_1 = row_C_temp_index_buffers[thread_id];
+ unsigned int *row_C_vector_2 = row_C_vector_1 + max_length_row_C[0];
+ unsigned int *row_C_vector_3 = row_C_vector_2 + max_length_row_C[0];
+
+ NumericT *row_C_vector_1_values = row_C_temp_value_buffers[thread_id];
+ NumericT *row_C_vector_2_values = row_C_vector_1_values + max_length_row_C[0];
+ NumericT *row_C_vector_3_values = row_C_vector_2_values + max_length_row_C[0];
+
+ row_C_scan_numeric_vector(row_start_A, row_end_A, A_col_buffer, A_elements,
+ B_row_buffer, B_col_buffer, B_elements, static_cast<unsigned int>(B.size2()),
+ row_C_buffer_start, row_C_buffer_end, C_col_buffer, C_elements,
+ row_C_vector_1, row_C_vector_1_values,
+ row_C_vector_2, row_C_vector_2_values,
+ row_C_vector_3, row_C_vector_3_values);
+ }
+
+ // clean up at the end:
+ for (unsigned int i=0; i<max_threads; ++i)
+ {
+ free(row_C_temp_index_buffers[i]);
+ free(row_C_temp_value_buffers[i]);
+ }
+
+}
+
+
+
+
+//
+// Triangular solve for compressed_matrix, A \ b
+//
+namespace detail
+{
+ template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
+ void csr_inplace_solve(IndexArrayT const & row_buffer,
+ IndexArrayT const & col_buffer,
+ ConstScalarArrayT const & element_buffer,
+ ScalarArrayT & vec_buffer,
+ vcl_size_t num_cols,
+ viennacl::linalg::unit_lower_tag)
+ {
+ vcl_size_t row_begin = row_buffer[1];
+ for (vcl_size_t row = 1; row < num_cols; ++row)
+ {
+ NumericT vec_entry = vec_buffer[row];
+ vcl_size_t row_end = row_buffer[row+1];
+ for (vcl_size_t i = row_begin; i < row_end; ++i)
+ {
+ vcl_size_t col_index = col_buffer[i];
+ if (col_index < row)
+ vec_entry -= vec_buffer[col_index] * element_buffer[i];
+ }
+ vec_buffer[row] = vec_entry;
+ row_begin = row_end;
+ }
+ }
+
+ template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
+ void csr_inplace_solve(IndexArrayT const & row_buffer,
+ IndexArrayT const & col_buffer,
+ ConstScalarArrayT const & element_buffer,
+ ScalarArrayT & vec_buffer,
+ vcl_size_t num_cols,
+ viennacl::linalg::lower_tag)
+ {
+ vcl_size_t row_begin = row_buffer[0];
+ for (vcl_size_t row = 0; row < num_cols; ++row)
+ {
+ NumericT vec_entry = vec_buffer[row];
+
+ // substitute and remember diagonal entry
+ vcl_size_t row_end = row_buffer[row+1];
+ NumericT diagonal_entry = 0;
+ for (vcl_size_t i = row_begin; i < row_end; ++i)
+ {
+ vcl_size_t col_index = col_buffer[i];
+ if (col_index < row)
+ vec_entry -= vec_buffer[col_index] * element_buffer[i];
+ else if (col_index == row)
+ diagonal_entry = element_buffer[i];
+ }
+
+ vec_buffer[row] = vec_entry / diagonal_entry;
+ row_begin = row_end;
+ }
+ }
+
+
+ template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
+ void csr_inplace_solve(IndexArrayT const & row_buffer,
+ IndexArrayT const & col_buffer,
+ ConstScalarArrayT const & element_buffer,
+ ScalarArrayT & vec_buffer,
+ vcl_size_t num_cols,
+ viennacl::linalg::unit_upper_tag)
+ {
+ for (vcl_size_t row2 = 1; row2 < num_cols; ++row2)
+ {
+ vcl_size_t row = (num_cols - row2) - 1;
+ NumericT vec_entry = vec_buffer[row];
+ vcl_size_t row_begin = row_buffer[row];
+ vcl_size_t row_end = row_buffer[row+1];
+ for (vcl_size_t i = row_begin; i < row_end; ++i)
+ {
+ vcl_size_t col_index = col_buffer[i];
+ if (col_index > row)
+ vec_entry -= vec_buffer[col_index] * element_buffer[i];
+ }
+ vec_buffer[row] = vec_entry;
+ }
+ }
+
+ template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
+ void csr_inplace_solve(IndexArrayT const & row_buffer,
+ IndexArrayT const & col_buffer,
+ ConstScalarArrayT const & element_buffer,
+ ScalarArrayT & vec_buffer,
+ vcl_size_t num_cols,
+ viennacl::linalg::upper_tag)
+ {
+ for (vcl_size_t row2 = 0; row2 < num_cols; ++row2)
+ {
+ vcl_size_t row = (num_cols - row2) - 1;
+ NumericT vec_entry = vec_buffer[row];
+
+ // substitute and remember diagonal entry
+ vcl_size_t row_begin = row_buffer[row];
+ vcl_size_t row_end = row_buffer[row+1];
+ NumericT diagonal_entry = 0;
+ for (vcl_size_t i = row_begin; i < row_end; ++i)
+ {
+ vcl_size_t col_index = col_buffer[i];
+ if (col_index > row)
+ vec_entry -= vec_buffer[col_index] * element_buffer[i];
+ else if (col_index == row)
+ diagonal_entry = element_buffer[i];
+ }
+
+ vec_buffer[row] = vec_entry / diagonal_entry;
+ }
+ }
+
+} //namespace detail
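+
+// Worked example (illustrative only) for the lower_tag variant above: solving
+//
+//   [ 2 0 ] [x0]   [4]
+//   [ 1 3 ] [x1] = [5]
+//
+// proceeds row by row: x0 = 4 / 2 = 2, then x1 = (5 - 1*2) / 3 = 1, overwriting vec_buffer in place.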
+
+
+
+/** @brief Inplace solution of a lower triangular compressed_matrix with unit diagonal. Typically used for LU substitutions
+*
+* @param L The matrix
+* @param vec The vector holding the right hand side. Is overwritten by the solution.
+* @param tag The solver tag identifying the respective triangular solver
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void inplace_solve(compressed_matrix<NumericT, AlignmentV> const & L,
+ vector_base<NumericT> & vec,
+ viennacl::linalg::unit_lower_tag tag)
+{
+ NumericT * vec_buf = detail::extract_raw_pointer<NumericT>(vec.handle());
+ NumericT const * elements = detail::extract_raw_pointer<NumericT>(L.handle());
+ unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
+ unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());
+
+ detail::csr_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, L.size2(), tag);
+}
+
+/** @brief Inplace solution of a lower triangular compressed_matrix. Typically used for LU substitutions
+*
+* @param L The matrix
+* @param vec The vector holding the right hand side. Is overwritten by the solution.
+* @param tag The solver tag identifying the respective triangular solver
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void inplace_solve(compressed_matrix<NumericT, AlignmentV> const & L,
+ vector_base<NumericT> & vec,
+ viennacl::linalg::lower_tag tag)
+{
+ NumericT * vec_buf = detail::extract_raw_pointer<NumericT>(vec.handle());
+ NumericT const * elements = detail::extract_raw_pointer<NumericT>(L.handle());
+ unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
+ unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());
+
+ detail::csr_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, L.size2(), tag);
+}
+
+
+/** @brief Inplace solution of an upper triangular compressed_matrix with unit diagonal. Typically used for LU substitutions
+*
+* @param U The matrix
+* @param vec The vector holding the right hand side. Is overwritten by the solution.
+* @param tag The solver tag identifying the respective triangular solver
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void inplace_solve(compressed_matrix<NumericT, AlignmentV> const & U,
+ vector_base<NumericT> & vec,
+ viennacl::linalg::unit_upper_tag tag)
+{
+ NumericT * vec_buf = detail::extract_raw_pointer<NumericT>(vec.handle());
+ NumericT const * elements = detail::extract_raw_pointer<NumericT>(U.handle());
+ unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(U.handle1());
+ unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(U.handle2());
+
+ detail::csr_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, U.size2(), tag);
+}
+
+/** @brief Inplace solution of an upper triangular compressed_matrix. Typically used for LU substitutions
+*
+* @param U The matrix
+* @param vec The vector holding the right hand side. Is overwritten by the solution.
+* @param tag The solver tag identifying the respective triangular solver
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void inplace_solve(compressed_matrix<NumericT, AlignmentV> const & U,
+ vector_base<NumericT> & vec,
+ viennacl::linalg::upper_tag tag)
+{
+ NumericT * vec_buf = detail::extract_raw_pointer<NumericT>(vec.handle());
+ NumericT const * elements = detail::extract_raw_pointer<NumericT>(U.handle());
+ unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(U.handle1());
+ unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(U.handle2());
+
+ detail::csr_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, U.size2(), tag);
+}
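+
+// Usage sketch (illustrative only): these overloads are what the generic
+// viennacl::linalg::inplace_solve() front-end dispatches to on the host backend, e.g.
+//
+//   viennacl::compressed_matrix<float> L;  // lower triangular CSR factor, data in main memory
+//   viennacl::vector<float> rhs;           // right hand side, overwritten by the solution
+//   viennacl::linalg::inplace_solve(L, rhs, viennacl::linalg::lower_tag());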
+
+
+
+
+
+
+
+//
+// Triangular solve for compressed_matrix, A^T \ b
+//
+
+namespace detail
+{
+ template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
+ void csr_trans_inplace_solve(IndexArrayT const & row_buffer,
+ IndexArrayT const & col_buffer,
+ ConstScalarArrayT const & element_buffer,
+ ScalarArrayT & vec_buffer,
+ vcl_size_t num_cols,
+ viennacl::linalg::unit_lower_tag)
+ {
+ vcl_size_t col_begin = row_buffer[0];
+ for (vcl_size_t col = 0; col < num_cols; ++col)
+ {
+ NumericT vec_entry = vec_buffer[col];
+ vcl_size_t col_end = row_buffer[col+1];
+ for (vcl_size_t i = col_begin; i < col_end; ++i)
+ {
+ unsigned int row_index = col_buffer[i];
+ if (row_index > col)
+ vec_buffer[row_index] -= vec_entry * element_buffer[i];
+ }
+ col_begin = col_end;
+ }
+ }
+
+ template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
+ void csr_trans_inplace_solve(IndexArrayT const & row_buffer,
+ IndexArrayT const & col_buffer,
+ ConstScalarArrayT const & element_buffer,
+ ScalarArrayT & vec_buffer,
+ vcl_size_t num_cols,
+ viennacl::linalg::lower_tag)
+ {
+ vcl_size_t col_begin = row_buffer[0];
+ for (vcl_size_t col = 0; col < num_cols; ++col)
+ {
+ vcl_size_t col_end = row_buffer[col+1];
+
+ // Stage 1: Find diagonal entry:
+ NumericT diagonal_entry = 0;
+ for (vcl_size_t i = col_begin; i < col_end; ++i)
+ {
+ vcl_size_t row_index = col_buffer[i];
+ if (row_index == col)
+ {
+ diagonal_entry = element_buffer[i];
+ break;
+ }
+ }
+
+ // Stage 2: Substitute
+ NumericT vec_entry = vec_buffer[col] / diagonal_entry;
+ vec_buffer[col] = vec_entry;
+ for (vcl_size_t i = col_begin; i < col_end; ++i)
+ {
+ vcl_size_t row_index = col_buffer[i];
+ if (row_index > col)
+ vec_buffer[row_index] -= vec_entry * element_buffer[i];
+ }
+ col_begin = col_end;
+ }
+ }
+
+ template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
+ void csr_trans_inplace_solve(IndexArrayT const & row_buffer,
+ IndexArrayT const & col_buffer,
+ ConstScalarArrayT const & element_buffer,
+ ScalarArrayT & vec_buffer,
+ vcl_size_t num_cols,
+ viennacl::linalg::unit_upper_tag)
+ {
+ for (vcl_size_t col2 = 0; col2 < num_cols; ++col2)
+ {
+ vcl_size_t col = (num_cols - col2) - 1;
+
+ NumericT vec_entry = vec_buffer[col];
+ vcl_size_t col_begin = row_buffer[col];
+ vcl_size_t col_end = row_buffer[col+1];
+ for (vcl_size_t i = col_begin; i < col_end; ++i)
+ {
+ vcl_size_t row_index = col_buffer[i];
+ if (row_index < col)
+ vec_buffer[row_index] -= vec_entry * element_buffer[i];
+ }
+
+ }
+ }
+
+ template<typename NumericT, typename ConstScalarArrayT, typename ScalarArrayT, typename IndexArrayT>
+ void csr_trans_inplace_solve(IndexArrayT const & row_buffer,
+ IndexArrayT const & col_buffer,
+ ConstScalarArrayT const & element_buffer,
+ ScalarArrayT & vec_buffer,
+ vcl_size_t num_cols,
+ viennacl::linalg::upper_tag)
+ {
+ for (vcl_size_t col2 = 0; col2 < num_cols; ++col2)
+ {
+ vcl_size_t col = (num_cols - col2) - 1;
+ vcl_size_t col_begin = row_buffer[col];
+ vcl_size_t col_end = row_buffer[col+1];
+
+ // Stage 1: Find diagonal entry:
+ NumericT diagonal_entry = 0;
+ for (vcl_size_t i = col_begin; i < col_end; ++i)
+ {
+ vcl_size_t row_index = col_buffer[i];
+ if (row_index == col)
+ {
+ diagonal_entry = element_buffer[i];
+ break;
+ }
+ }
+
+ // Stage 2: Substitute
+ NumericT vec_entry = vec_buffer[col] / diagonal_entry;
+ vec_buffer[col] = vec_entry;
+ for (vcl_size_t i = col_begin; i < col_end; ++i)
+ {
+ vcl_size_t row_index = col_buffer[i];
+ if (row_index < col)
+ vec_buffer[row_index] -= vec_entry * element_buffer[i];
+ }
+ }
+ }
+
+
+ //
+ // block solves
+ //
+ template<typename NumericT, unsigned int AlignmentV>
+ void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
+ const compressed_matrix<NumericT, AlignmentV>,
+ op_trans> & L,
+ viennacl::backend::mem_handle const & /* block_indices */, vcl_size_t /* num_blocks */,
+ vector_base<NumericT> const & /* L_diagonal */, //ignored
+ vector_base<NumericT> & vec,
+ viennacl::linalg::unit_lower_tag)
+ {
+ // Note: The following could be implemented more efficiently using the block structure and possibly OpenMP.
+
+ unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L.lhs().handle1());
+ unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L.lhs().handle2());
+ NumericT const * elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(L.lhs().handle());
+ NumericT * vec_buffer = detail::extract_raw_pointer<NumericT>(vec.handle());
+
+ vcl_size_t col_begin = row_buffer[0];
+ for (vcl_size_t col = 0; col < L.lhs().size1(); ++col)
+ {
+ NumericT vec_entry = vec_buffer[col];
+ vcl_size_t col_end = row_buffer[col+1];
+ for (vcl_size_t i = col_begin; i < col_end; ++i)
+ {
+ unsigned int row_index = col_buffer[i];
+ if (row_index > col)
+ vec_buffer[row_index] -= vec_entry * elements[i];
+ }
+ col_begin = col_end;
+ }
+ }
+
+ template<typename NumericT, unsigned int AlignmentV>
+ void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
+ const compressed_matrix<NumericT, AlignmentV>,
+ op_trans> & L,
+ viennacl::backend::mem_handle const & /*block_indices*/, vcl_size_t /* num_blocks */,
+ vector_base<NumericT> const & L_diagonal,
+ vector_base<NumericT> & vec,
+ viennacl::linalg::lower_tag)
+ {
+ // Note: The following could be implemented more efficiently using the block structure and possibly OpenMP.
+
+ unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L.lhs().handle1());
+ unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L.lhs().handle2());
+ NumericT const * elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(L.lhs().handle());
+ NumericT const * diagonal_buffer = detail::extract_raw_pointer<NumericT>(L_diagonal.handle());
+ NumericT * vec_buffer = detail::extract_raw_pointer<NumericT>(vec.handle());
+
+ vcl_size_t col_begin = row_buffer[0];
+ for (vcl_size_t col = 0; col < L.lhs().size1(); ++col)
+ {
+ vcl_size_t col_end = row_buffer[col+1];
+
+ NumericT vec_entry = vec_buffer[col] / diagonal_buffer[col];
+ vec_buffer[col] = vec_entry;
+ for (vcl_size_t i = col_begin; i < col_end; ++i)
+ {
+ vcl_size_t row_index = col_buffer[i];
+ if (row_index > col)
+ vec_buffer[row_index] -= vec_entry * elements[i];
+ }
+ col_begin = col_end;
+ }
+ }
+
+
+
+ template<typename NumericT, unsigned int AlignmentV>
+ void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
+ const compressed_matrix<NumericT, AlignmentV>,
+ op_trans> & U,
+ viennacl::backend::mem_handle const & /*block_indices*/, vcl_size_t /* num_blocks */,
+ vector_base<NumericT> const & /* U_diagonal */, //ignored
+ vector_base<NumericT> & vec,
+ viennacl::linalg::unit_upper_tag)
+ {
+ // Note: The following could be implemented more efficiently using the block structure and possibly OpenMP.
+
+ unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U.lhs().handle1());
+ unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U.lhs().handle2());
+ NumericT const * elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(U.lhs().handle());
+ NumericT * vec_buffer = detail::extract_raw_pointer<NumericT>(vec.handle());
+
+ for (vcl_size_t col2 = 0; col2 < U.lhs().size1(); ++col2)
+ {
+ vcl_size_t col = (U.lhs().size1() - col2) - 1;
+
+ NumericT vec_entry = vec_buffer[col];
+ vcl_size_t col_begin = row_buffer[col];
+ vcl_size_t col_end = row_buffer[col+1];
+ for (vcl_size_t i = col_begin; i < col_end; ++i)
+ {
+ vcl_size_t row_index = col_buffer[i];
+ if (row_index < col)
+ vec_buffer[row_index] -= vec_entry * elements[i];
+ }
+
+ }
+ }
+
+ template<typename NumericT, unsigned int AlignmentV>
+ void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
+ const compressed_matrix<NumericT, AlignmentV>,
+ op_trans> & U,
+ viennacl::backend::mem_handle const & /* block_indices */, vcl_size_t /* num_blocks */,
+ vector_base<NumericT> const & U_diagonal,
+ vector_base<NumericT> & vec,
+ viennacl::linalg::upper_tag)
+ {
+ // Note: The following could be implemented more efficiently using the block structure and possibly OpenMP.
+
+ unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U.lhs().handle1());
+ unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U.lhs().handle2());
+ NumericT const * elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(U.lhs().handle());
+ NumericT const * diagonal_buffer = detail::extract_raw_pointer<NumericT>(U_diagonal.handle());
+ NumericT * vec_buffer = detail::extract_raw_pointer<NumericT>(vec.handle());
+
+ for (vcl_size_t col2 = 0; col2 < U.lhs().size1(); ++col2)
+ {
+ vcl_size_t col = (U.lhs().size1() - col2) - 1;
+ vcl_size_t col_begin = row_buffer[col];
+ vcl_size_t col_end = row_buffer[col+1];
+
+ // Stage 2: Substitute
+ NumericT vec_entry = vec_buffer[col] / diagonal_buffer[col];
+ vec_buffer[col] = vec_entry;
+ for (vcl_size_t i = col_begin; i < col_end; ++i)
+ {
+ vcl_size_t row_index = col_buffer[i];
+ if (row_index < col)
+ vec_buffer[row_index] -= vec_entry * elements[i];
+ }
+ }
+ }
+
+
+} //namespace detail
+
+/** @brief Inplace solution of a lower triangular compressed_matrix with unit diagonal. Typically used for LU substitutions
+*
+* @param proxy Proxy object for a transposed CSR-matrix
+* @param vec The right hand side vector
+* @param tag The solver tag identifying the respective triangular solver
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void inplace_solve(matrix_expression< const compressed_matrix<NumericT, AlignmentV>,
+ const compressed_matrix<NumericT, AlignmentV>,
+ op_trans> const & proxy,
+ vector_base<NumericT> & vec,
+ viennacl::linalg::unit_lower_tag tag)
+{
+ NumericT * vec_buf = detail::extract_raw_pointer<NumericT>(vec.handle());
+ NumericT const * elements = detail::extract_raw_pointer<NumericT>(proxy.lhs().handle());
+ unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle1());
+ unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle2());
+
+ detail::csr_trans_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, proxy.lhs().size1(), tag);
+}
+
+/** @brief Inplace solution of a lower triangular compressed_matrix. Typically used for LU substitutions
+*
+* @param proxy Proxy object for a transposed CSR-matrix
+* @param vec The right hand side vector
+* @param tag The solver tag identifying the respective triangular solver
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void inplace_solve(matrix_expression< const compressed_matrix<NumericT, AlignmentV>,
+ const compressed_matrix<NumericT, AlignmentV>,
+ op_trans> const & proxy,
+ vector_base<NumericT> & vec,
+ viennacl::linalg::lower_tag tag)
+{
+ NumericT * vec_buf = detail::extract_raw_pointer<NumericT>(vec.handle());
+ NumericT const * elements = detail::extract_raw_pointer<NumericT>(proxy.lhs().handle());
+ unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle1());
+ unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle2());
+
+ detail::csr_trans_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, proxy.lhs().size1(), tag);
+}
+
+
+/** @brief Inplace solution of an upper triangular compressed_matrix with unit diagonal. Typically used for LU substitutions
+*
+* @param proxy Proxy object for a transposed CSR-matrix
+* @param vec The right hand side vector
+* @param tag The solver tag identifying the respective triangular solver
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void inplace_solve(matrix_expression< const compressed_matrix<NumericT, AlignmentV>,
+ const compressed_matrix<NumericT, AlignmentV>,
+ op_trans> const & proxy,
+ vector_base<NumericT> & vec,
+ viennacl::linalg::unit_upper_tag tag)
+{
+ NumericT * vec_buf = detail::extract_raw_pointer<NumericT>(vec.handle());
+ NumericT const * elements = detail::extract_raw_pointer<NumericT>(proxy.lhs().handle());
+ unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle1());
+ unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle2());
+
+ detail::csr_trans_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, proxy.lhs().size1(), tag);
+}
+
+
+/** @brief Inplace solution of an upper triangular compressed_matrix. Typically used for LU substitutions
+*
+* @param proxy Proxy object for a transposed CSR-matrix
+* @param vec The right hand side vector
+* @param tag The solver tag identifying the respective triangular solver
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void inplace_solve(matrix_expression< const compressed_matrix<NumericT, AlignmentV>,
+ const compressed_matrix<NumericT, AlignmentV>,
+ op_trans> const & proxy,
+ vector_base<NumericT> & vec,
+ viennacl::linalg::upper_tag tag)
+{
+ NumericT * vec_buf = detail::extract_raw_pointer<NumericT>(vec.handle());
+ NumericT const * elements = detail::extract_raw_pointer<NumericT>(proxy.lhs().handle());
+ unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle1());
+ unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(proxy.lhs().handle2());
+
+ detail::csr_trans_inplace_solve<NumericT>(row_buffer, col_buffer, elements, vec_buf, proxy.lhs().size1(), tag);
+}
+
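+// A minimal usage sketch for the four overloads above, assuming the data lives in
+// host (CPU) memory and the host backend is addressed directly (in typical use these
+// routines are reached through the viennacl::linalg dispatch layer). The identifiers
+// below are purely illustrative:
+//
+//   typedef viennacl::compressed_matrix<double> SparseMat;
+//   SparseMat L;                                 // triangular factor in CSR format
+//   viennacl::vector<double> x(L.size1());       // right hand side, overwritten by the solution
+//   viennacl::matrix_expression<const SparseMat, const SparseMat, viennacl::op_trans> proxy(L, L);
+//   viennacl::linalg::host_based::inplace_solve(proxy, x, viennacl::linalg::unit_lower_tag());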
+
+
+//
+// Compressed Compressed Matrix
+//
+
+/** @brief Carries out matrix-vector multiplication with a compressed_compressed_matrix
+*
+* Implementation of the convenience expression result = prod(mat, vec);
+*
+* @param mat The matrix
+* @param vec The vector
+* @param result The result vector
+*/
+template<typename NumericT>
+void prod_impl(const viennacl::compressed_compressed_matrix<NumericT> & mat,
+ const viennacl::vector_base<NumericT> & vec,
+ NumericT alpha,
+ viennacl::vector_base<NumericT> & result,
+ NumericT beta)
+{
+ NumericT * result_buf = detail::extract_raw_pointer<NumericT>(result.handle());
+ NumericT const * vec_buf = detail::extract_raw_pointer<NumericT>(vec.handle());
+ NumericT const * elements = detail::extract_raw_pointer<NumericT>(mat.handle());
+ unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle1());
+ unsigned int const * row_indices = detail::extract_raw_pointer<unsigned int>(mat.handle3());
+ unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle2());
+
+ if (beta < 0 || beta > 0)
+ {
+ for (vcl_size_t i = 0; i< result.size(); ++i)
+ result_buf[i * result.stride() + result.start()] *= beta;
+ }
+ else // flush
+ {
+ for (vcl_size_t i = 0; i< result.size(); ++i)
+ result_buf[i * result.stride() + result.start()] = 0;
+ }
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i = 0; i < static_cast<long>(mat.nnz1()); ++i)
+ {
+ NumericT dot_prod = 0;
+ vcl_size_t row_end = row_buffer[i+1];
+ for (vcl_size_t j = row_buffer[i]; j < row_end; ++j)
+ dot_prod += elements[j] * vec_buf[col_buffer[j] * vec.stride() + vec.start()];
+
+ if (beta > 0 || beta < 0)
+ result_buf[vcl_size_t(row_indices[i]) * result.stride() + result.start()] += alpha * dot_prod;
+ else
+ result_buf[vcl_size_t(row_indices[i]) * result.stride() + result.start()] = alpha * dot_prod;
+ }
+
+}
+
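+// Usage sketch for the routine above, assuming a host (CPU) context and the usual
+// headers (viennacl/compressed_compressed_matrix.hpp, viennacl/linalg/prod.hpp). The
+// convenience expression below is the common entry point and effectively reaches this
+// backend with alpha = 1 and beta = 0; identifiers are illustrative:
+//
+//   viennacl::compressed_compressed_matrix<double> A;    // stores only rows with nonzeros
+//   viennacl::vector<double> x(A.size2()), y(A.size1());
+//   y = viennacl::linalg::prod(A, x);                     // y = A * x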
+
+
+//
+// Coordinate Matrix
+//
+
+namespace detail
+{
+ template<typename NumericT, unsigned int AlignmentV>
+ void row_info(coordinate_matrix<NumericT, AlignmentV> const & mat,
+ vector_base<NumericT> & vec,
+ viennacl::linalg::detail::row_info_types info_selector)
+ {
+ NumericT * result_buf = detail::extract_raw_pointer<NumericT>(vec.handle());
+ NumericT const * elements = detail::extract_raw_pointer<NumericT>(mat.handle());
+ unsigned int const * coord_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle12());
+
+ NumericT value = 0;
+ unsigned int last_row = 0;
+
+ for (vcl_size_t i = 0; i < mat.nnz(); ++i)
+ {
+ unsigned int current_row = coord_buffer[2*i];
+
+ if (current_row != last_row)
+ {
+ if (info_selector == viennacl::linalg::detail::SPARSE_ROW_NORM_2)
+ value = std::sqrt(value);
+
+ result_buf[last_row] = value;
+ value = 0;
+ last_row = current_row;
+ }
+
+ switch (info_selector)
+ {
+ case viennacl::linalg::detail::SPARSE_ROW_NORM_INF: //inf-norm
+ value = std::max<NumericT>(value, std::fabs(elements[i]));
+ break;
+
+ case viennacl::linalg::detail::SPARSE_ROW_NORM_1: //1-norm
+ value += std::fabs(elements[i]);
+ break;
+
+ case viennacl::linalg::detail::SPARSE_ROW_NORM_2: //2-norm
+ value += elements[i] * elements[i];
+ break;
+
+ case viennacl::linalg::detail::SPARSE_ROW_DIAGONAL: //diagonal entry
+ if (coord_buffer[2*i+1] == current_row)
+ value = elements[i];
+ break;
+
+ //default:
+ // break;
+ }
+ }
+
+ if (info_selector == viennacl::linalg::detail::SPARSE_ROW_NORM_2)
+ value = std::sqrt(value);
+
+ result_buf[last_row] = value;
+ }
+}
+
+/** @brief Carries out matrix-vector multiplication with a coordinate_matrix
+*
+* Implementation of the convenience expression result = prod(mat, vec);
+*
+* @param mat The matrix
+* @param vec The vector
+* @param result The result vector
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & mat,
+ const viennacl::vector_base<NumericT> & vec,
+ NumericT alpha,
+ viennacl::vector_base<NumericT> & result,
+ NumericT beta)
+{
+ NumericT * result_buf = detail::extract_raw_pointer<NumericT>(result.handle());
+ NumericT const * vec_buf = detail::extract_raw_pointer<NumericT>(vec.handle());
+ NumericT const * elements = detail::extract_raw_pointer<NumericT>(mat.handle());
+ unsigned int const * coord_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle12());
+
+ if (beta < 0 || beta > 0)
+ {
+ for (vcl_size_t i = 0; i< result.size(); ++i)
+ result_buf[i * result.stride() + result.start()] *= beta;
+ }
+ else // flush
+ {
+ for (vcl_size_t i = 0; i< result.size(); ++i)
+ result_buf[i * result.stride() + result.start()] = 0;
+ }
+
+ for (vcl_size_t i = 0; i < mat.nnz(); ++i)
+ result_buf[coord_buffer[2*i] * result.stride() + result.start()]
+ += alpha * elements[i] * vec_buf[coord_buffer[2*i+1] * vec.stride() + vec.start()];
+}
+
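+// Usage sketch, assuming a host (CPU) context; the COO kernel above accumulates one
+// nonzero (row, col, value) entry per iteration into the result. Identifiers are
+// illustrative:
+//
+//   viennacl::coordinate_matrix<double> A;                // nonzeros as coordinate pairs
+//   viennacl::vector<double> x(A.size2()), y(A.size1());
+//   y = viennacl::linalg::prod(A, x);                     // y = A * x
+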
+/** @brief Carries out Coordinate Matrix (COO) - Dense Matrix multiplication
+*
+* Implementation of the convenience expression result = prod(sp_mat, d_mat);
+*
+* @param sp_mat The Sparse Matrix (Coordinate format)
+* @param d_mat The Dense Matrix
+* @param result The Result Matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & sp_mat,
+ const viennacl::matrix_base<NumericT> & d_mat,
+ viennacl::matrix_base<NumericT> & result) {
+
+ NumericT const * sp_mat_elements = detail::extract_raw_pointer<NumericT>(sp_mat.handle());
+ unsigned int const * sp_mat_coords = detail::extract_raw_pointer<unsigned int>(sp_mat.handle12());
+
+ NumericT const * d_mat_data = detail::extract_raw_pointer<NumericT>(d_mat);
+ NumericT * result_data = detail::extract_raw_pointer<NumericT>(result);
+
+ vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat);
+ vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat);
+ vcl_size_t d_mat_inc1 = viennacl::traits::stride1(d_mat);
+ vcl_size_t d_mat_inc2 = viennacl::traits::stride2(d_mat);
+ vcl_size_t d_mat_internal_size1 = viennacl::traits::internal_size1(d_mat);
+ vcl_size_t d_mat_internal_size2 = viennacl::traits::internal_size2(d_mat);
+
+ vcl_size_t result_start1 = viennacl::traits::start1(result);
+ vcl_size_t result_start2 = viennacl::traits::start2(result);
+ vcl_size_t result_inc1 = viennacl::traits::stride1(result);
+ vcl_size_t result_inc2 = viennacl::traits::stride2(result);
+ vcl_size_t result_internal_size1 = viennacl::traits::internal_size1(result);
+ vcl_size_t result_internal_size2 = viennacl::traits::internal_size2(result);
+
+ detail::matrix_array_wrapper<NumericT const, row_major, false>
+ d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+ detail::matrix_array_wrapper<NumericT const, column_major, false>
+ d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+
+ detail::matrix_array_wrapper<NumericT, row_major, false>
+ result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+ detail::matrix_array_wrapper<NumericT, column_major, false>
+ result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+
+ if ( d_mat.row_major() ) {
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
+ {
+ if (result.row_major())
+ for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
+ result_wrapper_row(row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+ else
+ for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
+ result_wrapper_col(row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+ }
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i = 0; i < static_cast<long>(sp_mat.nnz()); ++i) {
+ NumericT x = static_cast<NumericT>(sp_mat_elements[i]);
+ vcl_size_t r = static_cast<vcl_size_t>(sp_mat_coords[2*i]);
+ vcl_size_t c = static_cast<vcl_size_t>(sp_mat_coords[2*i+1]);
+ for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
+ NumericT y = d_mat_wrapper_row( c, col);
+ if (result.row_major())
+ result_wrapper_row(r, col) += x * y;
+ else
+ result_wrapper_col(r, col) += x * y;
+ }
+ }
+ }
+
+ else {
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col)
+ {
+ if (result.row_major())
+ for (vcl_size_t row = 0; row < sp_mat.size1(); ++row)
+ result_wrapper_row( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+ else
+ for (vcl_size_t row = 0; row < sp_mat.size1(); ++row)
+ result_wrapper_col( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+ }
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col) {
+
+ for (vcl_size_t i = 0; i < sp_mat.nnz(); ++i) {
+
+ NumericT x = static_cast<NumericT>(sp_mat_elements[i]);
+ vcl_size_t r = static_cast<vcl_size_t>(sp_mat_coords[2*i]);
+ vcl_size_t c = static_cast<vcl_size_t>(sp_mat_coords[2*i+1]);
+ NumericT y = d_mat_wrapper_col( c, col);
+
+ if (result.row_major())
+ result_wrapper_row( r, col) += x*y;
+ else
+ result_wrapper_col( r, col) += x*y;
+ }
+
+ }
+ }
+
+}
+
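+// Usage sketch for the sparse-times-dense product above, assuming a host (CPU)
+// context and a result matrix allocated with matching dimensions; identifiers are
+// illustrative:
+//
+//   viennacl::coordinate_matrix<double> A;                       // m x k, sparse (COO)
+//   viennacl::matrix<double> B(A.size2(), 16), C(A.size1(), 16); // k x 16 and m x 16, dense
+//   C = viennacl::linalg::prod(A, B);                            // C = A * B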
+
+/** @brief Carries out Coordinate Matrix (COO) - Transposed Dense Matrix multiplication
+*
+* Implementation of the convenience expression result = prod(sp_mat, trans(d_mat));
+*
+* @param sp_mat The Sparse Matrix (Coordinate format)
+* @param d_mat The Dense Transposed Matrix
+* @param result The Result Matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & sp_mat,
+ const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
+ const viennacl::matrix_base<NumericT>,
+ viennacl::op_trans > & d_mat,
+ viennacl::matrix_base<NumericT> & result) {
+
+ NumericT const * sp_mat_elements = detail::extract_raw_pointer<NumericT>(sp_mat.handle());
+ unsigned int const * sp_mat_coords = detail::extract_raw_pointer<unsigned int>(sp_mat.handle12());
+
+ NumericT const * d_mat_data = detail::extract_raw_pointer<NumericT>(d_mat.lhs());
+ NumericT * result_data = detail::extract_raw_pointer<NumericT>(result);
+
+ vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat.lhs());
+ vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat.lhs());
+ vcl_size_t d_mat_inc1 = viennacl::traits::stride1(d_mat.lhs());
+ vcl_size_t d_mat_inc2 = viennacl::traits::stride2(d_mat.lhs());
+ vcl_size_t d_mat_internal_size1 = viennacl::traits::internal_size1(d_mat.lhs());
+ vcl_size_t d_mat_internal_size2 = viennacl::traits::internal_size2(d_mat.lhs());
+
+ vcl_size_t result_start1 = viennacl::traits::start1(result);
+ vcl_size_t result_start2 = viennacl::traits::start2(result);
+ vcl_size_t result_inc1 = viennacl::traits::stride1(result);
+ vcl_size_t result_inc2 = viennacl::traits::stride2(result);
+ vcl_size_t result_internal_size1 = viennacl::traits::internal_size1(result);
+ vcl_size_t result_internal_size2 = viennacl::traits::internal_size2(result);
+
+ detail::matrix_array_wrapper<NumericT const, row_major, false>
+ d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+ detail::matrix_array_wrapper<NumericT const, column_major, false>
+ d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+
+ detail::matrix_array_wrapper<NumericT, row_major, false>
+ result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+ detail::matrix_array_wrapper<NumericT, column_major, false>
+ result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+
+ if ( d_mat.lhs().row_major() )
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
+ {
+ if (result.row_major())
+ for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
+ result_wrapper_row( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+ else
+ for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
+ result_wrapper_col( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+ }
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i = 0; i < static_cast<long>(sp_mat.nnz()); ++i) {
+ NumericT x = static_cast<NumericT>(sp_mat_elements[i]);
+ vcl_size_t r = static_cast<vcl_size_t>(sp_mat_coords[2*i]);
+ vcl_size_t c = static_cast<vcl_size_t>(sp_mat_coords[2*i+1]);
+ if (result.row_major())
+ {
+ for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
+ NumericT y = d_mat_wrapper_row( col, c);
+ result_wrapper_row(r, col) += x * y;
+ }
+ }
+ else
+ {
+ for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
+ NumericT y = d_mat_wrapper_row( col, c);
+ result_wrapper_col(r, col) += x * y;
+ }
+ }
+ }
+
+
+ }
+ else
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col)
+ {
+ if (result.row_major())
+ for (vcl_size_t row = 0; row < sp_mat.size1(); ++row)
+ result_wrapper_row( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+ else
+ for (vcl_size_t row = 0; row < sp_mat.size1(); ++row)
+ result_wrapper_col( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+ }
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i = 0; i < static_cast<long>(sp_mat.nnz()); ++i) {
+ NumericT x = static_cast<NumericT>(sp_mat_elements[i]);
+ vcl_size_t r = static_cast<vcl_size_t>(sp_mat_coords[2*i]);
+ vcl_size_t c = static_cast<vcl_size_t>(sp_mat_coords[2*i+1]);
+ if (result.row_major())
+ {
+ for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
+ NumericT y = d_mat_wrapper_col( col, c);
+ result_wrapper_row(r, col) += x * y;
+ }
+ }
+ else
+ {
+ for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
+ NumericT y = d_mat_wrapper_col( col, c);
+ result_wrapper_col(r, col) += x * y;
+ }
+ }
+ }
+ }
+
+}
+
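+// Usage sketch for the transposed variant above, assuming a host (CPU) context. B^T is
+// never materialized: the wrappers simply swap the index order when reading from B.
+// Identifiers are illustrative:
+//
+//   viennacl::coordinate_matrix<double> A;                // m x k, sparse (COO)
+//   viennacl::matrix<double> B(16, A.size2());            // 16 x k, dense
+//   viennacl::matrix<double> C(A.size1(), 16);            // m x 16, dense
+//   C = viennacl::linalg::prod(A, viennacl::trans(B));    // C = A * B^T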
+
+
+//
+// ELL Matrix
+//
+/** @brief Carries out matrix-vector multiplication with an ell_matrix
+*
+* Implementation of the convenience expression result = prod(mat, vec);
+*
+* @param mat The matrix
+* @param vec The vector
+* @param result The result vector
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & mat,
+ const viennacl::vector_base<NumericT> & vec,
+ NumericT alpha,
+ viennacl::vector_base<NumericT> & result,
+ NumericT beta)
+{
+ NumericT * result_buf = detail::extract_raw_pointer<NumericT>(result.handle());
+ NumericT const * vec_buf = detail::extract_raw_pointer<NumericT>(vec.handle());
+ NumericT const * elements = detail::extract_raw_pointer<NumericT>(mat.handle());
+ unsigned int const * coords = detail::extract_raw_pointer<unsigned int>(mat.handle2());
+
+ for (vcl_size_t row = 0; row < mat.size1(); ++row)
+ {
+ NumericT sum = 0;
+
+ for (unsigned int item_id = 0; item_id < mat.internal_maxnnz(); ++item_id)
+ {
+ vcl_size_t offset = row + item_id * mat.internal_size1();
+ NumericT val = elements[offset];
+
+ if (val > 0 || val < 0)
+ {
+ unsigned int col = coords[offset];
+ sum += (vec_buf[col * vec.stride() + vec.start()] * val);
+ }
+ }
+
+ if (beta < 0 || beta > 0)
+ {
+ vcl_size_t index = row * result.stride() + result.start();
+ result_buf[index] = alpha * sum + beta * result_buf[index];
+ }
+ else
+ result_buf[row * result.stride() + result.start()] = alpha * sum;
+ }
+}
+
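+// Usage sketch, assuming a host (CPU) context. ELL storage pads every row to the same
+// number of entries, which is why the loop above skips padding entries via the
+// val != 0 test. Identifiers are illustrative:
+//
+//   viennacl::ell_matrix<double> A;                       // padded ELLPACK storage
+//   viennacl::vector<double> x(A.size2()), y(A.size1());
+//   y = viennacl::linalg::prod(A, x);                     // y = A * x
+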
+/** @brief Carries out ell_matrix - dense matrix multiplication
+*
+* Implementation of the convenience expression result = prod(sp_mat, d_mat);
+*
+* @param sp_mat The sparse(ELL) matrix
+* @param d_mat The dense matrix
+* @param result The result dense matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & sp_mat,
+ const viennacl::matrix_base<NumericT> & d_mat,
+ viennacl::matrix_base<NumericT> & result)
+{
+ NumericT const * sp_mat_elements = detail::extract_raw_pointer<NumericT>(sp_mat.handle());
+ unsigned int const * sp_mat_coords = detail::extract_raw_pointer<unsigned int>(sp_mat.handle2());
+
+ NumericT const * d_mat_data = detail::extract_raw_pointer<NumericT>(d_mat);
+ NumericT * result_data = detail::extract_raw_pointer<NumericT>(result);
+
+ vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat);
+ vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat);
+ vcl_size_t d_mat_inc1 = viennacl::traits::stride1(d_mat);
+ vcl_size_t d_mat_inc2 = viennacl::traits::stride2(d_mat);
+ vcl_size_t d_mat_internal_size1 = viennacl::traits::internal_size1(d_mat);
+ vcl_size_t d_mat_internal_size2 = viennacl::traits::internal_size2(d_mat);
+
+ vcl_size_t result_start1 = viennacl::traits::start1(result);
+ vcl_size_t result_start2 = viennacl::traits::start2(result);
+ vcl_size_t result_inc1 = viennacl::traits::stride1(result);
+ vcl_size_t result_inc2 = viennacl::traits::stride2(result);
+ vcl_size_t result_internal_size1 = viennacl::traits::internal_size1(result);
+ vcl_size_t result_internal_size2 = viennacl::traits::internal_size2(result);
+
+ detail::matrix_array_wrapper<NumericT const, row_major, false>
+ d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+ detail::matrix_array_wrapper<NumericT const, column_major, false>
+ d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+
+ detail::matrix_array_wrapper<NumericT, row_major, false>
+ result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+ detail::matrix_array_wrapper<NumericT, column_major, false>
+ result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+
+ if ( d_mat.row_major() ) {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
+ {
+ if (result.row_major())
+ for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
+ result_wrapper_row( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+ else
+ for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
+ result_wrapper_col( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+ }
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
+ {
+ for (long item_id = 0; item_id < static_cast<long>(sp_mat.maxnnz()); ++item_id)
+ {
+ vcl_size_t offset = static_cast<vcl_size_t>(row) + static_cast<vcl_size_t>(item_id) * sp_mat.internal_size1();
+ NumericT sp_mat_val = static_cast<NumericT>(sp_mat_elements[offset]);
+ vcl_size_t sp_mat_col = static_cast<vcl_size_t>(sp_mat_coords[offset]);
+
+ if (sp_mat_val < 0 || sp_mat_val > 0) // sp_mat_val != 0 without compiler warnings
+ {
+ if (result.row_major())
+ for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
+ result_wrapper_row(static_cast<vcl_size_t>(row), col) += sp_mat_val * d_mat_wrapper_row( sp_mat_col, col);
+ else
+ for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
+ result_wrapper_col(static_cast<vcl_size_t>(row), col) += sp_mat_val * d_mat_wrapper_row( sp_mat_col, col);
+ }
+ }
+ }
+ }
+ else {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col)
+ {
+ if (result.row_major())
+ for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
+ result_wrapper_row( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+ else
+ for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
+ result_wrapper_col( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+ }
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col) {
+
+ for (unsigned int item_id = 0; item_id < sp_mat.maxnnz(); ++item_id) {
+
+ for (vcl_size_t row = 0; row < sp_mat.size1(); ++row) {
+
+ vcl_size_t offset = row + item_id * sp_mat.internal_size1();
+ NumericT sp_mat_val = static_cast<NumericT>(sp_mat_elements[offset]);
+ vcl_size_t sp_mat_col = static_cast<vcl_size_t>(sp_mat_coords[offset]);
+
+ if (sp_mat_val < 0 || sp_mat_val > 0) // sp_mat_val != 0 without compiler warnings
+ {
+ if (result.row_major())
+ result_wrapper_row( row, col) += sp_mat_val * d_mat_wrapper_col( sp_mat_col, col);
+ else
+ result_wrapper_col( row, col) += sp_mat_val * d_mat_wrapper_col( sp_mat_col, col);
+ }
+ }
+ }
+ }
+ }
+
+}
+
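+// Usage sketch for the ELL-times-dense product above, assuming a host (CPU) context;
+// identifiers are illustrative:
+//
+//   viennacl::ell_matrix<double> A;                             // m x k, sparse (ELL)
+//   viennacl::matrix<double> B(A.size2(), 8), C(A.size1(), 8);  // k x 8 and m x 8, dense
+//   C = viennacl::linalg::prod(A, B);                           // C = A * B
+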
+/** @brief Carries out matrix-trans(matrix) multiplication, where the first matrix is a sparse ell_matrix
+* and the second a transposed dense matrix
+*
+* Implementation of the convenience expression result = prod(sp_mat, trans(d_mat));
+*
+* @param sp_mat The sparse matrix
+* @param d_mat The transposed dense matrix
+* @param result The result matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & sp_mat,
+ const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
+ const viennacl::matrix_base<NumericT>,
+ viennacl::op_trans > & d_mat,
+ viennacl::matrix_base<NumericT> & result) {
+
+ NumericT const * sp_mat_elements = detail::extract_raw_pointer<NumericT>(sp_mat.handle());
+ unsigned int const * sp_mat_coords = detail::extract_raw_pointer<unsigned int>(sp_mat.handle2());
+
+ NumericT const * d_mat_data = detail::extract_raw_pointer<NumericT>(d_mat.lhs());
+ NumericT * result_data = detail::extract_raw_pointer<NumericT>(result);
+
+ vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat.lhs());
+ vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat.lhs());
+ vcl_size_t d_mat_inc1 = viennacl::traits::stride1(d_mat.lhs());
+ vcl_size_t d_mat_inc2 = viennacl::traits::stride2(d_mat.lhs());
+ vcl_size_t d_mat_internal_size1 = viennacl::traits::internal_size1(d_mat.lhs());
+ vcl_size_t d_mat_internal_size2 = viennacl::traits::internal_size2(d_mat.lhs());
+
+ vcl_size_t result_start1 = viennacl::traits::start1(result);
+ vcl_size_t result_start2 = viennacl::traits::start2(result);
+ vcl_size_t result_inc1 = viennacl::traits::stride1(result);
+ vcl_size_t result_inc2 = viennacl::traits::stride2(result);
+ vcl_size_t result_internal_size1 = viennacl::traits::internal_size1(result);
+ vcl_size_t result_internal_size2 = viennacl::traits::internal_size2(result);
+
+ detail::matrix_array_wrapper<NumericT const, row_major, false>
+ d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+ detail::matrix_array_wrapper<NumericT const, column_major, false>
+ d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+
+ detail::matrix_array_wrapper<NumericT, row_major, false>
+ result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+ detail::matrix_array_wrapper<NumericT, column_major, false>
+ result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+
+ if ( d_mat.lhs().row_major() )
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long row = 0; row < static_cast<long>(sp_mat.size1()); ++row)
+ {
+ if (result.row_major())
+ for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
+ result_wrapper_row( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+ else
+ for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
+ result_wrapper_col( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+ }
+
+ for (vcl_size_t col = 0; col < d_mat.size2(); ++col) {
+
+ for (unsigned int item_id = 0; item_id < sp_mat.maxnnz(); ++item_id) {
+
+ for (vcl_size_t row = 0; row < sp_mat.size1(); ++row) {
+
+ vcl_size_t offset = row + item_id * sp_mat.internal_size1();
+ NumericT sp_mat_val = static_cast<NumericT>(sp_mat_elements[offset]);
+ vcl_size_t sp_mat_col = static_cast<vcl_size_t>(sp_mat_coords[offset]);
+
+ if (sp_mat_val < 0 || sp_mat_val > 0) // sp_mat_val != 0 without compiler warnings
+ {
+ if (result.row_major())
+ result_wrapper_row( row, col) += sp_mat_val * d_mat_wrapper_row( col, sp_mat_col);
+ else
+ result_wrapper_col( row, col) += sp_mat_val * d_mat_wrapper_row( col, sp_mat_col);
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long col = 0; col < static_cast<long>(d_mat.size2()); ++col)
+ {
+ if (result.row_major())
+ for (vcl_size_t row = 0; row < sp_mat.size1(); ++row)
+ result_wrapper_row( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+ else
+ for (vcl_size_t row = 0; row < sp_mat.size1(); ++row)
+ result_wrapper_col( row, col) = (NumericT)0; /* filling result with zeros, as the product loops are reordered */
+ }
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (vcl_size_t row = 0; row < sp_mat.size1(); ++row) {
+
+ for (long item_id = 0; item_id < static_cast<long>(sp_mat.maxnnz()); ++item_id) {
+
+ vcl_size_t offset = row + static_cast<vcl_size_t>(item_id) * sp_mat.internal_size1();
+ NumericT sp_mat_val = static_cast<NumericT>(sp_mat_elements[offset]);
+ vcl_size_t sp_mat_col = static_cast<vcl_size_t>(sp_mat_coords[offset]);
+
+ if (sp_mat_val < 0 || sp_mat_val > 0) // sp_mat_val != 0 without compiler warnings
+ {
+ if (result.row_major())
+ for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
+ result_wrapper_row( row, col) += sp_mat_val * d_mat_wrapper_col( col, sp_mat_col);
+ else
+ for (vcl_size_t col = 0; col < d_mat.size2(); ++col)
+ result_wrapper_col( row, col) += sp_mat_val * d_mat_wrapper_col( col, sp_mat_col);
+ }
+ }
+ }
+ }
+
+}
+
+
+//
+// SELL-C-\sigma Matrix
+//
+/** @brief Carries out matrix-vector multiplication with a sliced_ell_matrix
+*
+* Implementation of the convenience expression result = prod(mat, vec);
+*
+* @param mat The matrix
+* @param vec The vector
+* @param result The result vector
+*/
+template<typename NumericT, typename IndexT>
+void prod_impl(const viennacl::sliced_ell_matrix<NumericT, IndexT> & mat,
+ const viennacl::vector_base<NumericT> & vec,
+ NumericT alpha,
+ viennacl::vector_base<NumericT> & result,
+ NumericT beta)
+{
+ NumericT * result_buf = detail::extract_raw_pointer<NumericT>(result.handle());
+ NumericT const * vec_buf = detail::extract_raw_pointer<NumericT>(vec.handle());
+ NumericT const * elements = detail::extract_raw_pointer<NumericT>(mat.handle());
+ IndexT const * columns_per_block = detail::extract_raw_pointer<IndexT>(mat.handle1());
+ IndexT const * column_indices = detail::extract_raw_pointer<IndexT>(mat.handle2());
+ IndexT const * block_start = detail::extract_raw_pointer<IndexT>(mat.handle3());
+
+ vcl_size_t num_blocks = mat.size1() / mat.rows_per_block() + 1;
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long block_idx2 = 0; block_idx2 < static_cast<long>(num_blocks); ++block_idx2)
+ {
+ vcl_size_t block_idx = static_cast<vcl_size_t>(block_idx2);
+ vcl_size_t current_columns_per_block = columns_per_block[block_idx];
+
+ std::vector<NumericT> result_values(mat.rows_per_block());
+
+ for (IndexT column_entry_index = 0;
+ column_entry_index < current_columns_per_block;
+ ++column_entry_index)
+ {
+ vcl_size_t stride_start = block_start[block_idx] + column_entry_index * mat.rows_per_block();
+ // Note: This for-loop may be unrolled by hand for exploiting vectorization
+ // Careful benchmarking recommended first, memory channels may be saturated already!
+ for (IndexT row_in_block = 0; row_in_block < mat.rows_per_block(); ++row_in_block)
+ {
+ NumericT val = elements[stride_start + row_in_block];
+
+ result_values[row_in_block] += (val > 0 || val < 0) ? vec_buf[column_indices[stride_start + row_in_block] * vec.stride() + vec.start()] * val : 0;
+ }
+ }
+
+ vcl_size_t first_row_in_matrix = block_idx * mat.rows_per_block();
+ if (beta < 0 || beta > 0)
+ {
+ for (IndexT row_in_block = 0; row_in_block < mat.rows_per_block(); ++row_in_block)
+ {
+ if (first_row_in_matrix + row_in_block < result.size())
+ {
+ vcl_size_t index = (first_row_in_matrix + row_in_block) * result.stride() + result.start();
+ result_buf[index] = alpha * result_values[row_in_block] + beta * result_buf[index];
+ }
+ }
+ }
+ else
+ {
+ for (IndexT row_in_block = 0; row_in_block < mat.rows_per_block(); ++row_in_block)
+ {
+ if (first_row_in_matrix + row_in_block < result.size())
+ result_buf[(first_row_in_matrix + row_in_block) * result.stride() + result.start()] = alpha * result_values[row_in_block];
+ }
+ }
+ }
+}
+
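+// Usage sketch, assuming a host (CPU) context. The kernel above works block-wise:
+// each block covers rows_per_block() consecutive rows and is accumulated into a
+// local buffer before being scaled by alpha and beta. Identifiers are illustrative:
+//
+//   viennacl::sliced_ell_matrix<double, unsigned int> A;  // SELL-C-\sigma storage
+//   viennacl::vector<double> x(A.size2()), y(A.size1());
+//   y = viennacl::linalg::prod(A, x);                      // y = A * x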
+
+//
+// Hybrid Matrix
+//
+/** @brief Carries out matrix-vector multiplication with a hyb_matrix
+*
+* Implementation of the convenience expression result = prod(mat, vec);
+*
+* @param mat The matrix
+* @param vec The vector
+* @param result The result vector
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
+ const viennacl::vector_base<NumericT> & vec,
+ NumericT alpha,
+ viennacl::vector_base<NumericT> & result,
+ NumericT beta)
+{
+ NumericT * result_buf = detail::extract_raw_pointer<NumericT>(result.handle());
+ NumericT const * vec_buf = detail::extract_raw_pointer<NumericT>(vec.handle());
+ NumericT const * elements = detail::extract_raw_pointer<NumericT>(mat.handle());
+ unsigned int const * coords = detail::extract_raw_pointer<unsigned int>(mat.handle2());
+ NumericT const * csr_elements = detail::extract_raw_pointer<NumericT>(mat.handle5());
+ unsigned int const * csr_row_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle3());
+ unsigned int const * csr_col_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle4());
+
+
+ for (vcl_size_t row = 0; row < mat.size1(); ++row)
+ {
+ NumericT sum = 0;
+
+ //
+ // Part 1: Process ELL part
+ //
+ for (unsigned int item_id = 0; item_id < mat.internal_ellnnz(); ++item_id)
+ {
+ vcl_size_t offset = row + item_id * mat.internal_size1();
+ NumericT val = elements[offset];
+
+ if (val > 0 || val < 0)
+ {
+ unsigned int col = coords[offset];
+ sum += (vec_buf[col * vec.stride() + vec.start()] * val);
+ }
+ }
+
+ //
+ // Part 2: Process CSR part
+ //
+ vcl_size_t col_begin = csr_row_buffer[row];
+ vcl_size_t col_end = csr_row_buffer[row + 1];
+
+ for (vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
+ {
+ sum += (vec_buf[csr_col_buffer[item_id] * vec.stride() + vec.start()] * csr_elements[item_id]);
+ }
+
+ if (beta < 0 || beta > 0)
+ {
+ vcl_size_t index = row * result.stride() + result.start();
+ result_buf[index] = alpha * sum + beta * result_buf[index];
+ }
+ else
+ result_buf[row * result.stride() + result.start()] = alpha * sum;
+ }
+
+}
+
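+// Usage sketch, assuming a host (CPU) context. As the two stages above show, HYB
+// storage combines a regular ELL part with a CSR remainder for overly long rows,
+// and both contribute to each output entry. Identifiers are illustrative:
+//
+//   viennacl::hyb_matrix<double> A;                        // ELL + CSR hybrid storage
+//   viennacl::vector<double> x(A.size2()), y(A.size1());
+//   y = viennacl::linalg::prod(A, x);                      // y = A * x
+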
+//
+// Hybrid Matrix times dense matrix
+//
+/** @brief Carries out sparse-matrix-dense-matrix multiplication with a hyb_matrix
+*
+* Implementation of the convenience expression C = prod(A, B);
+*
+* @param mat The sparse matrix A
+* @param d_mat The dense matrix B
+* @param result The dense result matrix C
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
+ const viennacl::matrix_base<NumericT> & d_mat,
+ viennacl::matrix_base<NumericT> & result)
+{
+ NumericT const * d_mat_data = detail::extract_raw_pointer<NumericT>(d_mat);
+ NumericT * result_data = detail::extract_raw_pointer<NumericT>(result);
+
+ vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat);
+ vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat);
+ vcl_size_t d_mat_inc1 = viennacl::traits::stride1(d_mat);
+ vcl_size_t d_mat_inc2 = viennacl::traits::stride2(d_mat);
+ vcl_size_t d_mat_internal_size1 = viennacl::traits::internal_size1(d_mat);
+ vcl_size_t d_mat_internal_size2 = viennacl::traits::internal_size2(d_mat);
+
+ vcl_size_t result_start1 = viennacl::traits::start1(result);
+ vcl_size_t result_start2 = viennacl::traits::start2(result);
+ vcl_size_t result_inc1 = viennacl::traits::stride1(result);
+ vcl_size_t result_inc2 = viennacl::traits::stride2(result);
+ vcl_size_t result_internal_size1 = viennacl::traits::internal_size1(result);
+ vcl_size_t result_internal_size2 = viennacl::traits::internal_size2(result);
+
+ detail::matrix_array_wrapper<NumericT const, row_major, false>
+ d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+ detail::matrix_array_wrapper<NumericT const, column_major, false>
+ d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+
+ detail::matrix_array_wrapper<NumericT, row_major, false>
+ result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+ detail::matrix_array_wrapper<NumericT, column_major, false>
+ result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+
+ NumericT const * elements = detail::extract_raw_pointer<NumericT>(mat.handle());
+ unsigned int const * coords = detail::extract_raw_pointer<unsigned int>(mat.handle2());
+ NumericT const * csr_elements = detail::extract_raw_pointer<NumericT>(mat.handle5());
+ unsigned int const * csr_row_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle3());
+ unsigned int const * csr_col_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle4());
+
+
+ for (vcl_size_t result_col = 0; result_col < result.size2(); ++result_col)
+ {
+ for (vcl_size_t row = 0; row < mat.size1(); ++row)
+ {
+ NumericT sum = 0;
+
+ //
+ // Part 1: Process ELL part
+ //
+ for (unsigned int item_id = 0; item_id < mat.internal_ellnnz(); ++item_id)
+ {
+ vcl_size_t offset = row + item_id * mat.internal_size1();
+ NumericT val = elements[offset];
+
+ if (val < 0 || val > 0) // val != 0 without compiler warnings
+ {
+ vcl_size_t col = static_cast<vcl_size_t>(coords[offset]);
+ if (d_mat.row_major())
+ sum += d_mat_wrapper_row(col, result_col) * val;
+ else
+ sum += d_mat_wrapper_col(col, result_col) * val;
+ }
+ }
+
+ //
+ // Part 2: Process HYB/CSR part
+ //
+ vcl_size_t col_begin = csr_row_buffer[row];
+ vcl_size_t col_end = csr_row_buffer[row + 1];
+
+ if (d_mat.row_major())
+ for (vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
+ sum += d_mat_wrapper_row(static_cast<vcl_size_t>(csr_col_buffer[item_id]), result_col) * csr_elements[item_id];
+ else
+ for (vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
+ sum += d_mat_wrapper_col(static_cast<vcl_size_t>(csr_col_buffer[item_id]), result_col) * csr_elements[item_id];
+
+ if (result.row_major())
+ result_wrapper_row(row, result_col) = sum;
+ else
+ result_wrapper_col(row, result_col) = sum;
+ }
+ } // for result_col
+}
+
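+// Usage sketch for the HYB-times-dense product above, assuming a host (CPU) context;
+// identifiers are illustrative:
+//
+//   viennacl::hyb_matrix<double> A;                             // m x k, sparse (HYB)
+//   viennacl::matrix<double> B(A.size2(), 8), C(A.size1(), 8);  // k x 8 and m x 8, dense
+//   C = viennacl::linalg::prod(A, B);                           // C = A * B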
+
+/** @brief Carries out sparse-matrix-transposed-dense-matrix multiplication with a hyb_matrix
+*
+* Implementation of the convenience expression C = prod(A, trans(B));
+*
+* @param mat The sparse matrix A
+* @param d_mat The dense matrix B
+* @param result The dense result matrix C
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
+ const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
+ const viennacl::matrix_base<NumericT>,
+ viennacl::op_trans > & d_mat,
+ viennacl::matrix_base<NumericT> & result)
+{
+ NumericT const * d_mat_data = detail::extract_raw_pointer<NumericT>(d_mat.lhs());
+ NumericT * result_data = detail::extract_raw_pointer<NumericT>(result);
+
+ vcl_size_t d_mat_start1 = viennacl::traits::start1(d_mat.lhs());
+ vcl_size_t d_mat_start2 = viennacl::traits::start2(d_mat.lhs());
+ vcl_size_t d_mat_inc1 = viennacl::traits::stride1(d_mat.lhs());
+ vcl_size_t d_mat_inc2 = viennacl::traits::stride2(d_mat.lhs());
+ vcl_size_t d_mat_internal_size1 = viennacl::traits::internal_size1(d_mat.lhs());
+ vcl_size_t d_mat_internal_size2 = viennacl::traits::internal_size2(d_mat.lhs());
+
+ vcl_size_t result_start1 = viennacl::traits::start1(result);
+ vcl_size_t result_start2 = viennacl::traits::start2(result);
+ vcl_size_t result_inc1 = viennacl::traits::stride1(result);
+ vcl_size_t result_inc2 = viennacl::traits::stride2(result);
+ vcl_size_t result_internal_size1 = viennacl::traits::internal_size1(result);
+ vcl_size_t result_internal_size2 = viennacl::traits::internal_size2(result);
+
+ detail::matrix_array_wrapper<NumericT const, row_major, false>
+ d_mat_wrapper_row(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+ detail::matrix_array_wrapper<NumericT const, column_major, false>
+ d_mat_wrapper_col(d_mat_data, d_mat_start1, d_mat_start2, d_mat_inc1, d_mat_inc2, d_mat_internal_size1, d_mat_internal_size2);
+
+ detail::matrix_array_wrapper<NumericT, row_major, false>
+ result_wrapper_row(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+ detail::matrix_array_wrapper<NumericT, column_major, false>
+ result_wrapper_col(result_data, result_start1, result_start2, result_inc1, result_inc2, result_internal_size1, result_internal_size2);
+
+ NumericT const * elements = detail::extract_raw_pointer<NumericT>(mat.handle());
+ unsigned int const * coords = detail::extract_raw_pointer<unsigned int>(mat.handle2());
+ NumericT const * csr_elements = detail::extract_raw_pointer<NumericT>(mat.handle5());
+ unsigned int const * csr_row_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle3());
+ unsigned int const * csr_col_buffer = detail::extract_raw_pointer<unsigned int>(mat.handle4());
+
+
+ for (vcl_size_t result_col = 0; result_col < result.size2(); ++result_col)
+ {
+ for (vcl_size_t row = 0; row < mat.size1(); ++row)
+ {
+ NumericT sum = 0;
+
+ //
+ // Part 1: Process ELL part
+ //
+ for (unsigned int item_id = 0; item_id < mat.internal_ellnnz(); ++item_id)
+ {
+ vcl_size_t offset = row + item_id * mat.internal_size1();
+ NumericT val = elements[offset];
+
+ if (val < 0 || val > 0) // val != 0 without compiler warnings
+ {
+ vcl_size_t col = static_cast<vcl_size_t>(coords[offset]);
+ if (d_mat.lhs().row_major())
+ sum += d_mat_wrapper_row(result_col, col) * val;
+ else
+ sum += d_mat_wrapper_col(result_col, col) * val;
+ }
+ }
+
+ //
+ // Part 2: Process HYB/CSR part
+ //
+ vcl_size_t col_begin = csr_row_buffer[row];
+ vcl_size_t col_end = csr_row_buffer[row + 1];
+
+ if (d_mat.lhs().row_major())
<TRUNCATED>
[51/51] [partial] mahout git commit: (nojira) add native-viennaCL
module to codebase. closes apache/mahout#241
Posted by ap...@apache.org.
(nojira) add native-viennaCL module to codebase. closes apache/mahout#241
Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/f7c1f802
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/f7c1f802
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/f7c1f802
Branch: refs/heads/master
Commit: f7c1f8026296bae1eab9768564a871ea1dd6583b
Parents: 1fca074
Author: Andrew Palumbo <ap...@apache.org>
Authored: Wed Jun 8 17:37:01 2016 -0400
Committer: Andrew Palumbo <ap...@apache.org>
Committed: Wed Jun 8 17:39:19 2016 -0400
----------------------------------------------------------------------
LICENSE.txt | 29 +
math-scala/pom.xml | 5 +
.../scalabindings/viennacl/ViennaCLTests.scala | 40 +
native-viennaCL/pom.xml | 249 ++
native-viennaCL/src/main/cpp/HelloNative.h | 10 +
.../src/main/cpp/libviennacl/CMakeLists.txt | 40 +
.../main/cpp/libviennacl/include/viennacl.hpp | 617 +++
.../src/main/cpp/libviennacl/src/backend.cpp | 46 +
.../src/main/cpp/libviennacl/src/backend.cu | 46 +
.../src/main/cpp/libviennacl/src/blas1.cpp | 420 ++
.../src/main/cpp/libviennacl/src/blas1.cu | 420 ++
.../src/main/cpp/libviennacl/src/blas1_cuda.cu | 264 ++
.../src/main/cpp/libviennacl/src/blas1_host.cpp | 293 ++
.../src/main/cpp/libviennacl/src/blas1_host.cu | 293 ++
.../main/cpp/libviennacl/src/blas1_opencl.cpp | 297 ++
.../main/cpp/libviennacl/src/blas1_opencl.cu | 297 ++
.../src/main/cpp/libviennacl/src/blas2.cpp | 231 +
.../src/main/cpp/libviennacl/src/blas2.cu | 231 +
.../src/main/cpp/libviennacl/src/blas2_cuda.cu | 204 +
.../src/main/cpp/libviennacl/src/blas2_host.cpp | 219 +
.../src/main/cpp/libviennacl/src/blas2_host.cu | 219 +
.../main/cpp/libviennacl/src/blas2_opencl.cpp | 219 +
.../main/cpp/libviennacl/src/blas2_opencl.cu | 219 +
.../src/main/cpp/libviennacl/src/blas3.cpp | 272 ++
.../src/main/cpp/libviennacl/src/blas3.cu | 272 ++
.../src/main/cpp/libviennacl/src/blas3.hpp | 60 +
.../src/main/cpp/libviennacl/src/blas3_cuda.cu | 133 +
.../src/main/cpp/libviennacl/src/blas3_host.cpp | 131 +
.../src/main/cpp/libviennacl/src/blas3_host.cu | 131 +
.../main/cpp/libviennacl/src/blas3_opencl.cpp | 136 +
.../main/cpp/libviennacl/src/blas3_opencl.cu | 136 +
.../main/cpp/libviennacl/src/init_matrix.hpp | 101 +
.../main/cpp/libviennacl/src/init_vector.hpp | 101 +
.../cpp/libviennacl/src/viennacl_private.hpp | 141 +
.../src/main/cpp/viennacl/backend/cpu_ram.hpp | 171 +
.../src/main/cpp/viennacl/backend/cuda.hpp | 206 +
.../main/cpp/viennacl/backend/mem_handle.hpp | 250 ++
.../src/main/cpp/viennacl/backend/memory.hpp | 628 +++
.../src/main/cpp/viennacl/backend/opencl.hpp | 151 +
.../src/main/cpp/viennacl/backend/util.hpp | 268 ++
.../src/main/cpp/viennacl/circulant_matrix.hpp | 359 ++
.../viennacl/compressed_compressed_matrix.hpp | 619 +++
.../src/main/cpp/viennacl/compressed_matrix.hpp | 1178 ++++++
.../src/main/cpp/viennacl/context.hpp | 88 +
.../src/main/cpp/viennacl/coordinate_matrix.hpp | 506 +++
.../src/main/cpp/viennacl/detail/matrix_def.hpp | 270 ++
.../src/main/cpp/viennacl/detail/vector_def.hpp | 349 ++
.../device_specific/builtin_database/common.hpp | 219 +
.../devices/accelerator/fallback.hpp | 85 +
.../builtin_database/devices/cpu/fallback.hpp | 84 +
.../devices/gpu/amd/evergreen/cedar.hpp | 64 +
.../devices/gpu/amd/evergreen/cypress.hpp | 65 +
.../devices/gpu/amd/northern_islands/barts.hpp | 64 +
.../gpu/amd/northern_islands/devastator.hpp | 64 +
.../gpu/amd/northern_islands/scrapper.hpp | 64 +
.../devices/gpu/amd/southern_islands/tahiti.hpp | 84 +
.../devices/gpu/amd/volcanic_islands/hawaii.hpp | 84 +
.../builtin_database/devices/gpu/fallback.hpp | 84 +
.../gpu/nvidia/fermi/geforce_gt_540m.hpp | 59 +
.../gpu/nvidia/fermi/geforce_gtx_470.hpp | 83 +
.../gpu/nvidia/fermi/geforce_gtx_580.hpp | 84 +
.../devices/gpu/nvidia/fermi/tesla_c2050.hpp | 84 +
.../devices/gpu/nvidia/kepler/tesla_k20m.hpp | 84 +
.../gpu/nvidia/maxwell/geforce_gtx_750_ti.hpp | 85 +
.../gpu/nvidia/tesla/geforce_gtx_260.hpp | 84 +
.../builtin_database/matrix_product.hpp | 244 ++
.../cpp/viennacl/device_specific/execute.hpp | 55 +
.../device_specific/execution_handler.hpp | 102 +
.../cpp/viennacl/device_specific/forwards.h | 294 ++
.../device_specific/lazy_program_compiler.hpp | 74 +
.../viennacl/device_specific/mapped_objects.hpp | 512 +++
.../templates/matrix_product_template.hpp | 859 ++++
.../device_specific/templates/template_base.hpp | 596 +++
.../device_specific/templates/utils.hpp | 105 +
.../viennacl/device_specific/tree_parsing.hpp | 512 +++
.../main/cpp/viennacl/device_specific/utils.hpp | 568 +++
.../src/main/cpp/viennacl/ell_matrix.hpp | 362 ++
native-viennaCL/src/main/cpp/viennacl/fft.hpp | 282 ++
.../src/main/cpp/viennacl/forwards.h | 1032 +++++
.../src/main/cpp/viennacl/hankel_matrix.hpp | 343 ++
.../src/main/cpp/viennacl/hyb_matrix.hpp | 442 ++
.../src/main/cpp/viennacl/io/matrix_market.hpp | 440 ++
.../src/main/cpp/viennacl/linalg/amg.hpp | 398 ++
.../main/cpp/viennacl/linalg/amg_operations.hpp | 238 ++
.../src/main/cpp/viennacl/linalg/bicgstab.hpp | 598 +++
.../src/main/cpp/viennacl/linalg/bisect.hpp | 179 +
.../src/main/cpp/viennacl/linalg/bisect_gpu.hpp | 173 +
.../src/main/cpp/viennacl/linalg/cg.hpp | 440 ++
.../linalg/circulant_matrix_operations.hpp | 75 +
.../cpp/viennacl/linalg/cuda/amg_operations.hpp | 821 ++++
.../linalg/cuda/bisect_kernel_calls.hpp | 166 +
.../linalg/cuda/bisect_kernel_large.hpp | 928 ++++
.../linalg/cuda/bisect_kernel_large_multi.hpp | 277 ++
.../linalg/cuda/bisect_kernel_large_onei.hpp | 180 +
.../linalg/cuda/bisect_kernel_small.hpp | 261 ++
.../cpp/viennacl/linalg/cuda/bisect_util.hpp | 613 +++
.../main/cpp/viennacl/linalg/cuda/common.hpp | 250 ++
.../cpp/viennacl/linalg/cuda/direct_solve.hpp | 412 ++
.../cpp/viennacl/linalg/cuda/fft_operations.hpp | 858 ++++
.../cpp/viennacl/linalg/cuda/ilu_operations.hpp | 666 +++
.../linalg/cuda/iterative_operations.hpp | 2049 +++++++++
.../viennacl/linalg/cuda/matrix_operations.hpp | 2725 ++++++++++++
.../linalg/cuda/matrix_operations_col.hpp | 1847 ++++++++
.../linalg/cuda/matrix_operations_prod.hpp | 2887 +++++++++++++
.../linalg/cuda/matrix_operations_row.hpp | 1468 +++++++
.../viennacl/linalg/cuda/misc_operations.hpp | 91 +
.../cpp/viennacl/linalg/cuda/nmf_operations.hpp | 152 +
.../viennacl/linalg/cuda/scalar_operations.hpp | 375 ++
.../linalg/cuda/sparse_matrix_operations.hpp | 2809 +++++++++++++
.../cuda/sparse_matrix_operations_solve.hpp | 761 ++++
.../main/cpp/viennacl/linalg/cuda/spgemm.hpp | 793 ++++
.../cpp/viennacl/linalg/cuda/spgemm_rmerge.hpp | 669 +++
.../viennacl/linalg/cuda/vector_operations.hpp | 3252 ++++++++++++++
.../cpp/viennacl/linalg/detail/amg/amg_base.hpp | 208 +
.../detail/bisect/bisect_kernel_calls.hpp | 191 +
.../linalg/detail/bisect/bisect_large.hpp | 142 +
.../linalg/detail/bisect/bisect_small.hpp | 96 +
.../viennacl/linalg/detail/bisect/config.hpp | 44 +
.../linalg/detail/bisect/gerschgorin.hpp | 94 +
.../viennacl/linalg/detail/bisect/structs.hpp | 182 +
.../cpp/viennacl/linalg/detail/bisect/util.hpp | 106 +
.../viennacl/linalg/detail/ilu/block_ilu.hpp | 617 +++
.../linalg/detail/ilu/chow_patel_ilu.hpp | 316 ++
.../cpp/viennacl/linalg/detail/ilu/common.hpp | 263 ++
.../cpp/viennacl/linalg/detail/ilu/ilu0.hpp | 379 ++
.../cpp/viennacl/linalg/detail/ilu/ilut.hpp | 597 +++
.../cpp/viennacl/linalg/detail/op_applier.hpp | 103 +
.../cpp/viennacl/linalg/detail/op_executor.hpp | 86 +
.../linalg/detail/spai/block_matrix.hpp | 86 +
.../linalg/detail/spai/block_vector.hpp | 77 +
.../cpp/viennacl/linalg/detail/spai/fspai.hpp | 402 ++
.../main/cpp/viennacl/linalg/detail/spai/qr.hpp | 497 +++
.../linalg/detail/spai/small_matrix.hpp | 113 +
.../linalg/detail/spai/spai-dynamic.hpp | 687 +++
.../viennacl/linalg/detail/spai/spai-static.hpp | 192 +
.../cpp/viennacl/linalg/detail/spai/spai.hpp | 832 ++++
.../viennacl/linalg/detail/spai/spai_tag.hpp | 143 +
.../linalg/detail/spai/sparse_vector.hpp | 85 +
.../main/cpp/viennacl/linalg/direct_solve.hpp | 580 +++
.../src/main/cpp/viennacl/linalg/eig.hpp | 29 +
.../main/cpp/viennacl/linalg/fft_operations.hpp | 481 +++
.../src/main/cpp/viennacl/linalg/gmres.hpp | 738 ++++
.../linalg/hankel_matrix_operations.hpp | 66 +
.../linalg/host_based/amg_operations.hpp | 1123 +++++
.../cpp/viennacl/linalg/host_based/common.hpp | 149 +
.../viennacl/linalg/host_based/direct_solve.hpp | 307 ++
.../linalg/host_based/fft_operations.hpp | 856 ++++
.../linalg/host_based/ilu_operations.hpp | 672 +++
.../linalg/host_based/iterative_operations.hpp | 880 ++++
.../linalg/host_based/matrix_operations.hpp | 2052 +++++++++
.../linalg/host_based/misc_operations.hpp | 80 +
.../linalg/host_based/nmf_operations.hpp | 247 ++
.../linalg/host_based/scalar_operations.hpp | 162 +
.../host_based/sparse_matrix_operations.hpp | 2081 +++++++++
.../linalg/host_based/spgemm_vector.hpp | 705 ++++
.../linalg/host_based/vector_operations.hpp | 1188 ++++++
.../src/main/cpp/viennacl/linalg/ichol.hpp | 228 +
.../src/main/cpp/viennacl/linalg/ilu.hpp | 33 +
.../main/cpp/viennacl/linalg/ilu_operations.hpp | 334 ++
.../src/main/cpp/viennacl/linalg/inner_prod.hpp | 186 +
.../viennacl/linalg/iterative_operations.hpp | 425 ++
.../main/cpp/viennacl/linalg/jacobi_precond.hpp | 141 +
.../src/main/cpp/viennacl/linalg/lanczos.hpp | 515 +++
.../src/main/cpp/viennacl/linalg/lu.hpp | 227 +
.../cpp/viennacl/linalg/matrix_operations.hpp | 1303 ++++++
.../src/main/cpp/viennacl/linalg/maxmin.hpp | 152 +
.../cpp/viennacl/linalg/misc_operations.hpp | 94 +
.../cpp/viennacl/linalg/mixed_precision_cg.hpp | 199 +
.../src/main/cpp/viennacl/linalg/nmf.hpp | 91 +
.../src/main/cpp/viennacl/linalg/norm_1.hpp | 104 +
.../src/main/cpp/viennacl/linalg/norm_2.hpp | 140 +
.../main/cpp/viennacl/linalg/norm_frobenius.hpp | 73 +
.../src/main/cpp/viennacl/linalg/norm_inf.hpp | 108 +
.../viennacl/linalg/opencl/amg_operations.hpp | 458 ++
.../linalg/opencl/bisect_kernel_calls.hpp | 177 +
.../main/cpp/viennacl/linalg/opencl/common.hpp | 102 +
.../cpp/viennacl/linalg/opencl/direct_solve.hpp | 153 +
.../viennacl/linalg/opencl/fft_operations.hpp | 350 ++
.../viennacl/linalg/opencl/ilu_operations.hpp | 260 ++
.../linalg/opencl/iterative_operations.hpp | 945 +++++
.../cpp/viennacl/linalg/opencl/kernels/amg.hpp | 393 ++
.../viennacl/linalg/opencl/kernels/bisect.hpp | 2645 ++++++++++++
.../kernels/compressed_compressed_matrix.hpp | 110 +
.../linalg/opencl/kernels/compressed_matrix.hpp | 1703 ++++++++
.../linalg/opencl/kernels/coordinate_matrix.hpp | 405 ++
.../linalg/opencl/kernels/ell_matrix.hpp | 221 +
.../cpp/viennacl/linalg/opencl/kernels/fft.hpp | 311 ++
.../linalg/opencl/kernels/hyb_matrix.hpp | 240 ++
.../cpp/viennacl/linalg/opencl/kernels/ilu.hpp | 505 +++
.../linalg/opencl/kernels/iterative.hpp | 1619 +++++++
.../viennacl/linalg/opencl/kernels/matrix.hpp | 1193 ++++++
.../linalg/opencl/kernels/matrix_element.hpp | 138 +
.../linalg/opencl/kernels/matrix_solve.hpp | 180 +
.../cpp/viennacl/linalg/opencl/kernels/nmf.hpp | 99 +
.../viennacl/linalg/opencl/kernels/scalar.hpp | 283 ++
.../cpp/viennacl/linalg/opencl/kernels/scan.hpp | 194 +
.../linalg/opencl/kernels/sliced_ell_matrix.hpp | 135 +
.../cpp/viennacl/linalg/opencl/kernels/spai.hpp | 631 +++
.../cpp/viennacl/linalg/opencl/kernels/svd.hpp | 703 ++++
.../viennacl/linalg/opencl/kernels/vector.hpp | 867 ++++
.../linalg/opencl/kernels/vector_element.hpp | 163 +
.../linalg/opencl/matrix_operations.hpp | 1019 +++++
.../viennacl/linalg/opencl/misc_operations.hpp | 69 +
.../viennacl/linalg/opencl/nmf_operations.hpp | 139 +
.../linalg/opencl/scalar_operations.hpp | 205 +
.../linalg/opencl/sparse_matrix_operations.hpp | 1244 ++++++
.../opencl/vandermonde_matrix_operations.hpp | 68 +
.../linalg/opencl/vector_operations.hpp | 1263 ++++++
.../src/main/cpp/viennacl/linalg/power_iter.hpp | 129 +
.../src/main/cpp/viennacl/linalg/prod.hpp | 370 ++
.../cpp/viennacl/linalg/qr-method-common.hpp | 188 +
.../src/main/cpp/viennacl/linalg/qr-method.hpp | 833 ++++
.../src/main/cpp/viennacl/linalg/qr.hpp | 669 +++
.../main/cpp/viennacl/linalg/row_scaling.hpp | 204 +
.../cpp/viennacl/linalg/scalar_operations.hpp | 242 ++
.../src/main/cpp/viennacl/linalg/spai.hpp | 292 ++
.../linalg/sparse_matrix_operations.hpp | 421 ++
.../src/main/cpp/viennacl/linalg/sum.hpp | 136 +
.../src/main/cpp/viennacl/linalg/svd.hpp | 533 +++
.../linalg/toeplitz_matrix_operations.hpp | 78 +
.../src/main/cpp/viennacl/linalg/tql2.hpp | 262 ++
.../linalg/vandermonde_matrix_operations.hpp | 71 +
.../cpp/viennacl/linalg/vector_operations.hpp | 1304 ++++++
.../src/main/cpp/viennacl/matrix.hpp | 3968 ++++++++++++++++++
.../src/main/cpp/viennacl/matrix_proxy.hpp | 595 +++
.../src/main/cpp/viennacl/meta/enable_if.hpp | 43 +
.../src/main/cpp/viennacl/meta/predicate.hpp | 534 +++
.../src/main/cpp/viennacl/meta/result_of.hpp | 631 +++
.../src/main/cpp/viennacl/meta/tag_of.hpp | 390 ++
.../cpp/viennacl/misc/bandwidth_reduction.hpp | 38 +
.../main/cpp/viennacl/misc/cuthill_mckee.hpp | 611 +++
.../viennacl/misc/gibbs_poole_stockmeyer.hpp | 426 ++
.../src/main/cpp/viennacl/ocl/backend.hpp | 358 ++
.../src/main/cpp/viennacl/ocl/command_queue.hpp | 92 +
.../src/main/cpp/viennacl/ocl/context.hpp | 820 ++++
.../src/main/cpp/viennacl/ocl/device.hpp | 1496 +++++++
.../src/main/cpp/viennacl/ocl/device_utils.hpp | 191 +
.../src/main/cpp/viennacl/ocl/enqueue.hpp | 148 +
.../src/main/cpp/viennacl/ocl/error.hpp | 687 +++
.../src/main/cpp/viennacl/ocl/forwards.h | 63 +
.../src/main/cpp/viennacl/ocl/handle.hpp | 228 +
.../src/main/cpp/viennacl/ocl/kernel.hpp | 805 ++++
.../src/main/cpp/viennacl/ocl/local_mem.hpp | 51 +
.../src/main/cpp/viennacl/ocl/platform.hpp | 145 +
.../src/main/cpp/viennacl/ocl/program.hpp | 86 +
.../src/main/cpp/viennacl/ocl/utils.hpp | 85 +
native-viennaCL/src/main/cpp/viennacl/range.hpp | 76 +
.../src/main/cpp/viennacl/scalar.hpp | 874 ++++
.../src/main/cpp/viennacl/scheduler/execute.hpp | 290 ++
.../cpp/viennacl/scheduler/execute_axbx.hpp | 381 ++
.../viennacl/scheduler/execute_elementwise.hpp | 410 ++
.../scheduler/execute_generic_dispatcher.hpp | 134 +
.../scheduler/execute_matrix_dispatcher.hpp | 185 +
.../viennacl/scheduler/execute_matrix_prod.hpp | 406 ++
.../scheduler/execute_scalar_assign.hpp | 192 +
.../scheduler/execute_scalar_dispatcher.hpp | 131 +
.../cpp/viennacl/scheduler/execute_util.hpp | 324 ++
.../scheduler/execute_vector_dispatcher.hpp | 200 +
.../src/main/cpp/viennacl/scheduler/forwards.h | 830 ++++
.../src/main/cpp/viennacl/scheduler/io.hpp | 268 ++
.../src/main/cpp/viennacl/scheduler/preset.hpp | 105 +
native-viennaCL/src/main/cpp/viennacl/slice.hpp | 77 +
.../src/main/cpp/viennacl/sliced_ell_matrix.hpp | 383 ++
.../src/main/cpp/viennacl/toeplitz_matrix.hpp | 384 ++
.../src/main/cpp/viennacl/tools/adapter.hpp | 421 ++
.../src/main/cpp/viennacl/tools/entry_proxy.hpp | 234 ++
.../cpp/viennacl/tools/matrix_generation.hpp | 146 +
.../cpp/viennacl/tools/matrix_size_deducer.hpp | 204 +
.../src/main/cpp/viennacl/tools/random.hpp | 84 +
.../src/main/cpp/viennacl/tools/sha1.hpp | 237 ++
.../src/main/cpp/viennacl/tools/shared_ptr.hpp | 179 +
.../src/main/cpp/viennacl/tools/timer.hpp | 123 +
.../src/main/cpp/viennacl/tools/tools.hpp | 315 ++
.../src/main/cpp/viennacl/traits/clear.hpp | 62 +
.../src/main/cpp/viennacl/traits/context.hpp | 65 +
.../src/main/cpp/viennacl/traits/fill.hpp | 68 +
.../src/main/cpp/viennacl/traits/handle.hpp | 270 ++
.../src/main/cpp/viennacl/traits/row_major.hpp | 50 +
.../src/main/cpp/viennacl/traits/size.hpp | 442 ++
.../src/main/cpp/viennacl/traits/start.hpp | 102 +
.../src/main/cpp/viennacl/traits/stride.hpp | 72 +
.../main/cpp/viennacl/vandermonde_matrix.hpp | 342 ++
.../src/main/cpp/viennacl/vector.hpp | 3235 ++++++++++++++
.../src/main/cpp/viennacl/vector_proxy.hpp | 340 ++
.../src/main/cpp/viennacl/version.hpp | 31 +
.../apache/mahout/javaCppTest/HelloNative.scala | 28 +
.../mahout/javaCppTest/ViennaMatrix.scala | 34 +
.../apache/mahout/javaCppTest/ViennaSvd.scala | 8 +
.../mahout/javaCppTest/ViennaVector.scala | 8 +
.../javaCppTest/HelloNativeTestSuite.scala | 15 +
pom.xml | 1 +
runtests.sh | 1 +
292 files changed, 120789 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/LICENSE.txt
----------------------------------------------------------------------
diff --git a/LICENSE.txt b/LICENSE.txt
index 8ce7fff..dcc3c4e 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -799,3 +799,32 @@ The following license applies to the H2O package
identification within third-party archives.
Copyright 2012 0xdata, Inc
+
+================================================================
+The following applies to the ViennaCL library and files in the native-ViennaCL module
+================================================================
+
+ Copyright (c) 2010-2016 Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing, TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+ Argonne National Laboratory, with facilities in the state of Illinois,
+ is owned by The United States Government, and operated by UChicago Argonne, LLC
+ under provision of a contract with the Department of Energy.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/math-scala/pom.xml
----------------------------------------------------------------------
diff --git a/math-scala/pom.xml b/math-scala/pom.xml
index 280a7a1..de96b03 100644
--- a/math-scala/pom.xml
+++ b/math-scala/pom.xml
@@ -122,6 +122,11 @@
<artifactId>mahout-math</artifactId>
</dependency>
+ <!--<dependency>-->
+ <!--<groupId>org.apache.mahout</groupId>-->
+ <!--<artifactId>mahout-native-viennacl_2.10</artifactId>-->
+ <!--</dependency>-->
+
<dependency>
<groupId>com.esotericsoftware.kryo</groupId>
<artifactId>kryo</artifactId>
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/viennacl/ViennaCLTests.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/viennacl/ViennaCLTests.scala b/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/viennacl/ViennaCLTests.scala
new file mode 100644
index 0000000..aa7466e
--- /dev/null
+++ b/math-scala/src/test/scala/org/apache/mahout/math/scalabindings/viennacl/ViennaCLTests.scala
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.math.scalabindings.viennacl
+
+
+import org.apache.mahout.logging._
+import org.apache.mahout.math._
+import org.apache.mahout.math.scalabindings.RLikeOps._
+import org.apache.mahout.test.MahoutSuite
+import org.scalatest.FunSuite
+//import org.apache.mahout.javaCppTest._
+
+import scala.math._
+
+class ViennaCLTests extends FunSuite with MahoutSuite {
+
+ private final implicit val log = getLog(classOf[ViennaCLTests])
+
+// test("HelloNative"){
+// val nTest = new HelloNative
+//
+// nTest.set_property("Hello Native")
+// assert(nTest.get_property() == "Hello Native")
+// }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/pom.xml
----------------------------------------------------------------------
diff --git a/native-viennaCL/pom.xml b/native-viennaCL/pom.xml
new file mode 100644
index 0000000..a144adc
--- /dev/null
+++ b/native-viennaCL/pom.xml
@@ -0,0 +1,249 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.mahout</groupId>
+ <artifactId>mahout</artifactId>
+ <version>0.12.2-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+
+ <!--<artifactId>mahout-native-viennacl_${scala.compat.version}</artifactId>-->
+ <artifactId>mahout-native-viennacl_2.10</artifactId>
+
+ <name>Mahout Native ViennaCL Bindings</name>
+ <description>Native Structures and interfaces to be used from Mahout math-scala.
+ </description>
+
+ <packaging>jar</packaging>
+
+ <build>
+ <plugins>
+ <!-- create test jar so other modules can reuse the native test utility classes. -->
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <executions>
+ <execution>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ <phase>package</phase>
+ </execution>
+ </executions>
+ </plugin>
+
+ <plugin>
+ <artifactId>maven-javadoc-plugin</artifactId>
+ </plugin>
+
+ <plugin>
+ <artifactId>maven-source-plugin</artifactId>
+ </plugin>
+
+ <plugin>
+ <groupId>net.alchim31.maven</groupId>
+ <artifactId>scala-maven-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>add-scala-sources</id>
+ <phase>initialize</phase>
+ <goals>
+ <goal>add-source</goal>
+ </goals>
+ </execution>
+ <execution>
+ <id>scala-compile</id>
+ <phase>process-resources</phase>
+ <goals>
+ <goal>compile</goal>
+ </goals>
+ </execution>
+ <execution>
+ <id>scala-test-compile</id>
+ <phase>process-test-resources</phase>
+ <goals>
+ <goal>testCompile</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+
+ <!-- This is what ScalaTest recommends for enabling Scala tests -->
+
+ <!-- disable surefire -->
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <configuration>
+ <skipTests>true</skipTests>
+ </configuration>
+ </plugin>
+ <!-- enable scalatest -->
+ <plugin>
+ <groupId>org.scalatest</groupId>
+ <artifactId>scalatest-maven-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>test</id>
+ <goals>
+ <goal>test</goal>
+ </goals>
+ </execution>
+
+ </executions>
+ </plugin>
+
+
+
+ <!--JavaCPP native build plugin-->
+ <!-- old-style way to get it to compile. -->
+ <!--based on https://github.com/bytedeco/javacpp/wiki/Maven-->
+ <plugin>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>exec-maven-plugin</artifactId>
+ <version>1.2.1</version>
+ <executions>
+ <execution>
+ <id>javacpp</id>
+ <phase>process-classes</phase>
+ <goals>
+ <goal>exec</goal>
+ </goals>
+ <configuration>
+ <executable>java</executable>
+ <arguments>
+ <argument>-jar</argument>
+ <argument>${org.bytedeco:javacpp:jar}</argument>
+ <argument>-classpath</argument>
+ <argument>${project.build.outputDirectory}</argument>
+ <argument>-Xcompiler</argument>
+ <argument>-I${basedir}/src/main/cpp</argument>
+ <!--<argument>-I${basedir}/src/main/cpp/viennacl</argument>-->
+ <!--<argument>-I${basedir}/src/main/cpp/viennacl/linalg</argument>-->
+ <!--<argument>-I${basedir}/src/main/cpp/libviennacl/include</argument>-->
+ <!--<argument>-I${basedir}/src/main/cpp/libviennacl/include/src</argument>-->
+ </arguments>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ <version>2.3</version>
+ <executions>
+ <execution>
+ <goals>
+ <goal>properties</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>exec-maven-plugin</artifactId>
+ <version>1.2.1</version>
+ </plugin>
+
+ </plugins>
+
+ </build>
+
+ <dependencies>
+
+ <!-- 3rd-party -->
+ <dependency>
+ <groupId>log4j</groupId>
+ <artifactId>log4j</artifactId>
+ </dependency>
+
+ <!-- scala stuff -->
+ <dependency>
+ <groupId>org.scalatest</groupId>
+ <artifactId>scalatest_${scala.compat.version}</artifactId>
+ </dependency>
+
+
+ <dependency>
+ <groupId>org.bytedeco</groupId>
+ <artifactId>javacpp</artifactId>
+ <version>1.2.1</version>
+ </dependency>
+
+ </dependencies>
+
+
+ <profiles>
+ <profile>
+ <id>mahout-release</id>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>net.alchim31.maven</groupId>
+ <artifactId>scala-maven-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>generate-scaladoc</id>
+ <goals>
+ <goal>doc</goal>
+ </goals>
+ </execution>
+ <execution>
+ <id>attach-scaladoc-jar</id>
+ <goals>
+ <goal>doc-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+ </profile>
+ <profile>
+ <id>travis</id>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <configuration>
+ <!-- Limit memory for unit tests in Travis -->
+ <argLine>-Xmx3g</argLine>
+ <!--<argLine>-Djava.library.path=${project.build.directory}/libs/natives/linux-x86_64:${project.build.directory}/libs/natives/linux:${project.build.directory}/libs/natives/maxosx</argLine>-->
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ <configuration>
+ <!-- Limit memory for integration tests in Travis -->
+ <argLine>-Xmx3g</argLine>
+ <!--<argLine>-Djava.library.path=${project.build.directory}/libs/natives/linux-x86_64:${project.build.directory}/libs/natives/linux:${project.build.directory}/libs/natives/maxosx</argLine>-->
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+ </profile>
+ </profiles>
+</project>
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/HelloNative.h
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/HelloNative.h b/native-viennaCL/src/main/cpp/HelloNative.h
new file mode 100644
index 0000000..d101f94
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/HelloNative.h
@@ -0,0 +1,10 @@
+#include <string>
+
+namespace HelloNative {
+ class HelloNative {
+ public:
+ const std::string& get_property() { return property; }
+ void set_property(const std::string& property) { this->property = property; }
+ std::string property;
+ };
+}
\ No newline at end of file
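HelloNative above is only a smoke-test class for the JavaCPP bridge. As a minimal sketch of the intended round trip (mirroring the commented-out Scala test in ViennaCLTests.scala earlier in this commit), a throwaway C++ harness could exercise it like this; the main() below is illustrative only and is not part of the commit:

  #include <cassert>
  #include "HelloNative.h"  // header added above

  int main()
  {
    // Set a property and read it back, as the disabled Scala test does.
    HelloNative::HelloNative nTest;
    nTest.set_property("Hello Native");
    assert(nTest.get_property() == "Hello Native");
    return 0;
  }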
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/CMakeLists.txt b/native-viennaCL/src/main/cpp/libviennacl/CMakeLists.txt
new file mode 100644
index 0000000..d2f83e5
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/CMakeLists.txt
@@ -0,0 +1,40 @@
+
+include_directories(${PROJECT_SOURCE_DIR}/libviennacl/include/)
+
+if(ENABLE_CUDA)
+
+ if(ENABLE_OPENCL)
+ set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-DVIENNACL_WITH_OPENCL") #set flags before setting executable!
+ cuda_add_library(viennacl SHARED src/backend.cu
+ src/blas1.cu src/blas1_host.cu src/blas1_cuda.cu src/blas1_opencl.cu
+ src/blas2.cu src/blas2_host.cu src/blas2_cuda.cu src/blas2_opencl.cu
+ src/blas3.cu src/blas3_host.cu src/blas3_cuda.cu src/blas3_opencl.cu)
+ set_target_properties(viennacl PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL -DVIENNACL_WITH_CUDA")
+ target_link_libraries(viennacl ${OPENCL_LIBRARIES})
+ else(ENABLE_OPENCL)
+ cuda_add_library(viennacl SHARED src/backend.cu
+ src/blas1.cu src/blas1_host.cu src/blas1_cuda.cu
+ src/blas2.cu src/blas2_host.cu src/blas2_cuda.cu
+ src/blas3.cu src/blas3_host.cu src/blas3_cuda.cu)
+ set_target_properties(viennacl PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_CUDA")
+ endif(ENABLE_OPENCL)
+else(ENABLE_CUDA)
+ if(ENABLE_OPENCL)
+ add_library(viennacl SHARED src/backend.cpp
+ src/blas1.cpp src/blas1_host.cpp src/blas1_opencl.cpp
+ src/blas2.cpp src/blas2_host.cpp src/blas2_opencl.cpp
+ src/blas3.cpp src/blas3_host.cpp src/blas3_opencl.cpp)
+ set_target_properties(viennacl PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL")
+ target_link_libraries(viennacl ${OPENCL_LIBRARIES})
+ else(ENABLE_OPENCL)
+ add_library(viennacl SHARED src/backend.cpp
+ src/blas1.cpp src/blas1_host.cpp
+ src/blas2.cpp src/blas2_host.cpp
+ src/blas3.cpp src/blas3_host.cpp)
+ endif(ENABLE_OPENCL)
+endif(ENABLE_CUDA)
+
+# Special linkage for OpenMP under MinGW:
+if(ENABLE_OPENMP AND MINGW)
+ target_link_libraries(viennacl gomp)
+endif(ENABLE_OPENMP AND MINGW)
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/include/viennacl.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/include/viennacl.hpp b/native-viennaCL/src/main/cpp/libviennacl/include/viennacl.hpp
new file mode 100644
index 0000000..0b7b97a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/include/viennacl.hpp
@@ -0,0 +1,617 @@
+#ifndef VIENNACL_VIENNACL_HPP
+#define VIENNACL_VIENNACL_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2014, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include <stdlib.h>
+
+#ifdef VIENNACL_WITH_OPENCL
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+#endif
+
+
+// Extra export declarations when building with Visual Studio:
+#if defined(_MSC_VER)
+ #if defined(viennacl_EXPORTS)
+ #define VIENNACL_EXPORTED_FUNCTION __declspec(dllexport)
+ #else
+ #define VIENNACL_EXPORTED_FUNCTION __declspec(dllimport)
+ #endif /* viennacl_EXPORTS */
+#else /* defined (_MSC_VER) */
+ #define VIENNACL_EXPORTED_FUNCTION
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int ViennaCLInt;
+
+
+/************** Enums ***************/
+
+typedef enum
+{
+ ViennaCLInvalidBackend, // for catching uninitialized and invalid values
+ ViennaCLCUDA,
+ ViennaCLOpenCL,
+ ViennaCLHost
+} ViennaCLBackendTypes;
+
+typedef enum
+{
+ ViennaCLInvalidOrder, // for catching uninitialized and invalid values
+ ViennaCLRowMajor,
+ ViennaCLColumnMajor
+} ViennaCLOrder;
+
+typedef enum
+{
+ ViennaCLInvalidTranspose, // for catching uninitialized and invalid values
+ ViennaCLNoTrans,
+ ViennaCLTrans
+} ViennaCLTranspose;
+
+typedef enum
+{
+ ViennaCLInvalidUplo, // for catching uninitialized and invalid values
+ ViennaCLUpper,
+ ViennaCLLower
+} ViennaCLUplo;
+
+typedef enum
+{
+ ViennaCLInvalidDiag, // for catching uninitialized and invalid values
+ ViennaCLUnit,
+ ViennaCLNonUnit
+} ViennaCLDiag;
+
+typedef enum
+{
+ ViennaCLInvalidPrecision, // for catching uninitialized and invalid values
+ ViennaCLFloat,
+ ViennaCLDouble
+} ViennaCLPrecision;
+
+// Error codes:
+typedef enum
+{
+ ViennaCLSuccess = 0,
+ ViennaCLGenericFailure
+} ViennaCLStatus;
+
+
+/************* Backend Management ******************/
+
+/** @brief Generic backend for CUDA, OpenCL, host-based stuff */
+struct ViennaCLBackend_impl;
+typedef ViennaCLBackend_impl* ViennaCLBackend;
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLBackendCreate(ViennaCLBackend * backend);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLBackendSetOpenCLContextID(ViennaCLBackend backend, ViennaCLInt context_id);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLBackendDestroy(ViennaCLBackend * backend);
+
+/******** User Types **********/
+
+struct ViennaCLHostScalar_impl;
+typedef ViennaCLHostScalar_impl* ViennaCLHostScalar;
+
+struct ViennaCLScalar_impl;
+typedef ViennaCLScalar_impl* ViennaCLScalar;
+
+struct ViennaCLVector_impl;
+typedef ViennaCLVector_impl* ViennaCLVector;
+
+struct ViennaCLMatrix_impl;
+typedef ViennaCLMatrix_impl* ViennaCLMatrix;
+
+
+/******************** BLAS Level 1 ***********************/
+
+// IxASUM
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLiamax(ViennaCLInt *alpha, ViennaCLVector x);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDAiSamax(ViennaCLBackend backend, ViennaCLInt n,
+ ViennaCLInt *alpha,
+ float *x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDAiDamax(ViennaCLBackend backend, ViennaCLInt n,
+ ViennaCLInt *alpha,
+ double *x, ViennaCLInt offx, ViennaCLInt incx);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLiSamax(ViennaCLBackend backend, ViennaCLInt n,
+ ViennaCLInt *alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLiDamax(ViennaCLBackend backend, ViennaCLInt n,
+ ViennaCLInt *alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostiSamax(ViennaCLBackend backend, ViennaCLInt n,
+ ViennaCLInt *alpha,
+ float *x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostiDamax(ViennaCLBackend backend, ViennaCLInt n,
+ ViennaCLInt *alpha,
+ double *x, ViennaCLInt offx, ViennaCLInt incx);
+
+
+// xASUM
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLasum(ViennaCLHostScalar *alpha, ViennaCLVector x);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASasum(ViennaCLBackend backend, ViennaCLInt n,
+ float *alpha,
+ float *x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADasum(ViennaCLBackend backend, ViennaCLInt n,
+ double *alpha,
+ double *x, ViennaCLInt offx, ViennaCLInt incx);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSasum(ViennaCLBackend backend, ViennaCLInt n,
+ float *alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDasum(ViennaCLBackend backend, ViennaCLInt n,
+ double *alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSasum(ViennaCLBackend backend, ViennaCLInt n,
+ float *alpha,
+ float *x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDasum(ViennaCLBackend backend, ViennaCLInt n,
+ double *alpha,
+ double *x, ViennaCLInt offx, ViennaCLInt incx);
+
+
+
+// xAXPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLaxpy(ViennaCLHostScalar alpha, ViennaCLVector x, ViennaCLVector y);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASaxpy(ViennaCLBackend backend, ViennaCLInt n,
+ float alpha,
+ float *x, ViennaCLInt offx, ViennaCLInt incx,
+ float *y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADaxpy(ViennaCLBackend backend, ViennaCLInt n,
+ double alpha,
+ double *x, ViennaCLInt offx, ViennaCLInt incx,
+ double *y, ViennaCLInt offy, ViennaCLInt incy);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSaxpy(ViennaCLBackend backend, ViennaCLInt n,
+ float alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDaxpy(ViennaCLBackend backend, ViennaCLInt n,
+ double alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSaxpy(ViennaCLBackend backend, ViennaCLInt n,
+ float alpha,
+ float *x, ViennaCLInt offx, ViennaCLInt incx,
+ float *y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDaxpy(ViennaCLBackend backend, ViennaCLInt n,
+ double alpha,
+ double *x, ViennaCLInt offx, ViennaCLInt incx,
+ double *y, ViennaCLInt offy, ViennaCLInt incy);
+
+
+// xCOPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLcopy(ViennaCLVector x, ViennaCLVector y);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDAScopy(ViennaCLBackend backend, ViennaCLInt n,
+ float *x, ViennaCLInt offx, ViennaCLInt incx,
+ float *y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADcopy(ViennaCLBackend backend, ViennaCLInt n,
+ double *x, ViennaCLInt offx, ViennaCLInt incx,
+ double *y, ViennaCLInt offy, ViennaCLInt incy);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLScopy(ViennaCLBackend backend, ViennaCLInt n,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDcopy(ViennaCLBackend backend, ViennaCLInt n,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostScopy(ViennaCLBackend backend, ViennaCLInt n,
+ float *x, ViennaCLInt offx, ViennaCLInt incx,
+ float *y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDcopy(ViennaCLBackend backend, ViennaCLInt n,
+ double *x, ViennaCLInt offx, ViennaCLInt incx,
+ double *y, ViennaCLInt offy, ViennaCLInt incy);
+
+// xDOT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLdot(ViennaCLHostScalar *alpha, ViennaCLVector x, ViennaCLVector y);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASdot(ViennaCLBackend backend, ViennaCLInt n,
+ float *alpha,
+ float *x, ViennaCLInt offx, ViennaCLInt incx,
+ float *y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADdot(ViennaCLBackend backend, ViennaCLInt n,
+ double *alpha,
+ double *x, ViennaCLInt offx, ViennaCLInt incx,
+ double *y, ViennaCLInt offy, ViennaCLInt incy);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSdot(ViennaCLBackend backend, ViennaCLInt n,
+ float *alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDdot(ViennaCLBackend backend, ViennaCLInt n,
+ double *alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSdot(ViennaCLBackend backend, ViennaCLInt n,
+ float *alpha,
+ float *x, ViennaCLInt offx, ViennaCLInt incx,
+ float *y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDdot(ViennaCLBackend backend, ViennaCLInt n,
+ double *alpha,
+ double *x, ViennaCLInt offx, ViennaCLInt incx,
+ double *y, ViennaCLInt offy, ViennaCLInt incy);
+
+// xNRM2
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLnrm2(ViennaCLHostScalar *alpha, ViennaCLVector x);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASnrm2(ViennaCLBackend backend, ViennaCLInt n,
+ float *alpha,
+ float *x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADnrm2(ViennaCLBackend backend, ViennaCLInt n,
+ double *alpha,
+ double *x, ViennaCLInt offx, ViennaCLInt incx);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSnrm2(ViennaCLBackend backend, ViennaCLInt n,
+ float *alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDnrm2(ViennaCLBackend backend, ViennaCLInt n,
+ double *alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSnrm2(ViennaCLBackend backend, ViennaCLInt n,
+ float *alpha,
+ float *x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDnrm2(ViennaCLBackend backend, ViennaCLInt n,
+ double *alpha,
+ double *x, ViennaCLInt offx, ViennaCLInt incx);
+
+
+// xROT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLrot(ViennaCLVector x, ViennaCLVector y,
+ ViennaCLHostScalar c, ViennaCLHostScalar s);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASrot(ViennaCLBackend backend, ViennaCLInt n,
+ float *x, ViennaCLInt offx, ViennaCLInt incx,
+ float *y, ViennaCLInt offy, ViennaCLInt incy,
+ float c, float s);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADrot(ViennaCLBackend backend, ViennaCLInt n,
+ double *x, ViennaCLInt offx, ViennaCLInt incx,
+ double *y, ViennaCLInt offy, ViennaCLInt incy,
+ double c, double s);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSrot(ViennaCLBackend backend, ViennaCLInt n,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+ float c, float s);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDrot(ViennaCLBackend backend, ViennaCLInt n,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+ double c, double s);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSrot(ViennaCLBackend backend, ViennaCLInt n,
+ float *x, ViennaCLInt offx, ViennaCLInt incx,
+ float *y, ViennaCLInt offy, ViennaCLInt incy,
+ float c, float s);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDrot(ViennaCLBackend backend, ViennaCLInt n,
+ double *x, ViennaCLInt offx, ViennaCLInt incx,
+ double *y, ViennaCLInt offy, ViennaCLInt incy,
+ double c, double s);
+
+
+
+// xSCAL
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLscal(ViennaCLHostScalar alpha, ViennaCLVector x);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASscal(ViennaCLBackend backend, ViennaCLInt n,
+ float alpha,
+ float *x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADscal(ViennaCLBackend backend, ViennaCLInt n,
+ double alpha,
+ double *x, ViennaCLInt offx, ViennaCLInt incx);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSscal(ViennaCLBackend backend, ViennaCLInt n,
+ float alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDscal(ViennaCLBackend backend, ViennaCLInt n,
+ double alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSscal(ViennaCLBackend backend, ViennaCLInt n,
+ float alpha,
+ float *x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDscal(ViennaCLBackend backend, ViennaCLInt n,
+ double alpha,
+ double *x, ViennaCLInt offx, ViennaCLInt incx);
+
+
+// xSWAP
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLswap(ViennaCLVector x, ViennaCLVector y);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASswap(ViennaCLBackend backend, ViennaCLInt n,
+ float *x, ViennaCLInt offx, ViennaCLInt incx,
+ float *y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADswap(ViennaCLBackend backend, ViennaCLInt n,
+ double *x, ViennaCLInt offx, ViennaCLInt incx,
+ double *y, ViennaCLInt offy, ViennaCLInt incy);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSswap(ViennaCLBackend backend, ViennaCLInt n,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDswap(ViennaCLBackend backend, ViennaCLInt n,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSswap(ViennaCLBackend backend, ViennaCLInt n,
+ float *x, ViennaCLInt offx, ViennaCLInt incx,
+ float *y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDswap(ViennaCLBackend backend, ViennaCLInt n,
+ double *x, ViennaCLInt offx, ViennaCLInt incx,
+ double *y, ViennaCLInt offy, ViennaCLInt incy);
+
+
+
+/******************** BLAS Level 2 ***********************/
+
+// xGEMV: y <- alpha * Ax + beta * y
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLgemv(ViennaCLHostScalar alpha, ViennaCLMatrix A, ViennaCLVector x, ViennaCLHostScalar beta, ViennaCLVector y);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASgemv(ViennaCLBackend backend,
+ ViennaCLOrder order, ViennaCLTranspose transA,
+ ViennaCLInt m, ViennaCLInt n, float alpha, float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ float *x, ViennaCLInt offx, ViennaCLInt incx,
+ float beta,
+ float *y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADgemv(ViennaCLBackend backend,
+ ViennaCLOrder order, ViennaCLTranspose transA,
+ ViennaCLInt m, ViennaCLInt n, double alpha, double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ double *x, ViennaCLInt offx, ViennaCLInt incx,
+ double beta,
+ double *y, ViennaCLInt offy, ViennaCLInt incy);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSgemv(ViennaCLBackend backend,
+ ViennaCLOrder order, ViennaCLTranspose transA,
+ ViennaCLInt m, ViennaCLInt n, float alpha, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ float beta,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDgemv(ViennaCLBackend backend,
+ ViennaCLOrder order, ViennaCLTranspose transA,
+ ViennaCLInt m, ViennaCLInt n, double alpha, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ double beta,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSgemv(ViennaCLBackend backend,
+ ViennaCLOrder order, ViennaCLTranspose transA,
+ ViennaCLInt m, ViennaCLInt n, float alpha, float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ float *x, ViennaCLInt offx, ViennaCLInt incx,
+ float beta,
+ float *y, ViennaCLInt offy, ViennaCLInt incy);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDgemv(ViennaCLBackend backend,
+ ViennaCLOrder order, ViennaCLTranspose transA,
+ ViennaCLInt m, ViennaCLInt n, double alpha, double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ double *x, ViennaCLInt offx, ViennaCLInt incx,
+ double beta,
+ double *y, ViennaCLInt offy, ViennaCLInt incy);
+
+// xTRSV: Ax <- x
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLtrsv(ViennaCLMatrix A, ViennaCLVector x, ViennaCLUplo uplo);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDAStrsv(ViennaCLBackend backend,
+ ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+ ViennaCLInt n, float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ float *x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADtrsv(ViennaCLBackend backend,
+ ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+ ViennaCLInt n, double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ double *x, ViennaCLInt offx, ViennaCLInt incx);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLStrsv(ViennaCLBackend backend,
+ ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+ ViennaCLInt n, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDtrsv(ViennaCLBackend backend,
+ ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+ ViennaCLInt n, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostStrsv(ViennaCLBackend backend,
+ ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+ ViennaCLInt n, float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ float *x, ViennaCLInt offx, ViennaCLInt incx);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDtrsv(ViennaCLBackend backend,
+ ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+ ViennaCLInt n, double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ double *x, ViennaCLInt offx, ViennaCLInt incx);
+
+
+// xGER: A <- alpha * x * y + A
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLger(ViennaCLHostScalar alpha, ViennaCLVector x, ViennaCLVector y, ViennaCLMatrix A);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASger(ViennaCLBackend backend,
+ ViennaCLOrder order,
+ ViennaCLInt m, ViennaCLInt n,
+ float alpha,
+ float *x, ViennaCLInt offx, ViennaCLInt incx,
+ float *y, ViennaCLInt offy, ViennaCLInt incy,
+ float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADger(ViennaCLBackend backend,
+ ViennaCLOrder order,
+ ViennaCLInt m, ViennaCLInt n,
+ double alpha,
+ double *x, ViennaCLInt offx, ViennaCLInt incx,
+ double *y, ViennaCLInt offy, ViennaCLInt incy,
+ double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSger(ViennaCLBackend backend,
+ ViennaCLOrder order,
+ ViennaCLInt m, ViennaCLInt n,
+ float alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+ cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDger(ViennaCLBackend backend,
+ ViennaCLOrder order,
+ ViennaCLInt m, ViennaCLInt n,
+ double alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+ cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSger(ViennaCLBackend backend,
+ ViennaCLOrder order,
+ ViennaCLInt m, ViennaCLInt n,
+ float alpha,
+ float *x, ViennaCLInt offx, ViennaCLInt incx,
+ float *y, ViennaCLInt offy, ViennaCLInt incy,
+ float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDger(ViennaCLBackend backend,
+ ViennaCLOrder order,
+ ViennaCLInt m, ViennaCLInt n,
+ double alpha,
+ double *x, ViennaCLInt offx, ViennaCLInt incx,
+ double *y, ViennaCLInt offy, ViennaCLInt incy,
+ double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda);
+
+
+
+/******************** BLAS Level 3 ***********************/
+
+// xGEMM: C <- alpha * AB + beta * C
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLgemm(ViennaCLHostScalar alpha, ViennaCLMatrix A, ViennaCLMatrix B, ViennaCLHostScalar beta, ViennaCLMatrix C);
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASgemm(ViennaCLBackend backend,
+ ViennaCLOrder orderA, ViennaCLTranspose transA,
+ ViennaCLOrder orderB, ViennaCLTranspose transB,
+ ViennaCLOrder orderC,
+ ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+ float alpha,
+ float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ float *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+ float beta,
+ float *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADgemm(ViennaCLBackend backend,
+ ViennaCLOrder orderA, ViennaCLTranspose transA,
+ ViennaCLOrder orderB, ViennaCLTranspose transB,
+ ViennaCLOrder orderC,
+ ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+ double alpha,
+ double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ double *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+ double beta,
+ double *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc);
+
+#ifdef VIENNACL_WITH_OPENCL
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSgemm(ViennaCLBackend backend,
+ ViennaCLOrder orderA, ViennaCLTranspose transA,
+ ViennaCLOrder orderB, ViennaCLTranspose transB,
+ ViennaCLOrder orderC,
+ ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+ float alpha,
+ cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ cl_mem B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+ float beta,
+ cl_mem C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDgemm(ViennaCLBackend backend,
+ ViennaCLOrder orderA, ViennaCLTranspose transA,
+ ViennaCLOrder orderB, ViennaCLTranspose transB,
+ ViennaCLOrder orderC,
+ ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+ double alpha,
+ cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ cl_mem B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+ double beta,
+ cl_mem C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc);
+#endif
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSgemm(ViennaCLBackend backend,
+ ViennaCLOrder orderA, ViennaCLTranspose transA,
+ ViennaCLOrder orderB, ViennaCLTranspose transB,
+ ViennaCLOrder orderC,
+ ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+ float alpha,
+ float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ float *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+ float beta,
+ float *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc);
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDgemm(ViennaCLBackend backend,
+ ViennaCLOrder orderA, ViennaCLTranspose transA,
+ ViennaCLOrder orderB, ViennaCLTranspose transB,
+ ViennaCLOrder orderC,
+ ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+ double alpha,
+ double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ double *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+ double beta,
+ double *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc);
+
+// xTRSM: Triangular solves with multiple right hand sides
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLtrsm(ViennaCLMatrix A, ViennaCLUplo uplo, ViennaCLDiag diag, ViennaCLMatrix B);
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
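The header above declares a flat, BLAS-style C API with one entry point per backend (CUDA, OpenCL, host) and per precision, plus generic dispatchers that take opaque ViennaCL handles. Here is a rough sketch of the host path using only the declarations visible above (ViennaCLBackendCreate, ViennaCLHostSaxpy, ViennaCLBackendDestroy); the small driver is illustrative only and is not part of the commit:

  #include <cstdio>
  #include "viennacl.hpp"   // libviennacl C API header shown above

  int main()
  {
    ViennaCLBackend backend;
    ViennaCLBackendCreate(&backend);

    // y <- 2*x + y on plain host arrays: n = 4, zero offsets, unit strides.
    float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float y[4] = {10.0f, 10.0f, 10.0f, 10.0f};
    ViennaCLHostSaxpy(backend, 4, 2.0f, x, 0, 1, y, 0, 1);

    for (int i = 0; i < 4; ++i)
      std::printf("y[%d] = %f\n", i, y[i]);   // expected: 12, 14, 16, 18

    ViennaCLBackendDestroy(&backend);
    return 0;
  }

The same call shape repeats throughout the file: the generic entry points (e.g. ViennaCLaxpy) operate on ViennaCLVector handles, while the backend-specific variants take raw pointers (host/CUDA) or cl_mem buffers (OpenCL) together with explicit offset and stride arguments.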
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/backend.cpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/backend.cpp b/native-viennaCL/src/main/cpp/libviennacl/src/backend.cpp
new file mode 100644
index 0000000..c9f6bf4
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/backend.cpp
@@ -0,0 +1,46 @@
+/* =========================================================================
+ Copyright (c) 2010-2014, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLBackendCreate(ViennaCLBackend * backend)
+{
+ *backend = new ViennaCLBackend_impl();
+
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLBackendSetOpenCLContextID(ViennaCLBackend backend, ViennaCLInt context_id)
+{
+ backend->opencl_backend.context_id = context_id;
+
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLBackendDestroy(ViennaCLBackend * backend)
+{
+ delete *backend;
+ *backend = NULL;
+
+ return ViennaCLSuccess;
+}
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/backend.cu
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/backend.cu b/native-viennaCL/src/main/cpp/libviennacl/src/backend.cu
new file mode 100644
index 0000000..c9f6bf4
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/backend.cu
@@ -0,0 +1,46 @@
+/* =========================================================================
+ Copyright (c) 2010-2014, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLBackendCreate(ViennaCLBackend * backend)
+{
+ *backend = new ViennaCLBackend_impl();
+
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLBackendSetOpenCLContextID(ViennaCLBackend backend, ViennaCLInt context_id)
+{
+ backend->opencl_backend.context_id = context_id;
+
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLBackendDestroy(ViennaCLBackend * backend)
+{
+ delete *backend;
+ *backend = NULL;
+
+ return ViennaCLSuccess;
+}
+
[08/51] [partial] mahout git commit: (nojira) add native-viennaCL
module to codebase. closes apache/mahout#241
Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/iterative.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/iterative.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/iterative.hpp
new file mode 100644
index 0000000..d24a641
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/iterative.hpp
@@ -0,0 +1,1619 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_ITERATIVE_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_ITERATIVE_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+
+#include "viennacl/vector_proxy.hpp"
+
+#include "viennacl/scheduler/forwards.h"
+#include "viennacl/scheduler/io.hpp"
+#include "viennacl/scheduler/preset.hpp"
+
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/iterative.hpp
+ * @brief OpenCL kernel file for specialized iterative solver kernels */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+//////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+template<typename StringT>
+void generate_pipelined_cg_vector_update(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void cg_vector_update( \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
+ source.append(" "); source.append(numeric_string); source.append(" alpha, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * p, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * r, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" const * Ap, \n");
+ source.append(" "); source.append(numeric_string); source.append(" beta, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+ source.append(" unsigned int size, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array) \n");
+ source.append("{ \n");
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_contrib = 0; \n");
+ source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" value_p = p[i]; \n");
+ source.append(" "); source.append(numeric_string); source.append(" value_r = r[i]; \n");
+ source.append(" \n");
+ source.append(" result[i] += alpha * value_p; \n");
+ source.append(" value_r -= alpha * Ap[i]; \n");
+ source.append(" value_p = value_r + beta * value_p; \n");
+ source.append(" \n");
+ source.append(" p[i] = value_p; \n");
+ source.append(" r[i] = value_r; \n");
+ source.append(" inner_prod_contrib += value_r * value_r; \n");
+ source.append(" } \n");
+
+ // parallel reduction in work group
+ source.append(" shared_array[get_local_id(0)] = inner_prod_contrib; \n");
+ source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+ source.append(" { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) \n");
+ source.append(" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
+ source.append(" } ");
+
+ // write results to result array
+ source.append(" if (get_local_id(0) == 0) \n ");
+ source.append(" inner_prod_buffer[get_group_id(0)] = shared_array[0]; ");
+
+ source.append("} \n");
+}
+
+template<typename StringT>
+void generate_compressed_matrix_pipelined_cg_blocked_prod(StringT & source, std::string const & numeric_string, unsigned int subwarp_size)
+{
+ std::stringstream ss;
+ ss << subwarp_size;
+
+ source.append("__kernel void cg_csr_blocked_prod( \n");
+ source.append(" __global const unsigned int * row_indices, \n");
+ source.append(" __global const unsigned int * column_indices, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
+ source.append(" unsigned int size, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+ source.append(" unsigned int buffer_size, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
+ source.append("{ \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" shared_elements[256]; \n");
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
+
+ source.append(" const unsigned int id_in_row = get_local_id(0) % " + ss.str() + "; \n");
+ source.append(" const unsigned int block_increment = get_local_size(0) * ((size - 1) / (get_global_size(0)) + 1); \n");
+ source.append(" const unsigned int block_start = get_group_id(0) * block_increment; \n");
+ source.append(" const unsigned int block_stop = min(block_start + block_increment, size); \n");
+
+ source.append(" for (unsigned int row = block_start + get_local_id(0) / " + ss.str() + "; \n");
+ source.append(" row < block_stop; \n");
+ source.append(" row += get_local_size(0) / " + ss.str() + ") \n");
+ source.append(" { \n");
+ source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
+ source.append(" unsigned int row_end = row_indices[row+1]; \n");
+ source.append(" for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += " + ss.str() + ") \n");
+ source.append(" dot_prod += elements[i] * p[column_indices[i]]; \n");
+
+ source.append(" shared_elements[get_local_id(0)] = dot_prod; \n");
+ source.append(" #pragma unroll \n");
+ source.append(" for (unsigned int k = 1; k < " + ss.str() + "; k *= 2) \n");
+ source.append(" shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) ^ k]; \n");
+
+ source.append(" if (id_in_row == 0) { \n");
+ source.append(" Ap[row] = shared_elements[get_local_id(0)]; \n");
+ source.append(" inner_prod_ApAp += shared_elements[get_local_id(0)] * shared_elements[get_local_id(0)]; \n");
+ source.append(" inner_prod_pAp += p[row] * shared_elements[get_local_id(0)]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ ////////// parallel reduction in work group
+ source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
+ source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
+ source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+ source.append(" { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) { \n");
+ source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
+ source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
+ source.append(" } ");
+ source.append(" } ");
+
+ // write results to result array
+ source.append(" if (get_local_id(0) == 0) { \n ");
+ source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
+ source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
+ source.append(" } \n");
+
+ source.append("} \n");
+}
+
+template<typename StringT>
+void generate_compressed_matrix_pipelined_cg_prod(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void cg_csr_prod( \n");
+ source.append(" __global const unsigned int * row_indices, \n");
+ source.append(" __global const unsigned int * column_indices, \n");
+ source.append(" __global const unsigned int * row_blocks, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" unsigned int num_blocks, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
+ source.append(" unsigned int size, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+ source.append(" unsigned int buffer_size, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_elements) \n");
+ source.append("{ \n");
+
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
+
+ source.append(" for (unsigned int block_id = get_group_id(0); block_id < num_blocks; block_id += get_num_groups(0)) { \n");
+ source.append(" unsigned int row_start = row_blocks[block_id]; \n");
+ source.append(" unsigned int row_stop = row_blocks[block_id + 1]; \n");
+ source.append(" unsigned int rows_to_process = row_stop - row_start; \n");
+ source.append(" unsigned int element_start = row_indices[row_start]; \n");
+ source.append(" unsigned int element_stop = row_indices[row_stop]; \n");
+
+ source.append(" if (rows_to_process > 1) { \n"); // CSR stream
+ // load to shared buffer:
+ source.append(" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
+ source.append(" shared_elements[i - element_start] = elements[i] * p[column_indices[i]]; \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ // use one thread per row to sum:
+ source.append(" for (unsigned int row = row_start + get_local_id(0); row < row_stop; row += get_local_size(0)) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
+ source.append(" unsigned int thread_row_start = row_indices[row] - element_start; \n");
+ source.append(" unsigned int thread_row_stop = row_indices[row + 1] - element_start; \n");
+ source.append(" for (unsigned int i = thread_row_start; i < thread_row_stop; ++i) \n");
+ source.append(" dot_prod += shared_elements[i]; \n");
+ source.append(" Ap[row] = dot_prod; \n");
+ source.append(" inner_prod_ApAp += dot_prod * dot_prod; \n");
+ source.append(" inner_prod_pAp += p[row] * dot_prod; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ source.append(" else \n"); // CSR vector for a single row
+ source.append(" { \n");
+ // load and sum to shared buffer:
+ source.append(" shared_elements[get_local_id(0)] = 0; \n");
+ source.append(" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
+ source.append(" shared_elements[get_local_id(0)] += elements[i] * p[column_indices[i]]; \n");
+
+ // reduction to obtain final result
+ source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) \n");
+ source.append(" shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) + stride]; \n");
+ source.append(" } \n");
+
+ source.append(" if (get_local_id(0) == 0) { \n");
+ source.append(" Ap[row_start] = shared_elements[0]; \n");
+ source.append(" inner_prod_ApAp += shared_elements[0] * shared_elements[0]; \n");
+ source.append(" inner_prod_pAp += p[row_start] * shared_elements[0]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" } \n");
+
+ // parallel reduction in work group
+ source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
+ source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
+ source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+ source.append(" { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) { \n");
+ source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
+ source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
+ source.append(" } ");
+ source.append(" } ");
+
+ // write results to result array
+ source.append(" if (get_local_id(0) == 0) { \n ");
+ source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
+ source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
+ source.append(" } \n");
+
+ source.append("} \n");
+
+}
+
+
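+/** @brief Generates the OpenCL kernel cg_coo_prod: computes Ap = A * p for a COO (coordinate) matrix via a segmented
+  *        parallel reduction and accumulates the partial inner products <Ap, Ap> and <p, Ap> for the pipelined CG solver. */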
+template<typename StringT>
+void generate_coordinate_matrix_pipelined_cg_prod(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void cg_coo_prod( \n");
+ source.append(" __global const uint2 * coords, \n");//(row_index, column_index)
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" __global const uint * group_boundaries, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
+ source.append(" unsigned int size, \n");
+ source.append(" __local unsigned int * shared_rows, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * inter_results, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+ source.append(" unsigned int buffer_size, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
+ source.append("{ \n");
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
+
+ ///////////// Sparse matrix-vector multiplication part /////////////
+ source.append(" uint2 tmp; \n");
+ source.append(" "); source.append(numeric_string); source.append(" val; \n");
+ source.append(" uint group_start = group_boundaries[get_group_id(0)]; \n");
+ source.append(" uint group_end = group_boundaries[get_group_id(0) + 1]; \n");
+ source.append(" uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n"); // -1 in order to have correct behavior if group_end - group_start == j * get_local_size(0)
+
+ source.append(" uint local_index = 0; \n");
+
+ source.append(" for (uint k = 0; k < k_end; ++k) { \n");
+ source.append(" local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");
+
+ source.append(" tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
+ source.append(" val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0; \n");
+
+ //check for carry from previous loop run:
+ source.append(" if (get_local_id(0) == 0 && k > 0) { \n");
+ source.append(" if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
+ source.append(" val += inter_results[get_local_size(0)-1]; \n");
+ source.append(" else {\n");
+ source.append(" "); source.append(numeric_string); source.append(" Ap_entry = inter_results[get_local_size(0)-1]; \n");
+ source.append(" Ap[shared_rows[get_local_size(0)-1]] = Ap_entry; \n");
+ source.append(" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
+ source.append(" inner_prod_pAp += p[shared_rows[get_local_size(0)-1]] * Ap_entry; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ //segmented parallel reduction begin
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" shared_rows[get_local_id(0)] = tmp.x; \n");
+ source.append(" inter_results[get_local_id(0)] = val; \n");
+ source.append(" "); source.append(numeric_string); source.append(" left = 0; \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ source.append(" for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
+ source.append(" left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" inter_results[get_local_id(0)] += left; \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" } \n");
+ //segmented parallel reduction end
+
+ source.append(" if (local_index < group_end && get_local_id(0) < get_local_size(0) - 1 && \n");
+ source.append(" shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" Ap_entry = inter_results[get_local_id(0)]; \n");
+ source.append(" Ap[tmp.x] = Ap_entry; \n");
+ source.append(" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
+ source.append(" inner_prod_pAp += p[tmp.x] * Ap_entry; \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" } \n"); //for k
+
+ source.append(" if (local_index + 1 == group_end) {\n"); //write results of last active entry (this may not necessarily be the case already)
+ source.append(" "); source.append(numeric_string); source.append(" Ap_entry = inter_results[get_local_id(0)]; \n");
+ source.append(" Ap[tmp.x] = Ap_entry; \n");
+ source.append(" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
+ source.append(" inner_prod_pAp += p[tmp.x] * Ap_entry; \n");
+ source.append(" } \n");
+
+ //////////// parallel reduction of inner product contributions within work group ///////////////
+ source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
+ source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
+ source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+ source.append(" { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) { \n");
+ source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
+ source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
+ source.append(" } ");
+ source.append(" } ");
+
+ // write results to result array
+ source.append(" if (get_local_id(0) == 0) { \n ");
+ source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
+ source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
+ source.append(" } \n");
+
+ source.append("} \n \n");
+
+}
+
+
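+/** @brief Generates the OpenCL kernel cg_ell_prod: computes Ap = A * p for an ELL matrix (one work item per row)
+  *        and accumulates the partial inner products <Ap, Ap> and <p, Ap> for the pipelined CG solver. */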
+template<typename StringT>
+void generate_ell_matrix_pipelined_cg_prod(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void cg_ell_prod( \n");
+ source.append(" __global const unsigned int * coords, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" unsigned int internal_row_num, \n");
+ source.append(" unsigned int items_per_row, \n");
+ source.append(" unsigned int aligned_items_per_row, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
+ source.append(" unsigned int size, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+ source.append(" unsigned int buffer_size, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
+ source.append("{ \n");
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint glb_sz = get_global_size(0); \n");
+
+ source.append(" for (uint row = glb_id; row < size; row += glb_sz) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n");
+
+ source.append(" uint offset = row; \n");
+ source.append(" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" val = elements[offset]; \n");
+ source.append(" sum += (val != 0) ? p[coords[offset]] * val : ("); source.append(numeric_string); source.append(")0; \n");
+ source.append(" } \n");
+
+ source.append(" Ap[row] = sum; \n");
+ source.append(" inner_prod_ApAp += sum * sum; \n");
+ source.append(" inner_prod_pAp += p[row] * sum; \n");
+ source.append(" } \n");
+
+ //////////// parallel reduction of inner product contributions within work group ///////////////
+ source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
+ source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
+ source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+ source.append(" { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) { \n");
+ source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
+ source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
+ source.append(" } ");
+ source.append(" } ");
+
+ // write results to result array
+ source.append(" if (get_local_id(0) == 0) { \n ");
+ source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
+ source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
+ source.append(" } \n");
+ source.append("} \n \n");
+}
+
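+/** @brief Generates the OpenCL kernel cg_sliced_ell_prod: computes Ap = A * p for a sliced ELL matrix (one block of
+  *        work items per row block) and accumulates the partial inner products <Ap, Ap> and <p, Ap> for the pipelined CG solver. */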
+template<typename StringT>
+void generate_sliced_ell_matrix_pipelined_cg_prod(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void cg_sliced_ell_prod( \n");
+ source.append(" __global const unsigned int * columns_per_block, \n");
+ source.append(" __global const unsigned int * column_indices, \n");
+ source.append(" __global const unsigned int * block_start, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
+ source.append(" unsigned int size, \n");
+ source.append(" unsigned int block_size, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+ source.append(" unsigned int buffer_size, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
+ source.append("{ \n");
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
+ source.append(" uint blocks_per_workgroup = get_local_size(0) / block_size; \n");
+ source.append(" uint id_in_block = get_local_id(0) % block_size; \n");
+ source.append(" uint num_blocks = (size - 1) / block_size + 1; \n");
+ source.append(" uint global_warp_count = blocks_per_workgroup * get_num_groups(0); \n");
+ source.append(" uint global_warp_id = blocks_per_workgroup * get_group_id(0) + get_local_id(0) / block_size; \n");
+
+ source.append(" for (uint block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n");
+
+ source.append(" uint row = block_idx * block_size + id_in_block; \n");
+ source.append(" uint offset = block_start[block_idx]; \n");
+ source.append(" uint num_columns = columns_per_block[block_idx]; \n");
+ source.append(" for (uint item_id = 0; item_id < num_columns; item_id++) { \n");
+ source.append(" uint index = offset + item_id * block_size + id_in_block; \n");
+ source.append(" "); source.append(numeric_string); source.append(" val = elements[index]; \n");
+ source.append(" sum += (val != 0) ? (p[column_indices[index]] * val) : 0; \n");
+ source.append(" } \n");
+
+ source.append(" if (row < size) {\n");
+ source.append(" Ap[row] = sum; \n");
+ source.append(" inner_prod_ApAp += sum * sum; \n");
+ source.append(" inner_prod_pAp += p[row] * sum; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ //////////// parallel reduction of inner product contributions within work group ///////////////
+ source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
+ source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
+ source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+ source.append(" { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) { \n");
+ source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
+ source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
+ source.append(" } ");
+ source.append(" } ");
+
+ // write results to result array
+ source.append(" if (get_local_id(0) == 0) { \n ");
+ source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
+ source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
+ source.append(" } \n");
+ source.append("} \n \n");
+}
+
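+/** @brief Generates the OpenCL kernel cg_hyb_prod: computes Ap = A * p for a HYB (ELL plus CSR overflow) matrix
+  *        and accumulates the partial inner products <Ap, Ap> and <p, Ap> for the pipelined CG solver. */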
+template<typename StringT>
+void generate_hyb_matrix_pipelined_cg_prod(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void cg_hyb_prod( \n");
+ source.append(" const __global int* ell_coords, \n");
+ source.append(" const __global "); source.append(numeric_string); source.append("* ell_elements, \n");
+ source.append(" const __global uint* csr_rows, \n");
+ source.append(" const __global uint* csr_cols, \n");
+ source.append(" const __global "); source.append(numeric_string); source.append("* csr_elements, \n");
+ source.append(" unsigned int internal_row_num, \n");
+ source.append(" unsigned int items_per_row, \n");
+ source.append(" unsigned int aligned_items_per_row, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
+ source.append(" unsigned int size, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+ source.append(" unsigned int buffer_size, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
+ source.append("{ \n");
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint glb_sz = get_global_size(0); \n");
+
+ source.append(" for (uint row = glb_id; row < size; row += glb_sz) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n");
+
+ source.append(" uint offset = row; \n");
+ source.append(" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" val = ell_elements[offset]; \n");
+ source.append(" sum += (val != 0) ? (p[ell_coords[offset]] * val) : 0; \n");
+ source.append(" } \n");
+
+ source.append(" uint col_begin = csr_rows[row]; \n");
+ source.append(" uint col_end = csr_rows[row + 1]; \n");
+
+ source.append(" for (uint item_id = col_begin; item_id < col_end; item_id++) { \n");
+ source.append(" sum += (p[csr_cols[item_id]] * csr_elements[item_id]); \n");
+ source.append(" } \n");
+
+ source.append(" Ap[row] = sum; \n");
+ source.append(" inner_prod_ApAp += sum * sum; \n");
+ source.append(" inner_prod_pAp += p[row] * sum; \n");
+ source.append(" } \n");
+
+ //////////// parallel reduction of inner product contributions within work group ///////////////
+ source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
+ source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
+ source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+ source.append(" { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) { \n");
+ source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
+ source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
+ source.append(" } ");
+ source.append(" } ");
+
+ // write results to result array
+ source.append(" if (get_local_id(0) == 0) { \n ");
+ source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
+ source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
+ source.append(" } \n");
+ source.append("} \n \n");
+}
+
+
+//////////////////////////////////////////////////////
+
+
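+/** @brief Generates the OpenCL kernel bicgstab_update_s: reduces <r, r0*> and <Ap, r0*> to obtain alpha,
+  *        updates s = r - alpha * Ap, and writes the work-group partial sums of <s, s> back to the inner product buffer. */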
+template<typename StringT>
+void generate_pipelined_bicgstab_update_s(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void bicgstab_update_s( \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * s, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" const * r, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" const * Ap, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+ source.append(" unsigned int chunk_size, \n");
+ source.append(" unsigned int chunk_offset, \n");
+ source.append(" unsigned int size, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_Ap_in_r0) \n");
+ source.append("{ \n");
+
+ source.append(" "); source.append(numeric_string); source.append(" alpha = 0; \n");
+
+ // parallel reduction in work group to compute <r, r0> / <Ap, r0>
+ source.append(" shared_array[get_local_id(0)] = inner_prod_buffer[get_local_id(0)]; \n");
+ source.append(" shared_array_Ap_in_r0[get_local_id(0)] = inner_prod_buffer[get_local_id(0) + 3 * chunk_size]; \n");
+ source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+ source.append(" { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) { \n");
+ source.append(" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
+ source.append(" shared_array_Ap_in_r0[get_local_id(0)] += shared_array_Ap_in_r0[get_local_id(0) + stride]; \n");
+ source.append(" } ");
+ source.append(" } ");
+
+ // compute alpha from reduced values:
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" alpha = shared_array[0] / shared_array_Ap_in_r0[0]; ");
+
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_contrib = 0; \n");
+ source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" value_s = s[i]; \n");
+ source.append(" \n");
+ source.append(" value_s = r[i] - alpha * Ap[i]; \n");
+ source.append(" inner_prod_contrib += value_s * value_s; \n");
+ source.append(" \n");
+ source.append(" s[i] = value_s; \n");
+ source.append(" } \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ // parallel reduction in work group
+ source.append(" shared_array[get_local_id(0)] = inner_prod_contrib; \n");
+ source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+ source.append(" { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) \n");
+ source.append(" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
+ source.append(" } ");
+
+ // write results to result array
+ source.append(" if (get_local_id(0) == 0) \n ");
+ source.append(" inner_prod_buffer[get_group_id(0) + chunk_offset] = shared_array[0]; ");
+
+ source.append("} \n");
+
+}
+
+
+
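+/** @brief Generates the OpenCL kernel bicgstab_vector_update: fused update of the result, residual, and search direction p
+  *        in the pipelined BiCGStab iteration, accumulating the partial inner products <r, r0*> for the next iteration. */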
+template<typename StringT>
+void generate_pipelined_bicgstab_vector_update(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void bicgstab_vector_update( \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
+ source.append(" "); source.append(numeric_string); source.append(" alpha, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * p, \n");
+ source.append(" "); source.append(numeric_string); source.append(" omega, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" const * s, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * residual, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" const * As, \n");
+ source.append(" "); source.append(numeric_string); source.append(" beta, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" const * Ap, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" const * r0star, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+ source.append(" unsigned int size, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array) \n");
+ source.append("{ \n");
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_r_r0star = 0; \n");
+ source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" value_result = result[i]; \n");
+ source.append(" "); source.append(numeric_string); source.append(" value_p = p[i]; \n");
+ source.append(" "); source.append(numeric_string); source.append(" value_s = s[i]; \n");
+ source.append(" "); source.append(numeric_string); source.append(" value_residual = residual[i]; \n");
+ source.append(" "); source.append(numeric_string); source.append(" value_As = As[i]; \n");
+ source.append(" "); source.append(numeric_string); source.append(" value_Ap = Ap[i]; \n");
+ source.append(" "); source.append(numeric_string); source.append(" value_r0star = r0star[i]; \n");
+ source.append(" \n");
+ source.append(" value_result += alpha * value_p + omega * value_s; \n");
+ source.append(" value_residual = value_s - omega * value_As; \n");
+ source.append(" value_p = value_residual + beta * (value_p - omega * value_Ap); \n");
+ source.append(" \n");
+ source.append(" result[i] = value_result; \n");
+ source.append(" residual[i] = value_residual; \n");
+ source.append(" p[i] = value_p; \n");
+ source.append(" inner_prod_r_r0star += value_residual * value_r0star; \n");
+ source.append(" } \n");
+
+ // parallel reduction in work group
+ source.append(" shared_array[get_local_id(0)] = inner_prod_r_r0star; \n");
+ source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+ source.append(" { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) \n");
+ source.append(" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
+ source.append(" } ");
+
+ // write results to result array
+ source.append(" if (get_local_id(0) == 0) \n ");
+ source.append(" inner_prod_buffer[get_group_id(0)] = shared_array[0]; ");
+
+ source.append("} \n");
+}
+
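+/** @brief Generates the OpenCL kernel bicgstab_csr_blocked_prod: subwarp-based Ap = A * p for a CSR matrix,
+  *        fused with the partial inner products <Ap, Ap>, <p, Ap>, and <r0*, Ap> for the pipelined BiCGStab solver. */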
+template<typename StringT>
+void generate_compressed_matrix_pipelined_bicgstab_blocked_prod(StringT & source, std::string const & numeric_string, unsigned int subwarp_size)
+{
+ std::stringstream ss;
+ ss << subwarp_size;
+
+ source.append("__kernel void bicgstab_csr_blocked_prod( \n");
+ source.append(" __global const unsigned int * row_indices, \n");
+ source.append(" __global const unsigned int * column_indices, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * r0star, \n");
+ source.append(" unsigned int size, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+ source.append(" unsigned int buffer_size, \n");
+ source.append(" unsigned int buffer_offset, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_r0Ap) \n");
+ source.append("{ \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" shared_elements[256]; \n");
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_r0Ap = 0; \n");
+
+ source.append(" const unsigned int id_in_row = get_local_id(0) % " + ss.str() + "; \n");
+ source.append(" const unsigned int block_increment = get_local_size(0) * ((size - 1) / (get_global_size(0)) + 1); \n");
+ source.append(" const unsigned int block_start = get_group_id(0) * block_increment; \n");
+ source.append(" const unsigned int block_stop = min(block_start + block_increment, size); \n");
+
+ source.append(" for (unsigned int row = block_start + get_local_id(0) / " + ss.str() + "; \n");
+ source.append(" row < block_stop; \n");
+ source.append(" row += get_local_size(0) / " + ss.str() + ") \n");
+ source.append(" { \n");
+ source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
+ source.append(" unsigned int row_end = row_indices[row+1]; \n");
+ source.append(" for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += " + ss.str() + ") \n");
+ source.append(" dot_prod += elements[i] * p[column_indices[i]]; \n");
+
+ source.append(" shared_elements[get_local_id(0)] = dot_prod; \n");
+ source.append(" #pragma unroll \n");
+ source.append(" for (unsigned int k = 1; k < " + ss.str() + "; k *= 2) \n");
+ source.append(" shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) ^ k]; \n");
+
+ source.append(" if (id_in_row == 0) { \n");
+ source.append(" Ap[row] = shared_elements[get_local_id(0)]; \n");
+ source.append(" inner_prod_ApAp += shared_elements[get_local_id(0)] * shared_elements[get_local_id(0)]; \n");
+ source.append(" inner_prod_pAp += p[row] * shared_elements[get_local_id(0)]; \n");
+ source.append(" inner_prod_r0Ap += r0star[row] * shared_elements[get_local_id(0)]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ // parallel reduction in work group
+ source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
+ source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
+ source.append(" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
+ source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+ source.append(" { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) { \n");
+ source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
+ source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
+ source.append(" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
+ source.append(" } ");
+ source.append(" } ");
+
+ // write results to result array
+ source.append(" if (get_local_id(0) == 0) { \n ");
+ source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
+ source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
+ source.append(" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
+ source.append(" } \n");
+
+ source.append("} \n");
+}
+
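+/** @brief Generates the OpenCL kernel bicgstab_csr_prod: adaptive row-block Ap = A * p for a CSR matrix,
+  *        fused with the partial inner products <Ap, Ap>, <p, Ap>, and <r0*, Ap> for the pipelined BiCGStab solver. */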
+template<typename StringT>
+void generate_compressed_matrix_pipelined_bicgstab_prod(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void bicgstab_csr_prod( \n");
+ source.append(" __global const unsigned int * row_indices, \n");
+ source.append(" __global const unsigned int * column_indices, \n");
+ source.append(" __global const unsigned int * row_blocks, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" unsigned int num_blocks, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * r0star, \n");
+ source.append(" unsigned int size, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+ source.append(" unsigned int buffer_size, \n");
+ source.append(" unsigned int buffer_offset, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_r0Ap) \n");
+ source.append("{ \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" shared_elements[1024]; \n");
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_r0Ap = 0; \n");
+
+ source.append(" for (unsigned int block_id = get_group_id(0); block_id < num_blocks; block_id += get_num_groups(0)) { \n");
+ source.append(" unsigned int row_start = row_blocks[block_id]; \n");
+ source.append(" unsigned int row_stop = row_blocks[block_id + 1]; \n");
+ source.append(" unsigned int rows_to_process = row_stop - row_start; \n");
+ source.append(" unsigned int element_start = row_indices[row_start]; \n");
+ source.append(" unsigned int element_stop = row_indices[row_stop]; \n");
+
+ source.append(" if (rows_to_process > 1) { \n"); // CSR stream
+ // load to shared buffer:
+ source.append(" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
+ source.append(" shared_elements[i - element_start] = elements[i] * p[column_indices[i]]; \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ // use one thread per row to sum:
+ source.append(" for (unsigned int row = row_start + get_local_id(0); row < row_stop; row += get_local_size(0)) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
+ source.append(" unsigned int thread_row_start = row_indices[row] - element_start; \n");
+ source.append(" unsigned int thread_row_stop = row_indices[row + 1] - element_start; \n");
+ source.append(" for (unsigned int i = thread_row_start; i < thread_row_stop; ++i) \n");
+ source.append(" dot_prod += shared_elements[i]; \n");
+ source.append(" Ap[row] = dot_prod; \n");
+ source.append(" inner_prod_ApAp += dot_prod * dot_prod; \n");
+ source.append(" inner_prod_pAp += p[row] * dot_prod; \n");
+ source.append(" inner_prod_r0Ap += r0star[row] * dot_prod; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ source.append(" else \n"); // CSR vector for a single row
+ source.append(" { \n");
+ // load and sum to shared buffer:
+ source.append(" shared_elements[get_local_id(0)] = 0; \n");
+ source.append(" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
+ source.append(" shared_elements[get_local_id(0)] += elements[i] * p[column_indices[i]]; \n");
+
+ // reduction to obtain final result
+ source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) \n");
+ source.append(" shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) + stride]; \n");
+ source.append(" } \n");
+
+ source.append(" if (get_local_id(0) == 0) { \n");
+ source.append(" Ap[row_start] = shared_elements[0]; \n");
+ source.append(" inner_prod_ApAp += shared_elements[0] * shared_elements[0]; \n");
+ source.append(" inner_prod_pAp += p[row_start] * shared_elements[0]; \n");
+ source.append(" inner_prod_r0Ap += r0star[row_start] * shared_elements[0]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" } \n");
+
+ // parallel reduction in work group
+ source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
+ source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
+ source.append(" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
+ source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+ source.append(" { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) { \n");
+ source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
+ source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
+ source.append(" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
+ source.append(" } ");
+ source.append(" } ");
+
+ // write results to result array
+ source.append(" if (get_local_id(0) == 0) { \n ");
+ source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
+ source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
+ source.append(" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
+ source.append(" } \n");
+
+ source.append("} \n \n");
+
+}
+
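+/** @brief Generates the OpenCL kernel bicgstab_coo_prod: segmented-reduction Ap = A * p for a COO matrix,
+  *        fused with the partial inner products <Ap, Ap>, <p, Ap>, and <r0*, Ap> for the pipelined BiCGStab solver. */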
+template<typename StringT>
+void generate_coordinate_matrix_pipelined_bicgstab_prod(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void bicgstab_coo_prod( \n");
+ source.append(" __global const uint2 * coords, \n");//(row_index, column_index)
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" __global const uint * group_boundaries, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * r0star, \n");
+ source.append(" unsigned int size, \n");
+ source.append(" __local unsigned int * shared_rows, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * inter_results, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+ source.append(" unsigned int buffer_size, \n");
+ source.append(" unsigned int buffer_offset, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_r0Ap) \n");
+ source.append("{ \n");
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_r0Ap = 0; \n");
+
+ ///////////// Sparse matrix-vector multiplication part /////////////
+ source.append(" uint2 tmp; \n");
+ source.append(" "); source.append(numeric_string); source.append(" val; \n");
+ source.append(" uint group_start = group_boundaries[get_group_id(0)]; \n");
+ source.append(" uint group_end = group_boundaries[get_group_id(0) + 1]; \n");
+ source.append(" uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n"); // -1 in order to have correct behavior if group_end - group_start == j * get_local_size(0)
+
+ source.append(" uint local_index = 0; \n");
+
+ source.append(" for (uint k = 0; k < k_end; ++k) { \n");
+ source.append(" local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");
+
+ source.append(" tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
+ source.append(" val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0; \n");
+
+ //check for carry from previous loop run:
+ source.append(" if (get_local_id(0) == 0 && k > 0) { \n");
+ source.append(" if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
+ source.append(" val += inter_results[get_local_size(0)-1]; \n");
+ source.append(" else {\n");
+ source.append(" "); source.append(numeric_string); source.append(" Ap_entry = inter_results[get_local_size(0)-1]; \n");
+ source.append(" Ap[shared_rows[get_local_size(0)-1]] = Ap_entry; \n");
+ source.append(" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
+ source.append(" inner_prod_pAp += p[shared_rows[get_local_size(0)-1]] * Ap_entry; \n");
+ source.append(" inner_prod_r0Ap += r0star[shared_rows[get_local_size(0)-1]] * Ap_entry; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ //segmented parallel reduction begin
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" shared_rows[get_local_id(0)] = tmp.x; \n");
+ source.append(" inter_results[get_local_id(0)] = val; \n");
+ source.append(" "); source.append(numeric_string); source.append(" left = 0; \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ source.append(" for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
+ source.append(" left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" inter_results[get_local_id(0)] += left; \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" } \n");
+ //segmented parallel reduction end
+
+ source.append(" if (local_index < group_end && get_local_id(0) < get_local_size(0) - 1 && \n");
+ source.append(" shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" Ap_entry = inter_results[get_local_id(0)]; \n");
+ source.append(" Ap[tmp.x] = Ap_entry; \n");
+ source.append(" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
+ source.append(" inner_prod_pAp += p[tmp.x] * Ap_entry; \n");
+ source.append(" inner_prod_r0Ap += r0star[tmp.x] * Ap_entry; \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" } \n"); //for k
+
+ source.append(" if (local_index + 1 == group_end) {\n"); //write results of last active entry (this may not necessarily be the case already)
+ source.append(" "); source.append(numeric_string); source.append(" Ap_entry = inter_results[get_local_id(0)]; \n");
+ source.append(" Ap[tmp.x] = Ap_entry; \n");
+ source.append(" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
+ source.append(" inner_prod_pAp += p[tmp.x] * Ap_entry; \n");
+ source.append(" inner_prod_r0Ap += r0star[tmp.x] * Ap_entry; \n");
+ source.append(" } \n");
+
+ // parallel reduction in work group
+ source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
+ source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
+ source.append(" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
+ source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+ source.append(" { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) { \n");
+ source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
+ source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
+ source.append(" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
+ source.append(" } ");
+ source.append(" } ");
+
+ // write results to result array
+ source.append(" if (get_local_id(0) == 0) { \n ");
+ source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
+ source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
+ source.append(" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
+ source.append(" } \n");
+
+ source.append("} \n \n");
+
+}
+
+
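+/** @brief Generates the OpenCL kernel bicgstab_ell_prod: Ap = A * p for an ELL matrix (one work item per row),
+  *        fused with the partial inner products <Ap, Ap>, <p, Ap>, and <r0*, Ap> for the pipelined BiCGStab solver. */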
+template<typename StringT>
+void generate_ell_matrix_pipelined_bicgstab_prod(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void bicgstab_ell_prod( \n");
+ source.append(" __global const unsigned int * coords, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" unsigned int internal_row_num, \n");
+ source.append(" unsigned int items_per_row, \n");
+ source.append(" unsigned int aligned_items_per_row, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * r0star, \n");
+ source.append(" unsigned int size, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+ source.append(" unsigned int buffer_size, \n");
+ source.append(" unsigned int buffer_offset, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_r0Ap) \n");
+ source.append("{ \n");
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_r0Ap = 0; \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint glb_sz = get_global_size(0); \n");
+
+ source.append(" for (uint row = glb_id; row < size; row += glb_sz) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n");
+
+ source.append(" uint offset = row; \n");
+ source.append(" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" val = elements[offset]; \n");
+ source.append(" sum += (val != 0) ? p[coords[offset]] * val : ("); source.append(numeric_string); source.append(")0; \n");
+ source.append(" } \n");
+
+ source.append(" Ap[row] = sum; \n");
+ source.append(" inner_prod_ApAp += sum * sum; \n");
+ source.append(" inner_prod_pAp += p[row] * sum; \n");
+ source.append(" inner_prod_r0Ap += r0star[row] * sum; \n");
+ source.append(" } \n");
+
+ // parallel reduction in work group
+ source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
+ source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
+ source.append(" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
+ source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+ source.append(" { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) { \n");
+ source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
+ source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
+ source.append(" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
+ source.append(" } ");
+ source.append(" } ");
+
+ // write results to result array
+ source.append(" if (get_local_id(0) == 0) { \n ");
+ source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
+ source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
+ source.append(" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
+ source.append(" } \n");
+ source.append("} \n \n");
+}
+
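+/** @brief Generates the OpenCL kernel bicgstab_sliced_ell_prod: Ap = A * p for a sliced ELL matrix,
+  *        fused with the partial inner products <Ap, Ap>, <p, Ap>, and <r0*, Ap> for the pipelined BiCGStab solver. */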
+template<typename StringT>
+void generate_sliced_ell_matrix_pipelined_bicgstab_prod(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void bicgstab_sliced_ell_prod( \n");
+ source.append(" __global const unsigned int * columns_per_block, \n");
+ source.append(" __global const unsigned int * column_indices, \n");
+ source.append(" __global const unsigned int * block_start, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * r0star, \n");
+ source.append(" unsigned int size, \n");
+ source.append(" unsigned int block_size, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+ source.append(" unsigned int buffer_size, \n");
+ source.append(" unsigned int buffer_offset, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_r0Ap) \n");
+ source.append("{ \n");
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_r0Ap = 0; \n");
+ source.append(" uint blocks_per_workgroup = get_local_size(0) / block_size; \n");
+ source.append(" uint id_in_block = get_local_id(0) % block_size; \n");
+ source.append(" uint num_blocks = (size - 1) / block_size + 1; \n");
+ source.append(" uint global_warp_count = blocks_per_workgroup * get_num_groups(0); \n");
+ source.append(" uint global_warp_id = blocks_per_workgroup * get_group_id(0) + get_local_id(0) / block_size; \n");
+
+ source.append(" for (uint block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n");
+
+ source.append(" uint row = block_idx * block_size + id_in_block; \n");
+ source.append(" uint offset = block_start[block_idx]; \n");
+ source.append(" uint num_columns = columns_per_block[block_idx]; \n");
+ source.append(" for (uint item_id = 0; item_id < num_columns; item_id++) { \n");
+ source.append(" uint index = offset + item_id * block_size + id_in_block; \n");
+ source.append(" "); source.append(numeric_string); source.append(" val = elements[index]; \n");
+ source.append(" sum += (val != 0) ? (p[column_indices[index]] * val) : 0; \n");
+ source.append(" } \n");
+
+ source.append(" if (row < size) {\n");
+ source.append(" Ap[row] = sum; \n");
+ source.append(" inner_prod_ApAp += sum * sum; \n");
+ source.append(" inner_prod_pAp += p[row] * sum; \n");
+ source.append(" inner_prod_r0Ap += r0star[row] * sum; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ // parallel reduction in work group
+ source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
+ source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
+ source.append(" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
+ source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+ source.append(" { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) { \n");
+ source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
+ source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
+ source.append(" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
+ source.append(" } ");
+ source.append(" } ");
+
+ // write results to result array
+ source.append(" if (get_local_id(0) == 0) { \n ");
+ source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
+ source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
+ source.append(" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
+ source.append(" } \n");
+ source.append("} \n \n");
+}
+
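+/** @brief Generates the OpenCL kernel bicgstab_hyb_prod: Ap = A * p for a HYB (ELL plus CSR overflow) matrix,
+  *        fused with the partial inner products <Ap, Ap>, <p, Ap>, and <r0*, Ap> for the pipelined BiCGStab solver. */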
+template<typename StringT>
+void generate_hyb_matrix_pipelined_bicgstab_prod(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void bicgstab_hyb_prod( \n");
+ source.append(" const __global int* ell_coords, \n");
+ source.append(" const __global "); source.append(numeric_string); source.append("* ell_elements, \n");
+ source.append(" const __global uint* csr_rows, \n");
+ source.append(" const __global uint* csr_cols, \n");
+ source.append(" const __global "); source.append(numeric_string); source.append("* csr_elements, \n");
+ source.append(" unsigned int internal_row_num, \n");
+ source.append(" unsigned int items_per_row, \n");
+ source.append(" unsigned int aligned_items_per_row, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * r0star, \n");
+ source.append(" unsigned int size, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+ source.append(" unsigned int buffer_size, \n");
+ source.append(" unsigned int buffer_offset, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_r0Ap) \n");
+ source.append("{ \n");
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_ApAp = 0; \n");
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_pAp = 0; \n");
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_r0Ap = 0; \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint glb_sz = get_global_size(0); \n");
+
+ source.append(" for (uint row = glb_id; row < size; row += glb_sz) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n");
+
+ source.append(" uint offset = row; \n");
+ source.append(" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" val = ell_elements[offset]; \n");
+ source.append(" sum += (val != 0) ? (p[ell_coords[offset]] * val) : 0; \n");
+ source.append(" } \n");
+
+ source.append(" uint col_begin = csr_rows[row]; \n");
+ source.append(" uint col_end = csr_rows[row + 1]; \n");
+
+ source.append(" for (uint item_id = col_begin; item_id < col_end; item_id++) { \n");
+ source.append(" sum += (p[csr_cols[item_id]] * csr_elements[item_id]); \n");
+ source.append(" } \n");
+
+ source.append(" Ap[row] = sum; \n");
+ source.append(" inner_prod_ApAp += sum * sum; \n");
+ source.append(" inner_prod_pAp += p[row] * sum; \n");
+ source.append(" inner_prod_r0Ap += r0star[row] * sum; \n");
+ source.append(" } \n");
+
+ // parallel reduction in work group
+ source.append(" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
+ source.append(" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
+ source.append(" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
+ source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+ source.append(" { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) { \n");
+ source.append(" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
+ source.append(" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
+ source.append(" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
+ source.append(" } ");
+ source.append(" } ");
+
+ // write results to result array
+ source.append(" if (get_local_id(0) == 0) { \n ");
+ source.append(" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
+ source.append(" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
+ source.append(" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
+ source.append(" } \n");
+ source.append("} \n \n");
+}
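
The pipelined BiCGStab product kernels generated above fuse the sparse matrix-vector product Ap = A*p with three reductions, <Ap,Ap>, <p,Ap> and <r0*,Ap>, so that the iteration's inner products come back in a single small buffer (inner_prod_buffer) instead of separate reduction passes. A minimal sequential C++ sketch of the quantities each kernel accumulates (hypothetical names, dense storage for illustration only, not part of the commit):

    #include <vector>
    #include <cstddef>

    // Sketch only: what the fused kernel computes, written sequentially.
    // A, p, r0star are inputs; Ap and the three scalars are the values the
    // OpenCL kernel above accumulates per work-group into inner_prod_buffer.
    void fused_spmv_and_dots(const std::vector<std::vector<double> > &A,
                             const std::vector<double> &p,
                             const std::vector<double> &r0star,
                             std::vector<double> &Ap,
                             double &ApAp, double &pAp, double &r0Ap)
    {
      ApAp = pAp = r0Ap = 0;
      for (std::size_t row = 0; row < A.size(); ++row)
      {
        double sum = 0;
        for (std::size_t col = 0; col < A[row].size(); ++col)
          sum += A[row][col] * p[col];          // sparse product in the real kernel
        Ap[row] = sum;
        ApAp += sum * sum;                      // <Ap, Ap>
        pAp  += p[row] * sum;                   // <p, Ap>
        r0Ap += r0star[row] * sum;              // <r0*, Ap>
      }
    }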
+
+//////////////////////////////
+
+
+template <typename StringType>
+void generate_pipelined_gmres_gram_schmidt_stage1(StringType & source, std::string const & numeric_string, bool is_nvidia)
+{
+ source.append("__kernel void gmres_gram_schmidt_1( \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" const * krylov_basis, \n");
+ source.append(" unsigned int size, \n");
+ source.append(" unsigned int internal_size, \n");
+ source.append(" unsigned int k, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * vi_in_vk_buffer, \n");
+ source.append(" unsigned int chunk_size) \n");
+ source.append("{ \n");
+
+ source.append(" __local "); source.append(numeric_string); source.append(" shared_array[7*128]; \n");
+  if (!is_nvidia) // use of thread-local variables entails a 2x performance drop on NVIDIA GPUs, but is faster on AMD
+ {
+ source.append(" "); source.append(numeric_string); source.append(" vi_in_vk[7]; \n");
+ }
+ source.append(" "); source.append(numeric_string); source.append(" value_vk = 0; \n");
+
+ source.append(" unsigned int k_base = 0; \n");
+ source.append(" while (k_base < k) { \n");
+ source.append(" unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base); \n");
+
+ if (is_nvidia)
+ {
+ source.append(" for (uint j=0; j<vecs_in_iteration; ++j) \n");
+ source.append(" shared_array[get_local_id(0) + j*chunk_size] = 0; \n");
+ }
+ else
+ {
+ source.append(" vi_in_vk[0] = 0;\n");
+ source.append(" vi_in_vk[1] = 0;\n");
+ source.append(" vi_in_vk[2] = 0;\n");
+ source.append(" vi_in_vk[3] = 0;\n");
+ source.append(" vi_in_vk[4] = 0;\n");
+ source.append(" vi_in_vk[5] = 0;\n");
+ source.append(" vi_in_vk[6] = 0;\n");
+ }
+ source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
+ source.append(" value_vk = krylov_basis[i + k * internal_size]; \n");
+ source.append(" \n");
+ source.append(" for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
+ if (is_nvidia)
+ source.append(" shared_array[get_local_id(0) + j*chunk_size] += value_vk * krylov_basis[i + (k_base + j) * internal_size]; \n");
+ else
+ source.append(" vi_in_vk[j] += value_vk * krylov_basis[i + (k_base + j) * internal_size]; \n");
+ source.append(" } \n");
+
+ // parallel reduction in work group
+ if (!is_nvidia)
+ {
+ source.append(" for (uint j=0; j<vecs_in_iteration; ++j) \n");
+ source.append(" shared_array[get_local_id(0) + j*chunk_size] = vi_in_vk[j]; \n");
+ }
+ source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+ source.append(" { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) { \n");
+ source.append(" for (uint j=0; j<vecs_in_iteration; ++j) \n");
+ source.append(" shared_array[get_local_id(0) + j*chunk_size] += shared_array[get_local_id(0) + j*chunk_size + stride]; \n");
+ source.append(" } ");
+ source.append(" } ");
+
+ // write results to result array
+ source.append(" if (get_local_id(0) == 0) \n ");
+ source.append(" for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
+ source.append(" vi_in_vk_buffer[get_group_id(0) + (k_base + j) * chunk_size] = shared_array[j*chunk_size]; ");
+
+ source.append(" k_base += vecs_in_iteration; \n");
+ source.append(" } \n");
+
+ source.append("} \n");
+
+}
+
+template <typename StringType>
+void generate_pipelined_gmres_gram_schmidt_stage2(StringType & source, std::string const & numeric_string)
+{
+ source.append("__kernel void gmres_gram_schmidt_2( \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * krylov_basis, \n");
+ source.append(" unsigned int size, \n");
+ source.append(" unsigned int internal_size, \n");
+ source.append(" unsigned int k, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" const * vi_in_vk_buffer, \n");
+ source.append(" unsigned int chunk_size, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * R_buffer, \n");
+ source.append(" unsigned int krylov_dim, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array) \n");
+ source.append("{ \n");
+
+ source.append(" "); source.append(numeric_string); source.append(" vk_dot_vk = 0; \n");
+ source.append(" "); source.append(numeric_string); source.append(" value_vk = 0; \n");
+
+ source.append(" unsigned int k_base = 0; \n");
+ source.append(" while (k_base < k) { \n");
+ source.append(" unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base); \n");
+
+ // parallel reduction in work group for <v_i, v_k>
+ source.append(" for (uint j=0; j<vecs_in_iteration; ++j) \n");
+ source.append(" shared_array[get_local_id(0) + j*chunk_size] = vi_in_vk_buffer[get_local_id(0) + (k_base + j) * chunk_size]; \n");
+ source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+ source.append(" { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) { \n");
+ source.append(" for (uint j=0; j<vecs_in_iteration; ++j) \n");
+ source.append(" shared_array[get_local_id(0) + j*chunk_size] += shared_array[get_local_id(0) + j*chunk_size + stride]; \n");
+ source.append(" } ");
+ source.append(" } ");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ // v_k -= <v_i, v_k> v_i:
+ source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
+ source.append(" value_vk = krylov_basis[i + k * internal_size]; \n");
+ source.append(" \n");
+ source.append(" for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
+ source.append(" value_vk -= shared_array[j*chunk_size] * krylov_basis[i + (k_base + j) * internal_size]; \n");
+ source.append(" vk_dot_vk += (k_base + vecs_in_iteration == k) ? (value_vk * value_vk) : 0; \n");
+ source.append(" krylov_basis[i + k * internal_size] = value_vk; \n");
+ source.append(" } \n");
+
+ // write to R: (to avoid thread divergence, all threads write the same value)
+ source.append(" if (get_group_id(0) == 0) \n");
+ source.append(" for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
+ source.append(" R_buffer[(k_base + j) + k*krylov_dim] = shared_array[j*chunk_size]; ");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ source.append(" k_base += vecs_in_iteration; \n");
+ source.append(" } \n");
+
+ // parallel reduction in work group for <v_k, v_k>
+ source.append(" shared_array[get_local_id(0)] = vk_dot_vk; \n");
+ source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+ source.append(" { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) \n");
+ source.append(" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
+ source.append(" } ");
+
+ // write results to result array
+ source.append(" if (get_local_id(0) == 0) \n ");
+ source.append(" inner_prod_buffer[chunk_size+get_group_id(0)] = shared_array[0]; ");
+
+ source.append("} \n");
+}
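
Together, the two Gram-Schmidt kernels split one step of classical Gram-Schmidt: stage 1 accumulates the projections <v_i, v_k> for up to seven basis vectors at a time into vi_in_vk_buffer, and stage 2 reduces them, subtracts the projections from v_k, writes the coefficients into R_buffer, and accumulates <v_k, v_k> for the later normalization. A sequential sketch of the same step (hypothetical names, illustration only, not part of the commit):

    #include <vector>
    #include <cstddef>

    // Sketch of the math performed by gmres_gram_schmidt_1/2, ignoring the
    // chunked reduction buffers: orthogonalize v_k against v_0 .. v_{k-1}.
    void gram_schmidt_step(std::vector<std::vector<double> > &v,  // Krylov basis vectors
                           std::size_t k,
                           std::vector<double> &R_column,         // receives R(0..k-1, k)
                           double &vk_dot_vk)
    {
      for (std::size_t i = 0; i < k; ++i)            // stage 1: projections <v_i, v_k>
      {
        double proj = 0;
        for (std::size_t n = 0; n < v[k].size(); ++n)
          proj += v[i][n] * v[k][n];
        R_column[i] = proj;
      }
      vk_dot_vk = 0;
      for (std::size_t n = 0; n < v[k].size(); ++n)  // stage 2: subtract projections
      {
        double value = v[k][n];
        for (std::size_t i = 0; i < k; ++i)
          value -= R_column[i] * v[i][n];
        v[k][n] = value;
        vk_dot_vk += value * value;                  // <v_k, v_k> for later normalization
      }
    }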
+
+template <typename StringType>
+void generate_pipelined_gmres_normalize_vk(StringType & source, std::string const & numeric_string)
+{
+ source.append("__kernel void gmres_normalize_vk( \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * vk, \n");
+ source.append(" unsigned int vk_offset, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" const * residual, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * R_buffer, \n");
+ source.append(" unsigned int R_offset, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" const * inner_prod_buffer, \n");
+ source.append(" unsigned int chunk_size, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * r_dot_vk_buffer, \n");
+ source.append(" unsigned int chunk_offset, \n");
+ source.append(" unsigned int size, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array) \n");
+ source.append("{ \n");
+
+ source.append(" "); source.append(numeric_string); source.append(" norm_vk = 0; \n");
+
+ // parallel reduction in work group to compute <vk, vk>
+ source.append(" shared_array[get_local_id(0)] = inner_prod_buffer[get_local_id(0) + chunk_size]; \n");
+ source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+ source.append(" { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) \n");
+ source.append(" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
+ source.append(" } ");
+
+ // compute alpha from reduced values:
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" norm_vk = sqrt(shared_array[0]); \n");
+
+ source.append(" "); source.append(numeric_string); source.append(" inner_prod_contrib = 0; \n");
+ source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" value_vk = vk[i + vk_offset] / norm_vk; \n");
+ source.append(" \n");
+ source.append(" inner_prod_contrib += residual[i] * value_vk; \n");
+ source.append(" \n");
+ source.append(" vk[i + vk_offset] = value_vk; \n");
+ source.append(" } \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ // parallel reduction in work group
+ source.append(" shared_array[get_local_id(0)] = inner_prod_contrib; \n");
+ source.append(" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
+ source.append(" { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) \n");
+ source.append(" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
+ source.append(" } ");
+
+ // write results to result array
+ source.append(" if (get_local_id(0) == 0) \n ");
+ source.append(" r_dot_vk_buffer[get_group_id(0) + chunk_offset] = shared_array[0]; ");
+ source.append(" if (get_global_id(0) == 0) \n ");
+ source.append(" R_buffer[R_offset] = norm_vk; \n");
+
+ source.append("} \n");
+
+}
+
+template <typename StringType>
+void generate_pipelined_gmres_update_result(StringType & source, std::string const & numeric_string)
+{
+ source.append("__kernel void gmres_update_result( \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" const * residual, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" const * krylov_basis, \n");
+ source.append(" unsigned int size, \n");
+ source.append(" unsigned int internal_size, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" const * coefficients, \n");
+ source.append(" unsigned int k) \n");
+ source.append("{ \n");
+
+ source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" value_result = result[i] + coefficients[0] * residual[i]; \n");
+ source.append(" \n");
+ source.append(" for (unsigned int j = 1; j < k; ++j) \n");
+ source.append(" value_result += coefficients[j] * krylov_basis[i + (j-1)*internal_size]; \n");
+ source.append(" \n");
+ source.append(" result[i] = value_result; \n");
+ source.append(" } \n");
+
+ source.append("} \n");
+}
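
gmres_update_result applies the final correction x <- x + c_0 * r + sum over j = 1 .. k-1 of c_j * v_{j-1}, where r is the initial residual and v_0 .. v_{k-2} are the Krylov basis vectors stored with leading dimension internal_size. A sequential sketch (hypothetical names, illustration only, not part of the commit):

    #include <vector>
    #include <cstddef>

    // Sequential sketch of gmres_update_result:
    // x <- x + c[0]*r + sum_{j=1..k-1} c[j] * V[j-1].
    void gmres_update(std::vector<double> &x,
                      const std::vector<double> &r,
                      const std::vector<std::vector<double> > &V,
                      const std::vector<double> &c,
                      std::size_t k)
    {
      for (std::size_t i = 0; i < x.size(); ++i)
      {
        double value = x[i] + c[0] * r[i];
        for (std::size_t j = 1; j < k; ++j)
          value += c[j] * V[j-1][i];
        x[i] = value;
      }
    }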
+
+
+template <typename StringType>
+void generate_compressed_matrix_pipelined_gmres_blocked_prod(StringType & source, std::string const & numeric_string)
+{
+ source.append("__kernel void gmres_csr_blocked_prod( \n");
+ source.append(" __global const unsigned int * row_indices, \n");
+ source.append(" __global const unsigned int * column_indices, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
+ source.append(" unsigned int offset_p, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
+ source.append(" unsigned int offset_Ap, \n");
+ source.append(" unsigned int size, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+ source.append(" unsigned int buffer_size, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
+ source.append("{ \n");
+ source.append(" cg_csr_blocked_prod(row_indices, column_indices, elements, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
+ source.append("} \n \n");
+
+}
+
+template <typename StringType>
+void generate_compressed_matrix_pipelined_gmres_prod(StringType & source, std::string const & numeric_string)
+{
+ source.append("__kernel void gmres_csr_prod( \n");
+ source.append(" __global const unsigned int * row_indices, \n");
+ source.append(" __global const unsigned int * column_indices, \n");
+ source.append(" __global const unsigned int * row_blocks, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" unsigned int num_blocks, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
+ source.append(" unsigned int offset_p, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
+ source.append(" unsigned int offset_Ap, \n");
+ source.append(" unsigned int size, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+ source.append(" unsigned int buffer_size, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_elements) \n");
+ source.append("{ \n");
+ source.append(" cg_csr_prod(row_indices, column_indices, row_blocks, elements, num_blocks, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp, shared_elements); \n");
+ source.append("} \n \n");
+
+}
+
+template <typename StringType>
+void generate_coordinate_matrix_pipelined_gmres_prod(StringType & source, std::string const & numeric_string)
+{
+ source.append("__kernel void gmres_coo_prod( \n");
+ source.append(" __global const uint2 * coords, \n");//(row_index, column_index)
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" __global const uint * group_boundaries, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * p, \n");
+ source.append(" unsigned int offset_p, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * Ap, \n");
+ source.append(" unsigned int offset_Ap, \n");
+ source.append(" unsigned int size, \n");
+ source.append(" __local unsigned int * shared_rows, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * inter_results, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * inner_prod_buffer, \n");
+ source.append(" unsigned int buffer_size, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_ApAp, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * shared_array_pAp) \n");
+ source.append("{ \n");
+ source.append(" cg_coo_prod(coords, elements, group_boundaries, p + offset_p, Ap + offset_Ap, size, shared_rows, inter_results, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
+ source.append("} \n \n");
+
+}
+
+
+template <typename StringType>
+void generate_ell_matrix_pipelined_gmres_prod(StringType & source, std::string const & numeric_string)
+{
+ source.append("__kernel void gmres_ell_prod( \n");
+ source.append(" __global const unsigned int * coords, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" unsigned int internal_row_num, \n");
+ source.append(" unsigned int items_per_row, \n");
+ source.append(" unsigned int aligned_items_per_row, \n");
+ source.append(" __global const "); source.app
<TRUNCATED>
[28/51] [partial] mahout git commit: (nojira) add native-viennaCL
module to codebase. closes apache/mahout#241
Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/vector_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/vector_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/vector_operations.hpp
new file mode 100644
index 0000000..b7eaeb4
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/vector_operations.hpp
@@ -0,0 +1,3252 @@
+#ifndef VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/vector_operations.hpp
+    @brief Implementations of vector operations using CUDA
+*/
+
+#include <cmath>
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/stride.hpp"
+
+#include "viennacl/linalg/cuda/common.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
+//
+// Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here!
+//
+template<typename DestNumericT, typename SrcNumericT>
+__global__ void convert_kernel(DestNumericT * dest, unsigned int start_dest, unsigned int inc_dest, unsigned int size_dest,
+ SrcNumericT const * src, unsigned int start_src, unsigned int inc_src)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size_dest;
+ i += gridDim.x * blockDim.x)
+ dest[i*inc_dest+start_dest] = src[i*inc_src+start_src];
+}
+
+
+template<typename DestNumericT, typename SrcNumericT>
+void convert(vector_base<DestNumericT> & dest, vector_base<SrcNumericT> const & src)
+{
+ convert_kernel<<<128, 128>>>(viennacl::cuda_arg(dest),
+ static_cast<unsigned int>(viennacl::traits::start(dest)),
+ static_cast<unsigned int>(viennacl::traits::stride(dest)),
+ static_cast<unsigned int>(viennacl::traits::size(dest)),
+
+ viennacl::cuda_arg(src),
+ static_cast<unsigned int>(viennacl::traits::start(src)),
+ static_cast<unsigned int>(viennacl::traits::stride(src)) );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("convert_kernel");
+}
+
+
+//////////////////////// av /////////////////////////////
+
+// gpu scalar
+template<typename NumericT>
+__global__ void av_kernel(NumericT * vec1,
+ unsigned int start1,
+ unsigned int inc1,
+ unsigned int size1,
+
+ const NumericT * fac2,
+ unsigned int options2,
+ const NumericT * vec2,
+ unsigned int start2,
+ unsigned int inc2)
+{
+ NumericT alpha = *fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+
+ if (options2 & (1 << 1))
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha;
+ }
+ else
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha;
+ }
+}
+
+// cpu scalar
+template<typename NumericT>
+__global__ void av_kernel(NumericT * vec1,
+ unsigned int start1,
+ unsigned int inc1,
+ unsigned int size1,
+
+ NumericT fac2,
+ unsigned int options2,
+ const NumericT * vec2,
+ unsigned int start2,
+ unsigned int inc2)
+{
+ NumericT alpha = fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+
+ if (options2 & (1 << 1))
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha;
+ }
+ else
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha;
+ }
+}
+
+
+
+template<typename NumericT, typename ScalarType1>
+void av(vector_base<NumericT> & vec1,
+ vector_base<NumericT> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
+{
+ typedef NumericT value_type;
+
+ unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+ value_type data_alpha = alpha;
+ if (flip_sign_alpha)
+ data_alpha = -data_alpha;
+ if (reciprocal_alpha)
+ data_alpha = static_cast<value_type>(1) / data_alpha;
+
+ value_type temporary_alpha = 0;
+ if (viennacl::is_cpu_scalar<ScalarType1>::value)
+ temporary_alpha = alpha;
+
+ av_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+ viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+ options_alpha,
+ viennacl::cuda_arg(vec2),
+ static_cast<unsigned int>(viennacl::traits::start(vec2)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec2)) );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
+}
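
In the kernels above, the options word produced by detail::make_options() is interpreted bitwise: bit 0 requests a sign flip of the scalar, bit 1 requests its reciprocal, i.e. division instead of multiplication. A small sketch of that decoding (illustration only, not part of the commit):

    // How the av/avbv kernels above interpret the options word;
    // the real kernels inline this around their element loops.
    double apply_scaled(double x, double alpha, unsigned int options)
    {
      if (options & (1u << 0))   // bit 0: flip the sign of the scalar
        alpha = -alpha;
      if (options & (1u << 1))   // bit 1: scalar enters as a reciprocal
        return x / alpha;
      return x * alpha;
    }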
+
+
+///////////////////// avbv //////////////////////////////////
+
+// alpha and beta on GPU
+template<typename NumericT>
+__global__ void avbv_kernel(NumericT * vec1,
+ unsigned int start1,
+ unsigned int inc1,
+ unsigned int size1,
+
+ const NumericT * fac2,
+ unsigned int options2,
+ const NumericT * vec2,
+ unsigned int start2,
+ unsigned int inc2,
+
+ const NumericT * fac3,
+ unsigned int options3,
+ const NumericT * vec3,
+ unsigned int start3,
+ unsigned int inc3)
+{
+ NumericT alpha = *fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+
+ NumericT beta = *fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+
+ if (options2 & (1 << 1))
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
+ }
+ else
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
+ }
+ }
+ else
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
+ }
+ else
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
+ }
+ }
+}
+
+// alpha on CPU, beta on GPU
+template<typename NumericT>
+__global__ void avbv_kernel(NumericT * vec1,
+ unsigned int start1,
+ unsigned int inc1,
+ unsigned int size1,
+
+ NumericT fac2,
+ unsigned int options2,
+ const NumericT * vec2,
+ unsigned int start2,
+ unsigned int inc2,
+
+ const NumericT * fac3,
+ unsigned int options3,
+ const NumericT * vec3,
+ unsigned int start3,
+ unsigned int inc3)
+{
+ NumericT alpha = fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+
+ NumericT beta = *fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+
+ if (options2 & (1 << 1))
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
+ }
+ else
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
+ }
+ }
+ else
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
+ }
+ else
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
+ }
+ }
+}
+
+// alpha on GPU, beta on CPU
+template<typename NumericT>
+__global__ void avbv_kernel(NumericT * vec1,
+ unsigned int start1,
+ unsigned int inc1,
+ unsigned int size1,
+
+ const NumericT * fac2,
+ unsigned int options2,
+ const NumericT * vec2,
+ unsigned int start2,
+ unsigned int inc2,
+
+ NumericT fac3,
+ unsigned int options3,
+ const NumericT * vec3,
+ unsigned int start3,
+ unsigned int inc3)
+{
+ NumericT alpha = *fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+
+ NumericT beta = fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+
+ if (options2 & (1 << 1))
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
+ }
+ else
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
+ }
+ }
+ else
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
+ }
+ else
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
+ }
+ }
+}
+
+// alpha and beta on CPU
+template<typename NumericT>
+__global__ void avbv_kernel(NumericT * vec1,
+ unsigned int start1,
+ unsigned int inc1,
+ unsigned int size1,
+
+ NumericT fac2,
+ unsigned int options2,
+ const NumericT * vec2,
+ unsigned int start2,
+ unsigned int inc2,
+
+ NumericT fac3,
+ unsigned int options3,
+ const NumericT * vec3,
+ unsigned int start3,
+ unsigned int inc3)
+{
+ NumericT alpha = fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+
+ NumericT beta = fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+
+ if (options2 & (1 << 1))
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
+ }
+ else
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
+ }
+ }
+ else
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
+ }
+ else
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
+ }
+ }
+}
+
+
+
+
+template<typename NumericT, typename ScalarT1, typename ScalarT2>
+void avbv(vector_base<NumericT> & vec1,
+ vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+ vector_base<NumericT> const & vec3, ScalarT2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
+{
+ typedef NumericT value_type;
+
+ unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+ value_type data_alpha = alpha;
+ if (flip_sign_alpha)
+ data_alpha = -data_alpha;
+ if (reciprocal_alpha)
+ data_alpha = static_cast<value_type>(1) / data_alpha;
+
+ value_type temporary_alpha = 0;
+ if (viennacl::is_cpu_scalar<ScalarT1>::value)
+ temporary_alpha = alpha;
+
+ unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);
+
+ value_type temporary_beta = 0;
+ if (viennacl::is_cpu_scalar<ScalarT2>::value)
+ temporary_beta = beta;
+
+
+ avbv_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+ viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+ options_alpha,
+ viennacl::cuda_arg(vec2),
+ static_cast<unsigned int>(viennacl::traits::start(vec2)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec2)),
+
+ viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+ options_beta,
+ viennacl::cuda_arg(vec3),
+ static_cast<unsigned int>(viennacl::traits::start(vec3)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec3)) );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("avbv_kernel");
+}
+
+
+////////////////////////// avbv_v //////////////////////////////////////
+
+
+// alpha and beta on GPU
+template<typename NumericT>
+__global__ void avbv_v_kernel(NumericT * vec1,
+ unsigned int start1,
+ unsigned int inc1,
+ unsigned int size1,
+
+ const NumericT * fac2,
+ unsigned int options2,
+ const NumericT * vec2,
+ unsigned int start2,
+ unsigned int inc2,
+
+ const NumericT * fac3,
+ unsigned int options3,
+ const NumericT * vec3,
+ unsigned int start3,
+ unsigned int inc3)
+{
+ NumericT alpha = *fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+
+ NumericT beta = *fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+
+ if (options2 & (1 << 1))
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
+ }
+ else
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
+ }
+ }
+ else
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
+ }
+ else
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
+ }
+ }
+}
+
+// alpha on CPU, beta on GPU
+template<typename NumericT>
+__global__ void avbv_v_kernel(NumericT * vec1,
+ unsigned int start1,
+ unsigned int inc1,
+ unsigned int size1,
+
+ NumericT fac2,
+ unsigned int options2,
+ const NumericT * vec2,
+ unsigned int start2,
+ unsigned int inc2,
+
+ const NumericT * fac3,
+ unsigned int options3,
+ const NumericT * vec3,
+ unsigned int start3,
+ unsigned int inc3)
+{
+ NumericT alpha = fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+
+ NumericT beta = *fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+
+ if (options2 & (1 << 1))
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
+ }
+ else
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
+ }
+ }
+ else
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
+ }
+ else
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
+ }
+ }
+}
+
+// alpha on GPU, beta on CPU
+template<typename NumericT>
+__global__ void avbv_v_kernel(NumericT * vec1,
+ unsigned int start1,
+ unsigned int inc1,
+ unsigned int size1,
+
+ const NumericT * fac2,
+ unsigned int options2,
+ const NumericT * vec2,
+ unsigned int start2,
+ unsigned int inc2,
+
+ NumericT fac3,
+ unsigned int options3,
+ const NumericT * vec3,
+ unsigned int start3,
+ unsigned int inc3)
+{
+ NumericT alpha = *fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+
+ NumericT beta = fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+
+ if (options2 & (1 << 1))
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
+ }
+ else
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
+ }
+ }
+ else
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
+ }
+ else
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
+ }
+ }
+}
+
+// alpha and beta on CPU
+template<typename NumericT>
+__global__ void avbv_v_kernel(NumericT * vec1,
+ unsigned int start1,
+ unsigned int inc1,
+ unsigned int size1,
+
+ NumericT fac2,
+ unsigned int options2,
+ const NumericT * vec2,
+ unsigned int start2,
+ unsigned int inc2,
+
+ NumericT fac3,
+ unsigned int options3,
+ const NumericT * vec3,
+ unsigned int start3,
+ unsigned int inc3)
+{
+ NumericT alpha = fac2;
+ if (options2 & (1 << 0))
+ alpha = -alpha;
+
+ NumericT beta = fac3;
+ if (options3 & (1 << 0))
+ beta = -beta;
+
+ if (options2 & (1 << 1))
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
+ }
+ else
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
+ }
+ }
+ else
+ {
+ if (options3 & (1 << 1))
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
+ }
+ else
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
+ }
+ }
+}
+
+
+template<typename NumericT, typename ScalarT1, typename ScalarT2>
+void avbv_v(vector_base<NumericT> & vec1,
+ vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+ vector_base<NumericT> const & vec3, ScalarT2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
+{
+ typedef NumericT value_type;
+
+ unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+ value_type data_alpha = alpha;
+ if (flip_sign_alpha)
+ data_alpha = -data_alpha;
+ if (reciprocal_alpha)
+ data_alpha = static_cast<value_type>(1) / data_alpha;
+
+ value_type temporary_alpha = 0;
+ if (viennacl::is_cpu_scalar<ScalarT1>::value)
+ temporary_alpha = alpha;
+
+ unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);
+
+ value_type temporary_beta = 0;
+ if (viennacl::is_cpu_scalar<ScalarT2>::value)
+ temporary_beta = beta;
+
+
+ avbv_v_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+ viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+ options_alpha,
+ viennacl::cuda_arg(vec2),
+ static_cast<unsigned int>(viennacl::traits::start(vec2)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec2)),
+
+ viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+ options_beta,
+ viennacl::cuda_arg(vec3),
+ static_cast<unsigned int>(viennacl::traits::start(vec3)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec3)) );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("avbv_v_kernel");
+}
+
+
+//////////////////////////
+
+template<typename NumericT>
+__global__ void vector_assign_kernel(NumericT * vec1,
+ unsigned int start1,
+ unsigned int inc1,
+ unsigned int size1,
+ unsigned int internal_size1,
+
+ NumericT alpha)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = (i < size1) ? alpha : 0;
+}
+
+/** @brief Assign a constant value to a vector (-range/-slice)
+*
+* @param vec1 The vector to which the value should be assigned
+* @param alpha The value to be assigned
+* @param up_to_internal_size Specifies whether alpha should also be written to padded memory (mostly used for clearing the whole buffer).
+*/
+template<typename NumericT, typename ScalarT1>
+void vector_assign(vector_base<NumericT> & vec1, ScalarT1 const & alpha, bool up_to_internal_size = false)
+{
+ typedef NumericT value_type;
+
+ value_type temporary_alpha = 0;
+ if (viennacl::is_cpu_scalar<ScalarT1>::value)
+ temporary_alpha = alpha;
+
+ unsigned int size = up_to_internal_size ? static_cast<unsigned int>(vec1.internal_size()) : static_cast<unsigned int>(viennacl::traits::size(vec1));
+
+ vector_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ size,
+ static_cast<unsigned int>(vec1.internal_size()), //Note: Do NOT use traits::internal_size() here, because vector proxies don't require padding.
+
+ viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)) );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vector_assign_kernel");
+}
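
A hedged usage sketch: in a CUDA-enabled build (VIENNACL_WITH_CUDA), the generic viennacl::linalg::vector_assign() dispatcher is expected to forward to the CUDA backend above; passing up_to_internal_size = true also clears the padded tail of the buffer. Sketch only, not part of the commit:

    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/vector_operations.hpp"

    int main()
    {
      viennacl::vector<float> v(1000);
      viennacl::linalg::vector_assign(v, 3.14f);       // set every logical entry
      viennacl::linalg::vector_assign(v, 0.0f, true);  // also clear the padded tail
      return 0;
    }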
+
+//////////////////////////
+
+template<typename NumericT>
+__global__ void vector_swap_kernel(NumericT * vec1,
+ unsigned int start1,
+ unsigned int inc1,
+ unsigned int size1,
+
+ NumericT * vec2,
+ unsigned int start2,
+ unsigned int inc2)
+{
+ NumericT tmp;
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ {
+ tmp = vec2[i*inc2+start2];
+ vec2[i*inc2+start2] = vec1[i*inc1+start1];
+ vec1[i*inc1+start1] = tmp;
+ }
+}
+
+
+/** @brief Swaps the contents of two vectors, data is copied
+*
+* @param vec1 The first vector (or -range, or -slice)
+* @param vec2 The second vector (or -range, or -slice)
+*/
+template<typename NumericT>
+void vector_swap(vector_base<NumericT> & vec1, vector_base<NumericT> & vec2)
+{
+ vector_swap_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+ viennacl::cuda_arg(vec2),
+ static_cast<unsigned int>(viennacl::traits::start(vec2)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec2)) );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vector_swap_kernel");
+}
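
Similarly, a hedged sketch of swapping two device vectors through the generic dispatcher (assumption: viennacl::linalg::vector_swap routes to the CUDA backend above when the vectors live on a CUDA context; not part of the commit):

    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/vector_operations.hpp"

    int main()
    {
      viennacl::vector<double> a(64), b(64);
      viennacl::linalg::vector_swap(a, b);   // element-wise swap, data is copied on the device
      return 0;
    }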
+
+///////////////////////// Binary Elementwise operations /////////////
+
+template<typename NumericT>
+__global__ void element_op_kernel(NumericT * vec1,
+ unsigned int start1,
+ unsigned int inc1,
+ unsigned int size1,
+
+ NumericT const * vec2,
+ unsigned int start2,
+ unsigned int inc2,
+
+ NumericT const * vec3,
+ unsigned int start3,
+ unsigned int inc3,
+
+ unsigned int op_type
+ )
+{
+ if (op_type == 2)
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ {
+ vec1[i*inc1+start1] = pow(vec2[i*inc2+start2], vec3[i*inc3+start3]);
+ }
+ }
+ else if (op_type == 1)
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ {
+ vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3];
+ }
+ }
+ else if (op_type == 0)
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ {
+ vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3];
+ }
+ }
+}
+
+template<typename NumericT>
+__global__ void element_op_int_kernel(NumericT * vec1,
+ unsigned int start1,
+ unsigned int inc1,
+ unsigned int size1,
+
+ NumericT const * vec2,
+ unsigned int start2,
+ unsigned int inc2,
+
+ NumericT const * vec3,
+ unsigned int start3,
+ unsigned int inc3,
+
+ unsigned int op_type
+ )
+{
+ if (op_type == 1)
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ {
+ vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3];
+ }
+ }
+ else if (op_type == 0)
+ {
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < size1;
+ i += gridDim.x * blockDim.x)
+ {
+ vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3];
+ }
+ }
+}
+
+/** @brief Implementation of the element-wise operation v1 = v2 .* v3 and v1 = v2 ./ v3 (using MATLAB syntax)
+*
+* @param vec1 The result vector (or -range, or -slice)
+* @param proxy The proxy object holding v2, v3 and the operation
+*/
+template<typename NumericT, typename OpT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_binary<OpT> > const & proxy)
+{
+ unsigned int op_type = 2; //0: product, 1: division, 2: power
+ if (viennacl::is_division<OpT>::value)
+ op_type = 1;
+ else if (viennacl::is_product<OpT>::value)
+ op_type = 0;
+
+ element_op_int_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
+
+ viennacl::cuda_arg(proxy.rhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
+
+ op_type
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
+}
+
+template<typename OpT>
+void element_op(vector_base<float> & vec1,
+ vector_expression<const vector_base<float>, const vector_base<float>, op_element_binary<OpT> > const & proxy)
+{
+ unsigned int op_type = 2; //0: product, 1: division, 2: power
+ if (viennacl::is_division<OpT>::value)
+ op_type = 1;
+ else if (viennacl::is_product<OpT>::value)
+ op_type = 0;
+
+ element_op_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
+
+ viennacl::cuda_arg(proxy.rhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
+
+ op_type
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
+}
+
+template<typename OpT>
+void element_op(vector_base<double> & vec1,
+ vector_expression<const vector_base<double>, const vector_base<double>, op_element_binary<OpT> > const & proxy)
+{
+ unsigned int op_type = 2; //0: product, 1: division, 2: power
+ if (viennacl::is_division<OpT>::value)
+ op_type = 1;
+ else if (viennacl::is_product<OpT>::value)
+ op_type = 0;
+
+ element_op_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
+
+ viennacl::cuda_arg(proxy.rhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
+
+ op_type
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
+}
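
From user code, these binary element-wise kernels are typically reached through expression helpers rather than called directly. A hedged usage sketch, assuming the usual viennacl::linalg::element_prod / element_div free functions (not part of the commit):

    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/vector_operations.hpp"

    int main()
    {
      viennacl::vector<float> v1(32), v2(32), v3(32);
      v1 = viennacl::linalg::element_prod(v2, v3);  // v1 = v2 .* v3
      v1 = viennacl::linalg::element_div(v2, v3);   // v1 = v2 ./ v3
      return 0;
    }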
+
+///////////////////////// Unary Elementwise operations /////////////
+
+// Note: Trying to automate things with macros or template metaprogramming failed (preprocessor with nvcc did not work as expected), so this is terribly hand-rolled code
+// Question (Karl Rupp): Why is CUDA code always such a hassle when trying to use it in a library context?
+
+// acos
+template<typename NumericT>
+__global__ void vec_element_acos_kernel(
+ NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+ NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = acos(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_acos> > const & proxy)
+{
+ typedef NumericT value_type;
+
+ vec_element_acos_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_acos_kernel");
+}
+
+// asin
+template<typename NumericT>
+__global__ void vec_element_asin_kernel(
+ NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+ NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = asin(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_asin> > const & proxy)
+{
+ vec_element_asin_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_asin_kernel");
+}
+
+
+// atan
+template<typename NumericT>
+__global__ void vec_element_atan_kernel(
+ NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+ NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = atan(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_atan> > const & proxy)
+{
+ vec_element_atan_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_atan_kernel");
+}
+
+
+// ceil
+template<typename NumericT>
+__global__ void vec_element_ceil_kernel(
+ NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+ NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = ceil(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_ceil> > const & proxy)
+{
+ vec_element_ceil_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_ceil_kernel");
+}
+
+
+// cos
+template<typename NumericT>
+__global__ void vec_element_cos_kernel(
+ NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+ NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = cos(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_cos> > const & proxy)
+{
+ vec_element_cos_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_cos_kernel");
+}
+
+
+// cosh
+template<typename NumericT>
+__global__ void vec_element_cosh_kernel(
+ NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+ NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = cosh(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_cosh> > const & proxy)
+{
+ vec_element_cosh_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_cosh_kernel");
+}
+
+
+// exp
+template<typename NumericT>
+__global__ void vec_element_exp_kernel(
+ NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+ NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = exp(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_exp> > const & proxy)
+{
+ vec_element_exp_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_exp_kernel");
+}
+
+
+// fabs
+template<typename NumericT>
+__global__ void vec_element_fabs_kernel(
+ NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+ NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = fabs(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_fabs> > const & proxy)
+{
+ vec_element_fabs_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_fabs_kernel");
+}
+
+// abs
+template<typename NumericT>
+__global__ void vec_element_abs_kernel(
+ NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+ NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = abs(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_abs> > const & proxy)
+{
+ vec_element_abs_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_abs_kernel");
+}
+
+
+
+// floor
+template<typename NumericT>
+__global__ void vec_element_floor_kernel(
+ NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+ NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = floor(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_floor> > const & proxy)
+{
+ vec_element_floor_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_floor_kernel");
+}
+
+
+// log
+template<typename NumericT>
+__global__ void vec_element_log_kernel(
+ NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+ NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = log(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_log> > const & proxy)
+{
+ vec_element_log_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_log_kernel");
+}
+
+
+// log10
+template<typename NumericT>
+__global__ void vec_element_log10_kernel(
+ NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+ NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = log10(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_log10> > const & proxy)
+{
+ vec_element_log10_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_log10_kernel");
+}
+
+
+// sin
+template<typename NumericT>
+__global__ void vec_element_sin_kernel(
+ NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+ NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = sin(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_sin> > const & proxy)
+{
+ vec_element_sin_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sin_kernel");
+}
+
+
+// sinh
+template<typename NumericT>
+__global__ void vec_element_sinh_kernel(
+ NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+ NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = sinh(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_sinh> > const & proxy)
+{
+ vec_element_sinh_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sinh_kernel");
+}
+
+
+// sqrt
+template<typename NumericT>
+__global__ void vec_element_sqrt_kernel(
+ NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+ NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = sqrt(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_sqrt> > const & proxy)
+{
+ vec_element_sqrt_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sqrt_kernel");
+}
+
+
+// tan
+template<typename NumericT>
+__global__ void vec_element_tan_kernel(
+ NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+ NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = tan(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_tan> > const & proxy)
+{
+ vec_element_tan_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_tan_kernel");
+}
+
+
+// tanh
+template<typename NumericT>
+__global__ void vec_element_tanh_kernel(
+ NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+ NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = tanh(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_tanh> > const & proxy)
+{
+ vec_element_tanh_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_tanh_kernel");
+}
+
+
+
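The element_op() overloads above (one per unary math function) are the CUDA backend that the expression layer falls through to when an element-wise expression is assigned to a vector. A minimal usage sketch follows; it is not part of this diff and assumes the public element-wise helpers element_exp()/element_sqrt() live in viennacl::linalg (as in the ViennaCL 1.7 manual) and that the library was built with VIENNACL_WITH_CUDA:

#include <vector>
#include "viennacl/vector.hpp"

int main()
{
  std::vector<float> host(16, 0.5f);

  viennacl::vector<float> v_in(16), v_out(16);
  viennacl::copy(host, v_in);                      // host -> device transfer

  // Each assignment dispatches (via the expression templates) to one of the
  // element_op() overloads defined in this file when the CUDA backend is active.
  // Namespace of element_exp/element_sqrt is assumed from the ViennaCL manual.
  v_out = viennacl::linalg::element_exp(v_in);     // op_exp  -> vec_element_exp_kernel
  v_out = viennacl::linalg::element_sqrt(v_in);    // op_sqrt -> vec_element_sqrt_kernel

  viennacl::copy(v_out, host);                     // device -> host transfer
  return 0;
}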
+///////////////////////// Norms and inner product ///////////////////
+
+
+template<typename NumericT>
+__global__ void inner_prod_kernel(const NumericT * vec1,
+ unsigned int start1,
+ unsigned int inc1,
+ unsigned int size1,
+ const NumericT * vec2,
+ unsigned int start2,
+ unsigned int inc2,
+ unsigned int size2,
+ NumericT * group_buffer)
+{
+ __shared__ NumericT tmp_buffer[128];
+ unsigned int group_start1 = (blockIdx.x * size1) / (gridDim.x) * inc1 + start1;
+ unsigned int group_start2 = (blockIdx.x * size2) / (gridDim.x) * inc2 + start2;
+
+ unsigned int group_size1 = ((blockIdx.x + 1) * size1) / (gridDim.x)
+ - ( blockIdx.x * size1) / (gridDim.x);
+
+
+ NumericT tmp = 0;
+ for (unsigned int i = threadIdx.x; i < group_size1; i += blockDim.x)
+ tmp += vec1[i*inc1+group_start1] * vec2[i*inc2+group_start2];
+ tmp_buffer[threadIdx.x] = tmp;
+
+ // parallel reduction
+ for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
+ }
+
+ if (threadIdx.x == 0)
+ group_buffer[blockIdx.x] = tmp_buffer[0];
+
+}
+
+
+
+// sums the array 'vec1' and writes to result. Makes use of a single work-group only.
+template<typename NumericT>
+__global__ void vector_sum_kernel_floats(
+ const NumericT * vec1,
+ unsigned int start1,
+ unsigned int inc1,
+ unsigned int size1,
+ unsigned int option, //0: use fmax, 1: just sum, 2: sum and return sqrt of sum
+ NumericT * result)
+{
+ __shared__ NumericT tmp_buffer[128];
+ NumericT thread_sum = 0;
+ for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
+ {
+ if (option > 0)
+ thread_sum += vec1[i*inc1+start1];
+ else
+ thread_sum = fmax(thread_sum, fabs(vec1[i*inc1+start1]));
+ }
+
+ tmp_buffer[threadIdx.x] = thread_sum;
+
+ for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ {
+ if (option > 0)
+ tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
+ else
+ tmp_buffer[threadIdx.x] = fmax(tmp_buffer[threadIdx.x], tmp_buffer[threadIdx.x + stride]);
+ }
+ }
+
+ if (threadIdx.x == 0)
+ {
+ if (option == 2)
+ *result = sqrt(tmp_buffer[0]);
+ else
+ *result = tmp_buffer[0];
+ }
+}
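To make the 'option' switch concrete: the same kernel serves as the final reduction stage for max-of-absolute-values (option 0), plain sums (option 1, used by inner_prod_impl() further below) and sums followed by a square root (option 2). A small host-side sketch of the same semantics (illustration only; the real reduction runs in the kernel above):

#include <algorithm>
#include <cmath>
#include <cstddef>

template<typename T>
T reference_vector_sum(T const * v, std::size_t n, unsigned int option)
{
  T acc = 0;
  for (std::size_t i = 0; i < n; ++i)
    acc = (option > 0) ? acc + v[i]                                      // options 1 and 2: accumulate the sum
                       : std::max(acc, static_cast<T>(std::fabs(v[i]))); // option 0: track the maximum absolute value
  return (option == 2) ? std::sqrt(acc) : acc;                           // option 2: return sqrt of the sum
}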
+
+template<typename NumericT>
+__global__ void vector_sum_kernel_integers(
+ const NumericT * vec1,
+ unsigned int start1,
+ unsigned int inc1,
+ unsigned int size1,
+ unsigned int option, //0: use max, 1: just sum
+ NumericT * result)
+{
+ __shared__ NumericT tmp_buffer[128];
+ NumericT thread_sum = 0;
+ for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
+ {
+ if (option > 0)
+ thread_sum += vec1[i*inc1+start1];
+ else
+ thread_sum = thread_sum > abs(vec1[i*inc1+start1]) ? thread_sum : abs(vec1[i*inc1+start1]);
+ }
+
+ tmp_buffer[threadIdx.x] = thread_sum;
+
+ for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ {
+ if (option > 0)
+ tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
+ else
+ tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride];
+ }
+ }
+
+ if (threadIdx.x == 0)
+ *result = tmp_buffer[0];
+}
+
+template<typename NumericT>
+__global__ void vector_sum_kernel_unsigned_integers(
+ const NumericT * vec1,
+ unsigned int start1,
+ unsigned int inc1,
+ unsigned int size1,
+ unsigned int option, //0: use max, 1: just sum
+ NumericT * result)
+{
+ __shared__ NumericT tmp_buffer[128];
+ NumericT thread_sum = 0;
+ for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
+ {
+ if (option > 0)
+ thread_sum += vec1[i*inc1+start1];
+ else
+ thread_sum = (thread_sum > vec1[i*inc1+start1]) ? thread_sum : vec1[i*inc1+start1];
+ }
+
+ tmp_buffer[threadIdx.x] = thread_sum;
+
+ for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ {
+ if (option > 0)
+ tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
+ else
+ tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride];
+ }
+ }
+
+ if (threadIdx.x == 0)
+ *result = tmp_buffer[0];
+}
+
+namespace detail
+{
+ /** \cond */
+ struct vector_sum_kernel_launcher_integers
+ {
+ template<typename NumericT, typename ScalarT>
+ static void apply(vector_base<NumericT> const & temp,
+ unsigned int option,
+ ScalarT & result)
+ {
+ typedef NumericT value_type;
+ vector_sum_kernel_integers<<<1, 128>>>(viennacl::cuda_arg(temp),
+ static_cast<unsigned int>(viennacl::traits::start(temp)),
+ static_cast<unsigned int>(viennacl::traits::stride(temp)),
+ static_cast<unsigned int>(viennacl::traits::size(temp)),
+ static_cast<unsigned int>(option),
+ viennacl::cuda_arg(result) );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
+ }
+ };
+
+ struct vector_sum_kernel_launcher_unsigned_integers
+ {
+ template<typename NumericT, typename ScalarT>
+ static void apply(vector_base<NumericT> const & temp,
+ unsigned int option,
+ ScalarT & result)
+ {
+ typedef NumericT value_type;
+ vector_sum_kernel_unsigned_integers<<<1, 128>>>(viennacl::cuda_arg(temp),
+ static_cast<unsigned int>(viennacl::traits::start(temp)),
+ static_cast<unsigned int>(viennacl::traits::stride(temp)),
+ static_cast<unsigned int>(viennacl::traits::size(temp)),
+ static_cast<unsigned int>(option),
+ viennacl::cuda_arg(result) );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
+ }
+ };
+
+ struct vector_sum_kernel_launcher_floats
+ {
+ template<typename NumericT, typename ScalarT>
+ static void apply(vector_base<NumericT> const & temp,
+ unsigned int option,
+ ScalarT & result)
+ {
+ typedef NumericT value_type;
+ vector_sum_kernel_floats<<<1, 128>>>(viennacl::cuda_arg(temp),
+ static_cast<unsigned int>(viennacl::traits::start(temp)),
+ static_cast<unsigned int>(viennacl::traits::stride(temp)),
+ static_cast<unsigned int>(viennacl::traits::size(temp)),
+ static_cast<unsigned int>(option),
+ viennacl::cuda_arg(result) );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
+ }
+ };
+
+ template<typename NumericT>
+ struct vector_sum_kernel_launcher : public vector_sum_kernel_launcher_integers {};
+
+ template<>
+ struct vector_sum_kernel_launcher<unsigned char> : public vector_sum_kernel_launcher_unsigned_integers {};
+
+ template<>
+ struct vector_sum_kernel_launcher<unsigned short> : public vector_sum_kernel_launcher_unsigned_integers {};
+
+ template<>
+ struct vector_sum_kernel_launcher<unsigned int> : public vector_sum_kernel_launcher_unsigned_integers {};
+
+ template<>
+ struct vector_sum_kernel_launcher<unsigned long> : public vector_sum_kernel_launcher_unsigned_integers {};
+
+ template<>
+ struct vector_sum_kernel_launcher<float> : public vector_sum_kernel_launcher_floats {};
+
+ template<>
+ struct vector_sum_kernel_launcher<double> : public vector_sum_kernel_launcher_floats {};
+
+ /** \endcond */
+}
+
+
+//implementation of inner product:
+//namespace {
+/** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
+*
+* @param vec1 The first vector
+* @param vec2 The second vector
+* @param result The result scalar (on the gpu)
+*/
+template<typename NumericT, typename ScalarT>
+void inner_prod_impl(vector_base<NumericT> const & vec1,
+ vector_base<NumericT> const & vec2,
+ ScalarT & result)
+{
+ typedef NumericT value_type;
+
+ static const unsigned int work_groups = 128;
+ static viennacl::vector<value_type> temp(work_groups);
+
+ inner_prod_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(vec2),
+ static_cast<unsigned int>(viennacl::traits::start(vec2)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec2)),
+ static_cast<unsigned int>(viennacl::traits::size(vec2)),
+ viennacl::cuda_arg(temp)
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");
+
+ detail::vector_sum_kernel_launcher<NumericT>::apply(temp, 1, result);
+}
+
+
+/** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
+*
+* @param vec1 The first vector
+* @param vec2 The second vector
+* @param result The result scalar (on the host)
+*/
+template<typename NumericT>
+void inner_prod_cpu(vector_base<NumericT> const & vec1,
+ vector_base<NumericT> const & vec2,
+ NumericT & result)
+{
+ typedef NumericT value_type;
+
+ const unsigned int work_groups = 128;
+ viennacl::vector<value_type> temp(work_groups);
+
+ inner_prod_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(vec2),
+ static_cast<unsigned int>(viennacl::traits::start(vec2)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec2)),
+ static_cast<unsigned int>(viennacl::traits::size(vec2)),
+ viennacl::cuda_arg(temp)
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");
+
+ // Now copy partial results from GPU back to CPU and run reduction there:
+ std::vector<value_type> temp_cpu(work_groups);
+ viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
+
+ result = 0;
+ for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
+ result += *it;
+}
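Both overloads above sit behind the generic viennacl::linalg::inner_prod() frontend: assigning the result to a viennacl::scalar keeps it on the device (inner_prod_impl), while assigning it to a plain host value goes through inner_prod_cpu. A minimal usage sketch, not part of this diff (scalar_vector is assumed to be the usual ViennaCL vector initializer):

#include "viennacl/vector.hpp"
#include "viennacl/scalar.hpp"
#include "viennacl/linalg/inner_prod.hpp"

void inner_prod_demo()
{
  viennacl::vector<float> x = viennacl::scalar_vector<float>(1000, 2.0f);
  viennacl::vector<float> y = viennacl::scalar_vector<float>(1000, 3.0f);

  viennacl::scalar<float> s = viennacl::linalg::inner_prod(x, y); // result stays on the device -> inner_prod_impl()
  float                   h = viennacl::linalg::inner_prod(x, y); // result copied to the host  -> inner_prod_cpu()
  (void)s; (void)h;
}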
+
+///////////////////////////////////
+
+#define VIENNACL_MDOT_WORKGROUP_SIZE 128
+#define VIENNACL_MDOT_WORKGROUP_NUM 128
+// M = 2:
+template<typename NumericT>
+__global__ void inner_prod_2_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
+ const NumericT *y0, unsigned int start0, unsigned int stride0,
+ const NumericT *y1, unsigned int start1, unsigned int stride1,
+ NumericT *group_results)
+{
+ __shared__ NumericT tmp_buffer[2*VIENNACL_MDOT_WORKGROUP_SIZE];
+ unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
+ unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
+ unsigned int vec_stop_index = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond size of x
+
+ NumericT entry_x = 0;
+ NumericT group_sum0 = 0;
+ NumericT group_sum1 = 0;
+ for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
+ entry_x = x[i * stridex + startx]; // load only once from global memory!
+ group_sum0 += entry_x * y0[i * stride0 + start0];
+ group_sum1 += entry_x * y1[i * stride1 + start1];
+ }
+ tmp_buffer[threadIdx.x] = group_sum0;
+ tmp_buffer[threadIdx.x + blockDim.x] = group_sum1;
+
+ // parallel reduction
+ for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (threadIdx.x < stride) {
+ tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
+ tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x];
+ }
+ }
+
+ // write result of group to group_results
+ if (threadIdx.x == 0) {
+ group_results[blockIdx.x] = tmp_buffer[0];
+ group_results[blockIdx.x + gridDim.x] = tmp_buffer[blockDim.x];
+ }
+}
+
+// M = 3:
+template<typename NumericT>
+__global__ void inner_prod_3_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
+ const NumericT *y0, unsigned int start0, unsigned int stride0,
+ const NumericT *y1, unsigned int start1, unsigned int stride1,
+ const NumericT *y2, unsigned int start2, unsigned int stride2,
+ NumericT *group_results)
+{
+ __shared__ NumericT tmp_buffer[3*VIENNACL_MDOT_WORKGROUP_SIZE];
+ unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
+ unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
+ unsigned int vec_stop_index = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size
+
+ NumericT entry_x = 0;
+ NumericT group_sum0 = 0;
+ NumericT group_sum1 = 0;
+ NumericT group_sum2 = 0;
+ for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
+ entry_x = x[i * stridex + startx]; // load only once from global memory!
+ group_sum0 += entry_x * y0[i * stride0 + start0];
+ group_sum1 += entry_x * y1[i * stride1 + start1];
+ group_sum2 += entry_x * y2[i * stride2 + start2];
+ }
+ tmp_buffer[threadIdx.x] = group_sum0;
+ tmp_buffer[threadIdx.x + blockDim.x] = group_sum1;
+ tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
+
+ // parallel reduction
+ for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (threadIdx.x < stride) {
+ tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
+ tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x];
+ tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
+ }
+ }
+
+ // write result of group to group_results
+ if (threadIdx.x == 0) {
+ group_results[blockIdx.x ] = tmp_buffer[0];
+ group_results[blockIdx.x + gridDim.x] = tmp_buffer[ blockDim.x];
+ group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
+ }
+}
+
+// M = 4:
+template<typename NumericT>
+__global__ void inner_prod_4_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
+ const NumericT *y0, unsigned int start0, unsigned int stride0,
+ const NumericT *y1, unsigned int start1, unsigned int stride1,
+ const NumericT *y2, unsigned int start2, unsigned int stride2,
+ const NumericT *y3, unsigned int start3, unsigned int stride3,
+ NumericT *group_results)
+{
+ __shared__ NumericT tmp_buffer[4*VIENNACL_MDOT_WORKGROUP_SIZE];
+ unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
+ unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
+ unsigned int vec_stop_index = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size
+
+ NumericT entry_x = 0;
+ NumericT group_sum0 = 0;
+ NumericT group_sum1 = 0;
+ NumericT group_sum2 = 0;
+ NumericT group_sum3 = 0;
+ for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
+ entry_x = x[i * stridex + startx]; // load only once from global memory!
+ group_sum0 += entry_x * y0[i * stride0 + start0];
+ group_sum1 += entry_x * y1[i * stride1 + start1];
+ group_sum2 += entry_x * y2[i * stride2 + start2];
+ group_sum3 += entry_x * y3[i * stride3 + start3];
+ }
+ tmp_buffer[threadIdx.x] = group_sum0;
+ tmp_buffer[threadIdx.x + blockDim.x] = group_sum1;
+ tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
+ tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3;
+
+ // parallel reduction
+ for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (threadIdx.x < stride) {
+ tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
+ tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x];
+ tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
+ tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x];
+ }
+ }
+
+ // write result of group to group_results
+ if (threadIdx.x == 0) {
+ group_results[blockIdx.x ] = tmp_buffer[0];
+ group_results[blockIdx.x + gridDim.x] = tmp_buffer[ blockDim.x];
+ group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
+ group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x];
+ }
+}
+
+// M = 8:
+template<typename NumericT>
+__global__ void inner_prod_8_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
+ const NumericT *y0, unsigned int start0, unsigned int stride0,
+ const NumericT *y1, unsigned int start1, unsigned int stride1,
+ const NumericT *y2, unsigned int start2, unsigned int stride2,
+ const NumericT *y3, unsigned int start3, unsigned int stride3,
+ const NumericT *y4, unsigned int start4, unsigned int stride4,
+ const NumericT *y5, unsigned int start5, unsigned int stride5,
+ const NumericT *y6, unsigned int start6, unsigned int stride6,
+ const NumericT *y7, unsigned int start7, unsigned int stride7,
+ NumericT *group_results)
+{
+ __shared__ NumericT tmp_buffer[8*VIENNACL_MDOT_WORKGROUP_SIZE];
+ unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
+ unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
+ unsigned int vec_stop_index = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size
+
+ NumericT entry_x = 0;
+ NumericT group_sum0 = 0;
+ NumericT group_sum1 = 0;
+ NumericT group_sum2 = 0;
+ NumericT group_sum3 = 0;
+ NumericT group_sum4 = 0;
+ NumericT group_sum5 = 0;
+ NumericT group_sum6 = 0;
+ NumericT group_sum7 = 0;
+ for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
+ entry_x = x[i * stridex + startx]; // load only once from global memory!
+ group_sum0 += entry_x * y0[i * stride0 + start0];
+ group_sum1 += entry_x * y1[i * stride1 + start1];
+ group_sum2 += entry_x * y2[i * stride2 + start2];
+ group_sum3 += entry_x * y3[i * stride3 + start3];
+ group_sum4 += entry_x * y4[i * stride4 + start4];
+ group_sum5 += entry_x * y5[i * stride5 + start5];
+ group_sum6 += entry_x * y6[i * stride6 + start6];
+ group_sum7 += entry_x * y7[i * stride7 + start7];
+ }
+ tmp_buffer[threadIdx.x] = group_sum0;
+ tmp_buffer[threadIdx.x + blockDim.x] = group_sum1;
+ tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
+ tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3;
+ tmp_buffer[threadIdx.x + 4 * blockDim.x] = group_sum4;
+ tmp_buffer[threadIdx.x + 5 * blockDim.x] = group_sum5;
+ tmp_buffer[threadIdx.x + 6 * blockDim.x] = group_sum6;
+ tmp_buffer[threadIdx.x + 7 * blockDim.x] = group_sum7;
+
+ // parallel reduction
+ for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (threadIdx.x < stride) {
+ tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
+ tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x];
+ tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
+ tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x];
+ tmp_buffer[threadIdx.x + 4 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 4 * blockDim.x];
+ tmp_buffer[threadIdx.x + 5 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 5 * blockDim.x];
+ tmp_buffer[threadIdx.x + 6 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 6 * blockDim.x];
+ tmp_buffer[threadIdx.x + 7 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 7 * blockDim.x];
+ }
+ }
+
+ // write result of group to group_results
+ if (threadIdx.x == 0) {
+ group_results[blockIdx.x ] = tmp_buffer[0];
+ group_results[blockIdx.x + gridDim.x] = tmp_buffer[ blockDim.x];
+ group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
+ group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x];
+ group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * blockDim.x];
+ group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * blockDim.x];
+ group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * blockDim.x];
+ group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * blockDim.x];
+ }
+}
+
+// Sums each block of VIENNACL_MDOT_WORKGROUP_SIZE partial results in 'vec1' (one block per inner product) and writes the block sums to 'result'.

+template<typename NumericT>
+__global__ void vector_multi_sum_kernel(
+ NumericT const * vec1,
+ NumericT * result,
+ unsigned int start_result,
+ unsigned int inc_result)
+{
+ __shared__ NumericT tmp_buffer[VIENNACL_MDOT_WORKGROUP_SIZE];
+
+ tmp_buffer[threadIdx.x] = vec1[threadIdx.x + blockIdx.x * VIENNACL_MDOT_WORKGROUP_SIZE];
+
+ for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
+ }
+
+ if (threadIdx.x == 0)
+ result[start_result + inc_result * blockIdx.x] = tmp_buffer[0];
+}
+
+template<typename NumericT>
+void inner_prod_impl(vector_base<NumericT> const & x,
+ vector_tuple<NumericT> const & vec_tuple,
+ vector_base<NumericT> & result)
+{
+ typedef NumericT value_type;
+
+ static viennacl::vector<value_type> temp(8 * VIENNACL_MDOT_WORKGROUP_NUM);
+
+ vcl_size_t current_index = 0;
+ while (vec_tuple.const_size() > current_index)
+ {
+ switch (vec_tuple.const_size() - current_index)
+ {
+ case 7:
+ case 6:
+ case 5:
+ case 4:
+ {
+ vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index);
+ vector_base<NumericT> const & y1 = vec_tuple.const_at(current_index + 1);
+ vector_base<NumericT> const & y2 = vec_tuple.const_at(current_index + 2);
+ vector_base<NumericT> const & y3 = vec_tuple.const_at(current_index + 3);
+
+ inner_prod_4_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM,
+ VIENNACL_MDOT_WORKGROUP_SIZE>>>( viennacl::cuda_arg(x),
+ static_cast<unsigned int>(viennacl::traits::start(x)),
+ static_cast<unsigned int>(viennacl::traits::stride(x)),
+ static_cast<unsigned int>(viennacl::traits::size(x)),
+ viennacl::cuda_arg(y0),
+ static_cast<unsigned int>(viennacl::traits::start(y0)),
+ static_cast<unsigned int>(viennacl::traits::stride(y0)),
+ viennacl::cuda_arg(y1),
+ static_cast<unsigned int>(viennacl::traits::start(y1)),
+ static_cast<unsigned int>(viennacl::traits::stride(y1)),
+ viennacl::cuda_arg(y2),
+ static_cast<unsigned int>(vienna
<TRUNCATED>
[14/51] [partial] mahout git commit: (nojira) add native-viennaCL
module to codebase. closes apache/mahout#241
Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/norm_inf.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/norm_inf.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/norm_inf.hpp
new file mode 100644
index 0000000..959bbd8
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/norm_inf.hpp
@@ -0,0 +1,108 @@
+#ifndef VIENNACL_LINALG_NORM_INF_HPP_
+#define VIENNACL_LINALG_NORM_INF_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file norm_inf.hpp
+ @brief Generic interface for the l^infty-norm. See viennacl/linalg/vector_operations.hpp for implementations.
+*/
+
+#include <cmath>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/tag_of.hpp"
+
+namespace viennacl
+{
+ //
+ // generic norm_inf function
+ // uses tag dispatch to identify which algorithm
+ // should be called
+ //
+ namespace linalg
+ {
+
+ #ifdef VIENNACL_WITH_UBLAS
+ // ----------------------------------------------------
+ // UBLAS
+ //
+ template< typename VectorT >
+ typename viennacl::enable_if< viennacl::is_ublas< typename viennacl::traits::tag_of< VectorT >::type >::value,
+ typename VectorT::value_type
+ >::type
+ norm_inf(VectorT const& v1)
+ {
+ return boost::numeric::ublas::norm_inf(v1);
+ }
+ #endif
+
+
+ // ----------------------------------------------------
+ // STL
+ //
+ template< typename T, typename A >
+ T norm_inf(std::vector<T, A> const & v1)
+ {
+ //std::cout << "stl .. " << std::endl;
+ T result = 0;
+ for (typename std::vector<T, A>::size_type i=0; i<v1.size(); ++i)
+ {
+ if (std::fabs(v1[i]) > result)
+ result = std::fabs(v1[i]);
+ }
+
+ return result;
+ }
+
+ // ----------------------------------------------------
+ // VIENNACL
+ //
+ template< typename ScalarType>
+ viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+ const viennacl::vector_base<ScalarType>,
+ viennacl::op_norm_inf >
+ norm_inf(viennacl::vector_base<ScalarType> const & v1)
+ {
+ //std::cout << "viennacl .. " << std::endl;
+ return viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+ const viennacl::vector_base<ScalarType>,
+ viennacl::op_norm_inf >(v1, v1);
+ }
+
+ // with vector expression:
+ template<typename LHS, typename RHS, typename OP>
+ viennacl::scalar_expression<const viennacl::vector_expression<const LHS, const RHS, OP>,
+ const viennacl::vector_expression<const LHS, const RHS, OP>,
+ viennacl::op_norm_inf>
+ norm_inf(viennacl::vector_expression<const LHS, const RHS, OP> const & vector)
+ {
+ return viennacl::scalar_expression< const viennacl::vector_expression<const LHS, const RHS, OP>,
+ const viennacl::vector_expression<const LHS, const RHS, OP>,
+ viennacl::op_norm_inf >(vector, vector);
+ }
+
+
+ } // end namespace linalg
+} // end namespace viennacl
+#endif
+
+
+
+
+
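The norm_inf() overloads above dispatch purely on the argument type: ublas vectors (if enabled) forward to boost, std::vector gets the plain loop, and ViennaCL vectors or expressions return a scalar_expression that the active backend evaluates on assignment. A short usage sketch, not part of this diff:

#include <vector>
#include "viennacl/vector.hpp"
#include "viennacl/linalg/norm_inf.hpp"

void norm_inf_demo()
{
  std::vector<double> v_host(100, -0.25);
  v_host[42] = 7.0;

  viennacl::vector<double> v_dev(100);
  viennacl::copy(v_host, v_dev);

  double n_host = viennacl::linalg::norm_inf(v_host); // STL overload: plain loop on the host, yields 7.0
  double n_dev  = viennacl::linalg::norm_inf(v_dev);  // ViennaCL overload: expression evaluated by the backend
  (void)n_host; (void)n_dev;
}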
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/amg_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/amg_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/amg_operations.hpp
new file mode 100644
index 0000000..7cdcf89
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/amg_operations.hpp
@@ -0,0 +1,458 @@
+#ifndef VIENNACL_LINALG_OPENCL_AMG_OPERATIONS_HPP
+#define VIENNACL_LINALG_OPENCL_AMG_OPERATIONS_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file opencl/amg_operations.hpp
+ @brief Implementations of routines for AMG in OpenCL.
+*/
+
+#include <cstdlib>
+#include <cmath>
+#include <map>
+
+#include "viennacl/linalg/detail/amg/amg_base.hpp"
+#include "viennacl/linalg/opencl/common.hpp"
+#include "viennacl/linalg/opencl/kernels/amg.hpp"
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace amg
+{
+
+
+///////////////////////////////////////////
+
+/** @brief Routine for taking all connections in the matrix as strong */
+template<typename NumericT>
+void amg_influence_trivial(compressed_matrix<NumericT> const & A,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ (void)tag;
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::amg<NumericT>::init(ctx);
+ viennacl::ocl::kernel & influence_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_influence_trivial");
+
+ viennacl::ocl::enqueue(influence_kernel(A.handle1().opencl_handle(), A.handle2().opencl_handle(),
+ cl_uint(A.size1()),
+ cl_uint(A.nnz()),
+ viennacl::traits::opencl_handle(amg_context.influence_jumper_),
+ viennacl::traits::opencl_handle(amg_context.influence_ids_),
+ viennacl::traits::opencl_handle(amg_context.influence_values_)
+ )
+ );
+}
+
+
+/** @brief Routine for extracting strongly connected points considering a user-provided threshold value */
+template<typename NumericT>
+void amg_influence_advanced(compressed_matrix<NumericT> const & A,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ (void)A; (void)amg_context; (void)tag;
+ throw std::runtime_error("amg_influence_advanced() not implemented for OpenCL yet");
+}
+
+
+/** @brief Dispatcher for influence processing */
+template<typename NumericT>
+void amg_influence(compressed_matrix<NumericT> const & A,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ // TODO: dispatch based on influence tolerance provided
+ amg_influence_trivial(A, amg_context, tag);
+}
+
+
+
+/** @brief Assign IDs to coarse points.
+*
+* TODO: Use exclusive_scan on GPU for this.
+*/
+inline void enumerate_coarse_points(viennacl::linalg::detail::amg::amg_level_context & amg_context)
+{
+ viennacl::backend::typesafe_host_array<unsigned int> point_types(amg_context.point_types_.handle(), amg_context.point_types_.size());
+ viennacl::backend::typesafe_host_array<unsigned int> coarse_ids(amg_context.coarse_id_.handle(), amg_context.coarse_id_.size());
+ viennacl::backend::memory_read(amg_context.point_types_.handle(), 0, point_types.raw_size(), point_types.get());
+ viennacl::backend::memory_read(amg_context.coarse_id_.handle(), 0, coarse_ids.raw_size(), coarse_ids.get());
+
+ unsigned int coarse_id = 0;
+ for (std::size_t i=0; i<amg_context.point_types_.size(); ++i)
+ {
+ coarse_ids.set(i, coarse_id);
+ if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
+ ++coarse_id;
+ }
+
+ amg_context.num_coarse_ = coarse_id;
+
+ viennacl::backend::memory_write(amg_context.coarse_id_.handle(), 0, coarse_ids.raw_size(), coarse_ids.get());
+}
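The enumeration above is an exclusive prefix sum over the coarse-point indicator, which is why the comment suggests exclusive_scan as a GPU replacement. A worked example of what the host loop produces (C = coarse, F = fine):

// point_types_ : C  F  C  C  F
// coarse_id_   : 0  1  1  2  3   (each point receives the number of coarse points before it)
// num_coarse_  : 3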
+
+
+//////////////////////////////////////
+
+
+
+/** @brief AG (aggregation based) coarsening, OpenCL version of stage 1 (MIS-2 aggregation)
+*
+* @param A Operator matrix on all levels
+* @param amg_context AMG hierarchy datastructures
+* @param tag AMG preconditioner tag
+*/
+template<typename NumericT>
+void amg_coarse_ag_stage1_mis2(compressed_matrix<NumericT> const & A,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ (void)tag;
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::amg<NumericT>::init(ctx);
+
+ viennacl::vector<unsigned int> random_weights(A.size1(), viennacl::context(viennacl::MAIN_MEMORY));
+ unsigned int *random_weights_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(random_weights.handle());
+ for (std::size_t i=0; i<random_weights.size(); ++i)
+ random_weights_ptr[i] = static_cast<unsigned int>(rand()) % static_cast<unsigned int>(A.size1());
+ random_weights.switch_memory_context(viennacl::traits::context(A));
+
+ // work vectors:
+ viennacl::vector<unsigned int> work_state(A.size1(), viennacl::traits::context(A));
+ viennacl::vector<unsigned int> work_random(A.size1(), viennacl::traits::context(A));
+ viennacl::vector<unsigned int> work_index(A.size1(), viennacl::traits::context(A));
+
+ viennacl::vector<unsigned int> work_state2(A.size1(), viennacl::traits::context(A));
+ viennacl::vector<unsigned int> work_random2(A.size1(), viennacl::traits::context(A));
+ viennacl::vector<unsigned int> work_index2(A.size1(), viennacl::traits::context(A));
+
+ unsigned int num_undecided = static_cast<unsigned int>(A.size1());
+ viennacl::vector<unsigned int> undecided_buffer(256, viennacl::traits::context(A));
+ viennacl::backend::typesafe_host_array<unsigned int> undecided_buffer_host(undecided_buffer.handle(), undecided_buffer.size());
+
+ viennacl::ocl::kernel & init_workdata_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_pmis2_init_workdata");
+ viennacl::ocl::kernel & max_neighborhood_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_pmis2_max_neighborhood");
+ viennacl::ocl::kernel & mark_mis_nodes_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_pmis2_mark_mis_nodes");
+ viennacl::ocl::kernel & reset_state_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_pmis2_reset_state");
+
+ unsigned int pmis_iters = 0;
+ while (num_undecided > 0)
+ {
+ ++pmis_iters;
+
+ //
+ // init temporary work data:
+ //
+ viennacl::ocl::enqueue(init_workdata_kernel(work_state, work_random, work_index,
+ amg_context.point_types_,
+ random_weights,
+ cl_uint(A.size1())
+ )
+ );
+
+ //
+ // Propagate maximum tuple twice
+ //
+ for (unsigned int r = 0; r < 2; ++r)
+ {
+ // max operation
+ viennacl::ocl::enqueue(max_neighborhood_kernel(work_state, work_random, work_index,
+ work_state2, work_random2, work_index2,
+ amg_context.influence_jumper_, amg_context.influence_ids_,
+ cl_uint(A.size1())
+ )
+ );
+
+ // copy work array (can be fused into a single kernel if needed. Previous kernel is in most cases sufficiently heavy)
+ work_state = work_state2;
+ work_random = work_random2;
+ work_index = work_index2;
+ }
+
+ //
+ // mark MIS and non-MIS nodes:
+ //
+ viennacl::ocl::enqueue(mark_mis_nodes_kernel(work_state, work_index,
+ amg_context.point_types_,
+ undecided_buffer,
+ cl_uint(A.size1())
+ )
+ );
+
+ // get number of undecided points on host:
+ viennacl::backend::memory_read(undecided_buffer.handle(), 0, undecided_buffer_host.raw_size(), undecided_buffer_host.get());
+ num_undecided = 0;
+ for (std::size_t i=0; i<undecided_buffer.size(); ++i)
+ num_undecided += undecided_buffer_host[i];
+
+ } //while
+
+ viennacl::ocl::enqueue(reset_state_kernel(amg_context.point_types_, cl_uint(amg_context.point_types_.size()) ) );
+}
+
+
+
+/** @brief AG (aggregation based) coarsening. Partially single-threaded version (VIENNACL_AMG_COARSE_AG)
+*
+* @param A Operator matrix
+* @param amg_context AMG hierarchy datastructures
+* @param tag AMG preconditioner tag
+*/
+template<typename NumericT>
+void amg_coarse_ag(compressed_matrix<NumericT> const & A,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::amg<NumericT>::init(ctx);
+
+ amg_influence_trivial(A, amg_context, tag);
+
+ //
+ // Stage 1: Build aggregates:
+ //
+ if (tag.get_coarsening_method() == viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION)
+ amg_coarse_ag_stage1_mis2(A, amg_context, tag);
+ else
+ throw std::runtime_error("Only MIS2 coarsening implemented. Selected coarsening not available with OpenCL backend!");
+
+ viennacl::linalg::opencl::amg::enumerate_coarse_points(amg_context);
+
+ //
+ // Stage 2: Propagate coarse aggregate indices to neighbors:
+ //
+ viennacl::ocl::kernel & propagate_coarse_indices = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_agg_propagate_coarse_indices");
+ viennacl::ocl::enqueue(propagate_coarse_indices(amg_context.point_types_,
+ amg_context.coarse_id_,
+ amg_context.influence_jumper_,
+ amg_context.influence_ids_,
+ cl_uint(A.size1())
+ )
+ );
+
+ //
+  // Stage 3: Merge remaining undecided points (merging to the first aggregate found when cycling over the hierarchy)
+ //
+ viennacl::ocl::kernel & merge_undecided = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_agg_merge_undecided");
+ viennacl::ocl::enqueue(merge_undecided(amg_context.point_types_,
+ amg_context.coarse_id_,
+ amg_context.influence_jumper_,
+ amg_context.influence_ids_,
+ cl_uint(A.size1())
+ )
+ );
+
+ //
+ // Stage 4: Set undecided points to fine points (coarse ID already set in Stage 3)
+ // Note: Stage 3 and Stage 4 were initially fused, but are now split in order to avoid race conditions (or a fallback to sequential execution).
+ //
+ viennacl::ocl::kernel & merge_undecided_2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_agg_merge_undecided_2");
+ viennacl::ocl::enqueue(merge_undecided_2(amg_context.point_types_, cl_uint(A.size1()) ) );
+
+}
+
+
+
+
+/** @brief Calls the right coarsening procedure
+*
+* @param A Operator matrix on all levels
+* @param amg_context AMG hierarchy datastructures
+* @param tag AMG preconditioner tag
+*/
+template<typename InternalT1>
+void amg_coarse(InternalT1 & A,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ switch (tag.get_coarsening_method())
+ {
+ case viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION: amg_coarse_ag(A, amg_context, tag); break;
+ default: throw std::runtime_error("not implemented yet");
+ }
+}
+
+
+
+
+////////////////////////////////////// Interpolation /////////////////////////////
+
+
+/** @brief AG (aggregation based) interpolation. Multi-Threaded! (VIENNACL_INTERPOL_AG)
+ *
+ * @param A Operator matrix
+ * @param P Prolongation matrix
+ * @param amg_context AMG hierarchy datastructures
+ * @param tag AMG configuration tag
+*/
+template<typename NumericT>
+void amg_interpol_ag(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & P,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::amg<NumericT>::init(ctx);
+
+ (void)tag;
+ P = compressed_matrix<NumericT>(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A));
+
+ // build matrix here
+ viennacl::ocl::kernel & interpolate_ag = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_interpol_ag");
+ viennacl::ocl::enqueue(interpolate_ag(P.handle1().opencl_handle(),
+ P.handle2().opencl_handle(),
+ P.handle().opencl_handle(),
+ amg_context.coarse_id_,
+ cl_uint(A.size1())
+ )
+ );
+
+ P.generate_row_block_information();
+}
+
+/** @brief Smoothed aggregation interpolation. (VIENNACL_INTERPOL_SA)
+ *
+ * @param A Operator matrix
+ * @param P Prolongation matrix
+ * @param amg_context AMG hierarchy datastructures
+ * @param tag AMG configuration tag
+*/
+template<typename NumericT>
+void amg_interpol_sa(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & P,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::amg<NumericT>::init(ctx);
+
+ (void)tag;
+ viennacl::compressed_matrix<NumericT> P_tentative(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A));
+
+ // form tentative operator:
+ amg_interpol_ag(A, P_tentative, amg_context, tag);
+
+ viennacl::compressed_matrix<NumericT> Jacobi(A.size1(), A.size1(), A.nnz(), viennacl::traits::context(A));
+
+ viennacl::ocl::kernel & interpol_sa = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_interpol_sa");
+ viennacl::ocl::enqueue(interpol_sa(A.handle1().opencl_handle(),
+ A.handle2().opencl_handle(),
+ A.handle().opencl_handle(),
+ cl_uint(A.size1()),
+ cl_uint(A.nnz()),
+ Jacobi.handle1().opencl_handle(),
+ Jacobi.handle2().opencl_handle(),
+ Jacobi.handle().opencl_handle(),
+ NumericT(tag.get_jacobi_weight())
+ )
+ );
+
+ P = viennacl::linalg::prod(Jacobi, P_tentative);
+
+ P.generate_row_block_information();
+}
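For context, a sketch of the construction this function implements, assuming the amg_interpol_sa kernel assembles the usual damped Jacobi smoother from A and the configured weight:

// Smoothed-aggregation prolongation (standard formulation):
//
//   P = (I - omega * D^{-1} * A) * P_tentative,   D = diag(A),  omega = tag.get_jacobi_weight()
//
// i.e. the viennacl::linalg::prod(Jacobi, P_tentative) call above applies one damped Jacobi
// smoothing step to the tentative (aggregation) prolongator built by amg_interpol_ag().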
+
+/** @brief Dispatcher for building the interpolation matrix
+ *
+ * @param A Operator matrix
+ * @param P Prolongation matrix
+ * @param amg_context AMG hierarchy datastructures
+ * @param tag AMG configuration tag
+*/
+template<typename MatrixT>
+void amg_interpol(MatrixT const & A,
+ MatrixT & P,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ switch (tag.get_interpolation_method())
+ {
+ case viennacl::linalg::AMG_INTERPOLATION_METHOD_AGGREGATION: amg_interpol_ag (A, P, amg_context, tag); break;
+ case viennacl::linalg::AMG_INTERPOLATION_METHOD_SMOOTHED_AGGREGATION: amg_interpol_sa (A, P, amg_context, tag); break;
+ default: throw std::runtime_error("Not implemented yet!");
+ }
+}
+
+/** Assign sparse matrix A to dense matrix B */
+template<typename NumericT, unsigned int AlignmentV>
+void assign_to_dense(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
+ viennacl::matrix_base<NumericT> & B)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::init(ctx);
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::program_name(),
+ "assign_to_dense");
+
+ viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
+ viennacl::traits::opencl_handle(B),
+ cl_uint(viennacl::traits::start1(B)), cl_uint(viennacl::traits::start2(B)),
+ cl_uint(viennacl::traits::stride1(B)), cl_uint(viennacl::traits::stride2(B)),
+ cl_uint(viennacl::traits::size1(B)), cl_uint(viennacl::traits::size2(B)),
+ cl_uint(viennacl::traits::internal_size1(B)), cl_uint(viennacl::traits::internal_size2(B)) ));
+
+}
+
+/** @brief Jacobi Smoother (OpenCL version)
+*
+* @param iterations Number of smoother iterations
+* @param A Operator matrix for the smoothing
+* @param x The vector smoothing is applied to
+* @param x_backup (Different) Vector holding the same values as x
+* @param rhs_smooth The right hand side of the equation for the smoother
+* @param weight Damping factor. 0: No effect of smoother. 1: Undamped Jacobi iteration
+*/
+template<typename NumericT>
+void smooth_jacobi(unsigned int iterations,
+ compressed_matrix<NumericT> const & A,
+ vector<NumericT> & x,
+ vector<NumericT> & x_backup,
+ vector<NumericT> const & rhs_smooth,
+ NumericT weight)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(x).context());
+ viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::init(ctx);
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::program_name(), "jacobi");
+
+ for (unsigned int i=0; i<iterations; ++i)
+ {
+ x_backup = x;
+
+ viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
+ static_cast<NumericT>(weight),
+ viennacl::traits::opencl_handle(x_backup),
+ viennacl::traits::opencl_handle(x),
+ viennacl::traits::opencl_handle(rhs_smooth),
+ static_cast<cl_uint>(rhs_smooth.size())));
+
+ }
+}
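Each pass of the loop above is one damped (weighted) Jacobi sweep, i.e. x_new = x_old + weight * D^{-1} * (rhs - A * x_old) with D = diag(A). A host-side reference of a single sweep on a CSR matrix (illustration only; the OpenCL "jacobi" kernel enqueued above is the authoritative implementation):

#include <cstddef>
#include <vector>

// One damped Jacobi sweep on a CSR matrix.
template<typename T>
void jacobi_sweep_csr(std::vector<std::size_t> const & row_ptr,
                      std::vector<std::size_t> const & col_idx,
                      std::vector<T> const & values,
                      std::vector<T> const & rhs,
                      std::vector<T> const & x_old,
                      std::vector<T>       & x_new,
                      T weight)
{
  for (std::size_t row = 0; row + 1 < row_ptr.size(); ++row)
  {
    T off_diag_sum = 0;
    T diag = 1;
    for (std::size_t k = row_ptr[row]; k < row_ptr[row + 1]; ++k)
    {
      if (col_idx[k] == row) diag = values[k];
      else                   off_diag_sum += values[k] * x_old[col_idx[k]];
    }
    // algebraically equal to x_old[row] + weight * (rhs[row] - (A*x_old)[row]) / diag
    x_new[row] = weight * (rhs[row] - off_diag_sum) / diag + (T(1) - weight) * x_old[row];
  }
}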
+
+
+} //namespace amg
+} //namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/bisect_kernel_calls.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/bisect_kernel_calls.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/bisect_kernel_calls.hpp
new file mode 100644
index 0000000..2fcd6fa
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/bisect_kernel_calls.hpp
@@ -0,0 +1,177 @@
+#ifndef VIENNACL_LINALG_OPENCL_BISECT_KERNEL_CALLS_HPP_
+#define VIENNACL_LINALG_OPENCL_BISECT_KERNEL_CALLS_HPP_
+
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/opencl/bisect_kernel_calls.hpp
+ @brief OpenCL kernel calls for the bisection algorithm
+
+ Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+ the creation of derivative works is allowed by including the following statement:
+ "This software contains source code provided by NVIDIA Corporation."
+*/
+
+// includes, project
+#include "viennacl/linalg/opencl/kernels/bisect.hpp"
+#include "viennacl/linalg/detail/bisect/structs.hpp"
+#include "viennacl/linalg/detail/bisect/config.hpp"
+#include "viennacl/linalg/detail/bisect/util.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+const std::string BISECT_KERNEL_SMALL = "bisectKernelSmall";
+const std::string BISECT_KERNEL_LARGE = "bisectKernelLarge";
+const std::string BISECT_KERNEL_LARGE_ONE_INTERVALS = "bisectKernelLarge_OneIntervals";
+const std::string BISECT_KERNEL_LARGE_MULT_INTERVALS = "bisectKernelLarge_MultIntervals";
+
+template<typename NumericT>
+void bisectSmall(const viennacl::linalg::detail::InputData<NumericT> &input,
+ viennacl::linalg::detail::ResultDataSmall<NumericT> &result,
+ const unsigned int mat_size,
+ const NumericT lg, const NumericT ug,
+ const NumericT precision)
+ {
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input.g_a).context());
+ viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::init(ctx);
+
+ viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::program_name(), BISECT_KERNEL_SMALL);
+ kernel.global_work_size(0, 1 * VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX);
+ kernel.local_work_size(0, VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX);
+
+ viennacl::ocl::enqueue(kernel(viennacl::traits::opencl_handle(input.g_a),
+ viennacl::traits::opencl_handle(input.g_b),
+ static_cast<cl_uint>(mat_size),
+ viennacl::traits::opencl_handle(result.vcl_g_left),
+ viennacl::traits::opencl_handle(result.vcl_g_right),
+ viennacl::traits::opencl_handle(result.vcl_g_left_count),
+ viennacl::traits::opencl_handle(result.vcl_g_right_count),
+ static_cast<NumericT>(lg),
+ static_cast<NumericT>(ug),
+ static_cast<cl_uint>(0),
+ static_cast<cl_uint>(mat_size),
+ static_cast<NumericT>(precision)
+ ));
+
+ }
+
+template<typename NumericT>
+void bisectLarge(const viennacl::linalg::detail::InputData<NumericT> &input,
+ viennacl::linalg::detail::ResultDataLarge<NumericT> &result,
+ const unsigned int mat_size,
+ const NumericT lg, const NumericT ug,
+ const NumericT precision)
+ {
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input.g_a).context());
+ viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::init(ctx);
+
+ viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::program_name(), BISECT_KERNEL_LARGE);
+  kernel.global_work_size(0, mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2); // use only half the maximum number of threads per block (128)
+  kernel.local_work_size(0, mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2);  // for matrices with 256 < n <= 512
+
+ viennacl::ocl::enqueue(kernel(viennacl::traits::opencl_handle(input.g_a),
+ viennacl::traits::opencl_handle(input.g_b),
+ static_cast<cl_uint>(mat_size),
+ static_cast<NumericT>(lg),
+ static_cast<NumericT>(ug),
+ static_cast<cl_uint>(0),
+ static_cast<cl_uint>(mat_size),
+ static_cast<NumericT>(precision),
+ viennacl::traits::opencl_handle(result.g_num_one),
+ viennacl::traits::opencl_handle(result.g_num_blocks_mult),
+ viennacl::traits::opencl_handle(result.g_left_one),
+ viennacl::traits::opencl_handle(result.g_right_one),
+ viennacl::traits::opencl_handle(result.g_pos_one),
+ viennacl::traits::opencl_handle(result.g_left_mult),
+ viennacl::traits::opencl_handle(result.g_right_mult),
+ viennacl::traits::opencl_handle(result.g_left_count_mult),
+ viennacl::traits::opencl_handle(result.g_right_count_mult),
+ viennacl::traits::opencl_handle(result.g_blocks_mult),
+ viennacl::traits::opencl_handle(result.g_blocks_mult_sum)
+ ));
+
+ }
+
+template<typename NumericT>
+void bisectLargeOneIntervals(const viennacl::linalg::detail::InputData<NumericT> &input,
+ viennacl::linalg::detail::ResultDataLarge<NumericT> &result,
+ const unsigned int mat_size,
+ const NumericT precision)
+ {
+ unsigned int num_one_intervals = result.g_num_one;
+ unsigned int num_blocks = viennacl::linalg::detail::getNumBlocksLinear(num_one_intervals,
+ mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK: VIENNACL_BISECT_MAX_THREADS_BLOCK / 2);
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input.g_a).context());
+ viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::init(ctx);
+
+ viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::program_name(), BISECT_KERNEL_LARGE_ONE_INTERVALS);
+ kernel.global_work_size(0, num_blocks * (mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2));
+ kernel.local_work_size(0, mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2);
+
+ viennacl::ocl::enqueue(kernel(viennacl::traits::opencl_handle(input.g_a),
+ viennacl::traits::opencl_handle(input.g_b),
+ static_cast<cl_uint>(mat_size),
+ static_cast<cl_uint>(num_one_intervals),
+ viennacl::traits::opencl_handle(result.g_left_one),
+ viennacl::traits::opencl_handle(result.g_right_one),
+ viennacl::traits::opencl_handle(result.g_pos_one),
+ static_cast<NumericT>(precision)
+ ));
+ }
+
+
+template<typename NumericT>
+void bisectLargeMultIntervals(const viennacl::linalg::detail::InputData<NumericT> &input,
+ viennacl::linalg::detail::ResultDataLarge<NumericT> &result,
+ const unsigned int mat_size,
+ const NumericT precision)
+ {
+ unsigned int num_blocks_mult = result.g_num_blocks_mult;
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input.g_a).context());
+ viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::init(ctx);
+
+ viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::program_name(), BISECT_KERNEL_LARGE_MULT_INTERVALS);
+ kernel.global_work_size(0, num_blocks_mult * (mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2));
+ kernel.local_work_size(0, mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2);
+
+ viennacl::ocl::enqueue(kernel(viennacl::traits::opencl_handle(input.g_a),
+ viennacl::traits::opencl_handle(input.g_b),
+ static_cast<cl_uint>(mat_size),
+ viennacl::traits::opencl_handle(result.g_blocks_mult),
+ viennacl::traits::opencl_handle(result.g_blocks_mult_sum),
+ viennacl::traits::opencl_handle(result.g_left_mult),
+ viennacl::traits::opencl_handle(result.g_right_mult),
+ viennacl::traits::opencl_handle(result.g_left_count_mult),
+ viennacl::traits::opencl_handle(result.g_right_count_mult),
+ viennacl::traits::opencl_handle(result.g_lambda_mult),
+ viennacl::traits::opencl_handle(result.g_pos_mult),
+ static_cast<NumericT>(precision)
+ ));
+ }
+} // namespace opencl
+} // namespace linalg
+} // namespace viennacl
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/common.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/common.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/common.hpp
new file mode 100644
index 0000000..d6a288b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/common.hpp
@@ -0,0 +1,102 @@
+#ifndef VIENNACL_LINALG_OPENCL_COMMON_HPP_
+#define VIENNACL_LINALG_OPENCL_COMMON_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/opencl/common.hpp
+ @brief Common implementations shared by OpenCL-based operations
+*/
+
+#include <cmath>
+
+#include "viennacl/forwards.h"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/traits/handle.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace detail
+{
+
+
+
+inline cl_uint make_options(vcl_size_t length, bool reciprocal, bool flip_sign)
+{
+ return static_cast<cl_uint>( ((length > 1) ? (cl_uint(length) << 2) : 0) + (reciprocal ? 2 : 0) + (flip_sign ? 1 : 0) );
+}
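The options word packed here is presumably unpacked the same way on the kernel side: bit 0 carries flip_sign, bit 1 carries reciprocal, and the bits from position 2 upward hold the length (only when length > 1). A tiny self-check of that layout, mirroring the function above (illustrative only):

    #include <cassert>

    inline unsigned int pack_options(unsigned int length, bool reciprocal, bool flip_sign)
    {
      return ((length > 1) ? (length << 2) : 0u) + (reciprocal ? 2u : 0u) + (flip_sign ? 1u : 0u);
    }

    int main()
    {
      assert(pack_options(1, false, true)  == 1u);             // flip_sign -> bit 0
      assert(pack_options(1, true,  false) == 2u);             // reciprocal -> bit 1
      assert(pack_options(8, true,  true)  == (8u << 2) + 3u); // length in the upper bits
      return 0;
    }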
+
+
+/** @brief Returns the OpenCL kernel name for the operation C = A * B with A sparse and B, C dense matrices. */
+inline std::string sparse_dense_matmult_kernel_name(bool B_transposed, bool B_row_major, bool C_row_major)
+{
+ if (B_transposed)
+ {
+ if (B_row_major && C_row_major)
+ return "trans_mat_mult_row_row";
+ if (B_row_major && !C_row_major)
+ return "trans_mat_mult_row_col";
+ if (!B_row_major && C_row_major)
+ return "trans_mat_mult_col_row";
+
+ return "trans_mat_mult_col_col";
+ }
+
+ if (B_row_major && C_row_major)
+ return "mat_mult_row_row";
+ if (B_row_major && !C_row_major)
+ return "mat_mult_row_col";
+ if (!B_row_major && C_row_major)
+ return "mat_mult_col_row";
+
+ return "mat_mult_col_col";
+}
+
+
+
+template<typename SomeT>
+ocl::device const & current_device(SomeT const & obj) { return traits::opencl_handle(obj).context().current_device(); }
+
+inline std::string op_to_string(op_abs) { return "abs"; }
+inline std::string op_to_string(op_acos) { return "acos"; }
+inline std::string op_to_string(op_asin) { return "asin"; }
+inline std::string op_to_string(op_atan) { return "atan"; }
+inline std::string op_to_string(op_ceil) { return "ceil"; }
+inline std::string op_to_string(op_cos) { return "cos"; }
+inline std::string op_to_string(op_cosh) { return "cosh"; }
+inline std::string op_to_string(op_exp) { return "exp"; }
+inline std::string op_to_string(op_fabs) { return "fabs"; }
+inline std::string op_to_string(op_floor) { return "floor"; }
+inline std::string op_to_string(op_log) { return "log"; }
+inline std::string op_to_string(op_log10) { return "log10"; }
+inline std::string op_to_string(op_sin) { return "sin"; }
+inline std::string op_to_string(op_sinh) { return "sinh"; }
+inline std::string op_to_string(op_sqrt) { return "sqrt"; }
+inline std::string op_to_string(op_tan) { return "tan"; }
+inline std::string op_to_string(op_tanh) { return "tanh"; }
+
+} //namespace detail
+} //namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/direct_solve.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/direct_solve.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/direct_solve.hpp
new file mode 100644
index 0000000..76874b1
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/direct_solve.hpp
@@ -0,0 +1,153 @@
+#ifndef VIENNACL_LINALG_OPENCL_DIRECT_SOLVE_HPP
+#define VIENNACL_LINALG_OPENCL_DIRECT_SOLVE_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/opencl/direct_solve.hpp
+ @brief Implementations of dense direct solvers are found here.
+*/
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/linalg/opencl/kernels/matrix_solve.hpp"
+#include "viennacl/linalg/opencl/kernels/matrix.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+
+namespace detail
+{
+ inline cl_uint get_option_for_solver_tag(viennacl::linalg::upper_tag) { return 0; }
+ inline cl_uint get_option_for_solver_tag(viennacl::linalg::unit_upper_tag) { return (1 << 0); }
+ inline cl_uint get_option_for_solver_tag(viennacl::linalg::lower_tag) { return (1 << 2); }
+ inline cl_uint get_option_for_solver_tag(viennacl::linalg::unit_lower_tag) { return (1 << 2) | (1 << 0); }
+
+ template<typename MatrixT1, typename MatrixT2, typename KernelT>
+ void inplace_solve_impl(MatrixT1 const & A, MatrixT2 & B, KernelT & k)
+ {
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(A),
+ cl_uint(viennacl::traits::start1(A)), cl_uint(viennacl::traits::start2(A)),
+ cl_uint(viennacl::traits::stride1(A)), cl_uint(viennacl::traits::stride2(A)),
+ cl_uint(viennacl::traits::size1(A)), cl_uint(viennacl::traits::size2(A)),
+ cl_uint(viennacl::traits::internal_size1(A)), cl_uint(viennacl::traits::internal_size2(A)),
+ viennacl::traits::opencl_handle(B),
+ cl_uint(viennacl::traits::start1(B)), cl_uint(viennacl::traits::start2(B)),
+ cl_uint(viennacl::traits::stride1(B)), cl_uint(viennacl::traits::stride2(B)),
+ cl_uint(viennacl::traits::size1(B)), cl_uint(viennacl::traits::size2(B)),
+ cl_uint(viennacl::traits::internal_size1(B)), cl_uint(viennacl::traits::internal_size2(B))
+ )
+ );
+ }
+}
+
+
+//
+// Note: By convention, all size checks are performed in the calling frontend. No need to double-check here.
+//
+
+////////////////// upper triangular solver (upper_tag) //////////////////////////////////////
+/** @brief Direct inplace solver for dense triangular systems (Matlab notation: A \ B)
+*
+* @param A    The triangular system matrix
+* @param B    The matrix of right-hand sides; the solution is written directly into B
+*/
+template<typename NumericT, typename SolverTagT>
+void inplace_solve(matrix_base<NumericT> const & A,
+ matrix_base<NumericT> & B,
+ SolverTagT)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+
+ std::string program_name;
+ if (A.row_major() && B.row_major())
+ {
+ typedef viennacl::linalg::opencl::kernels::matrix_solve<NumericT, row_major, row_major> KernelClass;
+ KernelClass::init(ctx);
+ program_name = KernelClass::program_name();
+ }
+ else if (A.row_major() && !B.row_major())
+ {
+ typedef viennacl::linalg::opencl::kernels::matrix_solve<NumericT, row_major, column_major> KernelClass;
+ KernelClass::init(ctx);
+ program_name = KernelClass::program_name();
+ }
+ else if (!A.row_major() && B.row_major())
+ {
+ typedef viennacl::linalg::opencl::kernels::matrix_solve<NumericT, column_major, row_major> KernelClass;
+ KernelClass::init(ctx);
+ program_name = KernelClass::program_name();
+ }
+ else
+ {
+ typedef viennacl::linalg::opencl::kernels::matrix_solve<NumericT, column_major, column_major> KernelClass;
+ KernelClass::init(ctx);
+ program_name = KernelClass::program_name();
+ }
+
+ std::stringstream ss;
+ ss << SolverTagT::name();
+ ss << "_solve";
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(program_name, ss.str());
+
+ k.global_work_size(0, B.size2() * k.local_work_size());
+ detail::inplace_solve_impl(A, B, k);
+}
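From the calling side this routine is reached through the inplace_solve frontend in viennacl/linalg/direct_solve.hpp. A minimal usage sketch, assuming that frontend dispatches here for OpenCL contexts (sizes are illustrative only):

    #include "viennacl/matrix.hpp"
    #include "viennacl/linalg/direct_solve.hpp"

    void solve_upper_example()
    {
      viennacl::matrix<float> A(64, 64);   // upper-triangular system matrix
      viennacl::matrix<float> B(64,  8);   // right-hand sides, overwritten with the solution
      // ... fill A and B ...
      viennacl::linalg::inplace_solve(A, B, viennacl::linalg::upper_tag());
    }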
+
+
+
+//
+// Solve on vector
+//
+
+template<typename NumericT, typename SOLVERTAG>
+void inplace_solve(matrix_base<NumericT> const & A,
+ vector_base<NumericT> & x,
+ SOLVERTAG)
+{
+ cl_uint options = detail::get_option_for_solver_tag(SOLVERTAG());
+
+ viennacl::ocl::kernel & k = detail::legacy_kernel_for_matrix(A, "triangular_substitute_inplace");
+
+ k.global_work_size(0, k.local_work_size());
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(A),
+ cl_uint(viennacl::traits::start1(A)), cl_uint(viennacl::traits::start2(A)),
+ cl_uint(viennacl::traits::stride1(A)), cl_uint(viennacl::traits::stride2(A)),
+ cl_uint(viennacl::traits::size1(A)), cl_uint(viennacl::traits::size2(A)),
+ cl_uint(viennacl::traits::internal_size1(A)), cl_uint(viennacl::traits::internal_size2(A)),
+ viennacl::traits::opencl_handle(x),
+ cl_uint(viennacl::traits::start(x)),
+ cl_uint(viennacl::traits::stride(x)),
+ cl_uint(viennacl::traits::size(x)),
+ options
+ )
+ );
+}
+
+} //namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/fft_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/fft_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/fft_operations.hpp
new file mode 100644
index 0000000..a7b12b3
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/fft_operations.hpp
@@ -0,0 +1,350 @@
+#ifndef VIENNACL_LINALG_OPENCL_FFT_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_FFT_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+
+/** @file viennacl/linalg/opencl/fft_operations.hpp
+ @brief Implementations of the Fast Fourier Transform (FFT) using OpenCL
+ */
+
+#include "viennacl/forwards.h"
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+#include "viennacl/linalg/host_based/fft_operations.hpp"
+#include "viennacl/linalg/opencl/kernels/fft.hpp"
+#include "viennacl/linalg/opencl/kernels/matrix.hpp"
+
+#include <viennacl/vector.hpp>
+#include <viennacl/matrix.hpp>
+
+#include <cmath>
+#include <stdexcept>
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace fft
+{
+
+ const vcl_size_t MAX_LOCAL_POINTS_NUM = 512;
+
+  /**
+   * @brief Returns the number of bits needed to address the given size (i.e. ceil(log2(size)))
+   */
+ inline vcl_size_t num_bits(vcl_size_t size)
+ {
+ vcl_size_t bits_datasize = 0;
+ vcl_size_t ds = 1;
+
+ while (ds < size)
+ {
+ ds = ds << 1;
+ bits_datasize++;
+ }
+
+ return bits_datasize;
+ }
+
+ /**
+ * @brief Find next power of two
+ */
+ inline vcl_size_t next_power_2(vcl_size_t n)
+ {
+ n = n - 1;
+
+ vcl_size_t power = 1;
+
+ while (power < sizeof(vcl_size_t) * 8)
+ {
+ n = n | (n >> power);
+ power *= 2;
+ }
+
+ return n + 1;
+ }
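Both helpers are standard bit manipulations: num_bits returns the smallest b with 2^b >= size, and next_power_2 rounds up to the next power of two. An equivalent plain-C++ sketch with a couple of spot checks (for illustration only):

    #include <cassert>
    #include <cstddef>

    inline std::size_t num_bits_ref(std::size_t size)      // smallest b with (1 << b) >= size
    {
      std::size_t b = 0;
      for (std::size_t ds = 1; ds < size; ds <<= 1)
        ++b;
      return b;
    }

    inline std::size_t next_power_2_ref(std::size_t n)     // smallest power of two >= n
    {
      std::size_t p = 1;
      while (p < n)
        p <<= 1;
      return p;
    }

    int main()
    {
      assert(num_bits_ref(8) == 3 && num_bits_ref(9) == 4);
      assert(next_power_2_ref(5) == 8 && next_power_2_ref(8) == 8);
      return 0;
    }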
+
+} //namespace fft
+} //namespace detail
+
+namespace opencl
+{
+
+/**
+ * @brief Direct algorithm for computing the Fourier transform.
+ *
+ * Works for data of any size.
+ * A serial implementation has O(n^2) complexity.
+ */
+template<typename NumericT>
+void direct(viennacl::ocl::handle<cl_mem> const & in,
+ viennacl::ocl::handle<cl_mem> const & out,
+ vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(in.context());
+ viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+ std::string program_string = viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, row_major>::program_name();
+ if (data_order == viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::COL_MAJOR)
+ {
+ viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, column_major>::init(ctx);
+ program_string =
+ viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, column_major>::program_name();
+ } else
+ viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, row_major>::init(ctx);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(program_string, "fft_direct");
+ viennacl::ocl::enqueue(k(in, out,
+ static_cast<cl_uint>(size),
+ static_cast<cl_uint>(stride),
+ static_cast<cl_uint>(batch_num),
+ sign)
+ );
+}
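For reference, the direct transform of one batch in the interleaved layout used throughout this file (data[2k] = real part, data[2k+1] = imaginary part; sign = -1 gives the forward transform) looks like this on the host — a serial sketch, not the kernel:

    #include <vector>
    #include <cmath>
    #include <cstddef>

    inline std::vector<double> dft_direct(std::vector<double> const & in, double sign)
    {
      std::size_t n = in.size() / 2;                 // number of complex elements
      std::vector<double> out(2 * n, 0.0);
      const double pi = 3.14159265358979323846;
      for (std::size_t k = 0; k < n; ++k)
        for (std::size_t j = 0; j < n; ++j)
        {
          double arg = sign * 2.0 * pi * double(k) * double(j) / double(n);
          double c = std::cos(arg), s = std::sin(arg);
          out[2*k]     += in[2*j] * c - in[2*j + 1] * s;   // real part
          out[2*k + 1] += in[2*j] * s + in[2*j + 1] * c;   // imaginary part
        }
      return out;
    }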
+
+/*
+ * Reorders the input data so that indices appear in bit-reversal order.
+ * Such a reordering must be performed before an in-place radix-2 FFT.
+ */
+template<typename NumericT>
+void reorder(viennacl::ocl::handle<cl_mem> const & in,
+ vcl_size_t size, vcl_size_t stride,
+ vcl_size_t bits_datasize, vcl_size_t batch_num,
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(in.context());
+ viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+ std::string program_string = viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, row_major>::program_name();
+ if (data_order == viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::COL_MAJOR)
+ {
+ viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, column_major>::init(ctx);
+ program_string = viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, column_major>::program_name();
+ } else
+ viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, row_major>::init(ctx);
+
+ viennacl::ocl::kernel& k = ctx.get_kernel(program_string, "fft_reorder");
+ viennacl::ocl::enqueue(k(in,
+ static_cast<cl_uint>(bits_datasize), static_cast<cl_uint>(size),
+ static_cast<cl_uint>(stride), static_cast<cl_uint>(batch_num))
+ );
+}
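What the reordering amounts to on the host: element j swaps places with the element whose index is j written backwards in bits_datasize bits. A plain-C++ sketch for interleaved complex data (illustrative only):

    #include <vector>
    #include <utility>
    #include <cstddef>

    inline std::size_t bit_reverse(std::size_t v, std::size_t bits)
    {
      std::size_t r = 0;
      for (std::size_t i = 0; i < bits; ++i)
      {
        r = (r << 1) | (v & 1u);
        v >>= 1;
      }
      return r;
    }

    // Reorder interleaved complex data of length size = 2^bits in place.
    inline void bit_reversal_reorder(std::vector<double> & data, std::size_t bits)
    {
      std::size_t size = std::size_t(1) << bits;
      for (std::size_t j = 0; j < size; ++j)
      {
        std::size_t r = bit_reverse(j, bits);
        if (r > j)                                   // swap each pair exactly once
        {
          std::swap(data[2*j],     data[2*r]);
          std::swap(data[2*j + 1], data[2*r + 1]);
        }
      }
    }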
+
+/**
+ * @brief Radix-2 (Cooley-Tukey) algorithm for computing the Fourier transform.
+ *
+ * Works only for power-of-two data sizes.
+ * A serial implementation has O(n log n) complexity.
+ */
+template<typename NumericT>
+void radix2(viennacl::ocl::handle<cl_mem> const & in,
+ vcl_size_t size, vcl_size_t stride,
+ vcl_size_t batch_num, NumericT sign = NumericT(-1),
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(in.context());
+ viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+ assert(batch_num != 0 && bool("batch_num must be larger than 0"));
+
+ std::string program_string = viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, row_major>::program_name();
+ if (data_order == viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::COL_MAJOR)
+ {
+ viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, column_major>::init(ctx);
+ program_string = viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, column_major>::program_name();
+ } else
+ viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, row_major>::init(ctx);
+
+ vcl_size_t bits_datasize = viennacl::linalg::detail::fft::num_bits(size);
+ if (size <= viennacl::linalg::detail::fft::MAX_LOCAL_POINTS_NUM)
+ {
+ viennacl::ocl::kernel & k = ctx.get_kernel(program_string, "fft_radix2_local");
+ viennacl::ocl::enqueue(k(in,
+ viennacl::ocl::local_mem((size * 4) * sizeof(NumericT)),
+ static_cast<cl_uint>(bits_datasize), static_cast<cl_uint>(size),
+ static_cast<cl_uint>(stride), static_cast<cl_uint>(batch_num), sign));
+
+ }
+ else
+ {
+ viennacl::linalg::opencl::reorder<NumericT>(in, size, stride, bits_datasize, batch_num);
+
+ for (vcl_size_t step = 0; step < bits_datasize; step++)
+ {
+ viennacl::ocl::kernel & k = ctx.get_kernel(program_string, "fft_radix2");
+ viennacl::ocl::enqueue(k(in,
+ static_cast<cl_uint>(step), static_cast<cl_uint>(bits_datasize),
+ static_cast<cl_uint>(size), static_cast<cl_uint>(stride),
+ static_cast<cl_uint>(batch_num), sign));
+ }
+ }
+}
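After the bit-reversal reordering, the transform itself consists of log2(n) butterfly passes. A serial sketch of the same passes over std::complex data (the OpenCL path above either performs all of them in local memory for small sizes, or launches one kernel per pass):

    #include <complex>
    #include <vector>
    #include <cmath>
    #include <cstddef>

    // In-place radix-2 FFT; 'a' must already be in bit-reversal order and
    // a.size() must be a power of two. sign = -1 gives the forward transform.
    inline void radix2_butterflies(std::vector<std::complex<double> > & a, double sign)
    {
      const std::size_t n = a.size();
      const double pi = 3.14159265358979323846;
      for (std::size_t len = 2; len <= n; len <<= 1)
      {
        double ang = sign * 2.0 * pi / double(len);
        std::complex<double> wlen(std::cos(ang), std::sin(ang));
        for (std::size_t i = 0; i < n; i += len)
        {
          std::complex<double> w(1.0, 0.0);
          for (std::size_t j = 0; j < len / 2; ++j)
          {
            std::complex<double> u = a[i + j];
            std::complex<double> v = a[i + j + len/2] * w;
            a[i + j]         = u + v;
            a[i + j + len/2] = u - v;
            w *= wlen;
          }
        }
      }
    }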
+
+/**
+ * @brief Bluestein's algorithm for computing the Fourier transform.
+ *
+ * Currently works only for input sizes smaller than 2^16.
+ * Uses considerably more memory, but handles arbitrary (non-power-of-two) sizes.
+ * A serial implementation has roughly O(n log n) complexity.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void bluestein(viennacl::vector<NumericT, AlignmentV>& in,
+ viennacl::vector<NumericT, AlignmentV>& out, vcl_size_t /*batch_num*/)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(in).context());
+ viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+ vcl_size_t size = in.size() >> 1;
+ vcl_size_t ext_size = viennacl::linalg::detail::fft::next_power_2(2 * size - 1);
+
+ viennacl::vector<NumericT, AlignmentV> A(ext_size << 1);
+ viennacl::vector<NumericT, AlignmentV> B(ext_size << 1);
+ viennacl::vector<NumericT, AlignmentV> Z(ext_size << 1);
+
+ {
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "zero2");
+ viennacl::ocl::enqueue(k(A, B, static_cast<cl_uint>(ext_size)));
+ }
+ {
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "bluestein_pre");
+ viennacl::ocl::enqueue(k(in, A, B, static_cast<cl_uint>(size), static_cast<cl_uint>(ext_size)));
+ }
+
+ viennacl::linalg::convolve_i(A, B, Z);
+
+ {
+ viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "bluestein_post");
+ viennacl::ocl::enqueue(k(Z, out, static_cast<cl_uint>(size)));
+ }
+}
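The identity behind the three stages above (bluestein_pre, convolve_i, bluestein_post) is the chirp-z trick: writing nk = (n^2 + k^2 - (k-n)^2)/2 turns the length-N DFT into a convolution of chirp-modulated sequences,

    X_k = e^{-\pi i k^2/N} \sum_{n=0}^{N-1} \left( x_n\, e^{-\pi i n^2/N} \right) e^{\pi i (k-n)^2/N},

which can be evaluated with power-of-two FFTs of length at least 2N-1; this is why the code pads to next_power_2(2*size - 1).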
+
+/**
+ * @brief Multiply two complex vectors element-wise and store the result in output
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void multiply_complex(viennacl::vector<NumericT, AlignmentV> const & input1,
+ viennacl::vector<NumericT, AlignmentV> const & input2,
+ viennacl::vector<NumericT, AlignmentV> & output)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input1).context());
+ viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+ vcl_size_t size = input1.size() >> 1;
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "fft_mult_vec");
+ viennacl::ocl::enqueue(k(input1, input2, output, static_cast<cl_uint>(size)));
+}
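Element-wise complex multiplication in the interleaved layout, as a host-side sketch of what the fft_mult_vec kernel computes per element (illustrative only):

    #include <vector>
    #include <cstddef>

    // out[k] = in1[k] * in2[k], all vectors storing complex numbers as (re, im) pairs.
    inline void complex_multiply(std::vector<double> const & in1,
                                 std::vector<double> const & in2,
                                 std::vector<double>       & out)
    {
      std::size_t n = in1.size() / 2;
      out.resize(2 * n);
      for (std::size_t k = 0; k < n; ++k)
      {
        double ar = in1[2*k], ai = in1[2*k + 1];
        double br = in2[2*k], bi = in2[2*k + 1];
        out[2*k]     = ar * br - ai * bi;
        out[2*k + 1] = ar * bi + ai * br;
      }
    }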
+
+/**
+ * @brief Normalize a vector by its own size (divide each entry by the number of complex elements)
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void normalize(viennacl::vector<NumericT, AlignmentV> & input)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input).context());
+ viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "fft_div_vec_scalar");
+
+ vcl_size_t size = input.size() >> 1;
+ NumericT norm_factor = static_cast<NumericT>(size);
+ viennacl::ocl::enqueue(k(input, static_cast<cl_uint>(size), norm_factor));
+}
+
+/**
+ * @brief In-place transpose of a matrix
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & input)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input).context());
+ viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+ viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "transpose_inplace");
+ viennacl::ocl::enqueue(k(input, static_cast<cl_uint>(input.internal_size1() >> 1),
+                           static_cast<cl_uint>(input.internal_size2() >> 1)));
+}
+
+/**
+ * @brief Transpose matrix
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & input,
+ viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & output)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input).context());
+ viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+ viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "transpose");
+ viennacl::ocl::enqueue(k(input, output, static_cast<cl_uint>(input.internal_size1() >> 1),
+ static_cast<cl_uint>(input.internal_size2() >> 1)));
+}
+
+/**
+ * @brief Create a complex vector from a real vector (even elements (2k) hold the real parts, odd elements (2k+1) the imaginary parts)
+ */
+template<typename NumericT>
+void real_to_complex(viennacl::vector_base<NumericT> const & in,
+ viennacl::vector_base<NumericT> & out, vcl_size_t size)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(in).context());
+ viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "real_to_complex");
+ viennacl::ocl::enqueue(k(in, out, static_cast<cl_uint>(size)));
+}
+
+/**
+ * @brief Create a real vector from a complex vector (even elements (2k) hold the real parts, odd elements (2k+1) the imaginary parts)
+ */
+template<typename NumericT>
+void complex_to_real(viennacl::vector_base<NumericT> const & in,
+ viennacl::vector_base<NumericT> & out, vcl_size_t size)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(in).context());
+ viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+ viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "complex_to_real");
+ viennacl::ocl::enqueue(k(in, out, static_cast<cl_uint>(size)));
+}
+
+/**
+ * @brief Reverse the order of the vector's elements in place
+ */
+template<typename NumericT>
+void reverse(viennacl::vector_base<NumericT>& in)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(in).context());
+ viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+ vcl_size_t size = in.size();
+
+ viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "reverse_inplace");
+ viennacl::ocl::enqueue(k(in, static_cast<cl_uint>(size)));
+}
+
+} //namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+#endif /* VIENNACL_LINALG_OPENCL_FFT_OPERATIONS_HPP_ */
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/ilu_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/ilu_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/ilu_operations.hpp
new file mode 100644
index 0000000..248a88a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/ilu_operations.hpp
@@ -0,0 +1,260 @@
+#ifndef VIENNACL_LINALG_OPENCL_ILU_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_ILU_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/opencl/ilu_operations.hpp
+ @brief Implementations of specialized routines for the Chow-Patel parallel ILU preconditioner using OpenCL
+*/
+
+#include <cmath>
+#include <algorithm> //for std::max and std::min
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/opencl/common.hpp"
+#include "viennacl/linalg/opencl/kernels/ilu.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/stride.hpp"
+#include "viennacl/linalg/vector_operations.hpp"
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+
+/////////////////////// ICC /////////////////////
+
+template<typename NumericT>
+void extract_L(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & L)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx);
+
+ //
+ // Step 1: Count elements in L:
+ //
+ viennacl::ocl::kernel & k1 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "extract_L_1");
+
+ viennacl::ocl::enqueue(k1(A.handle1().opencl_handle(), A.handle2().opencl_handle(), cl_uint(A.size1()),
+ L.handle1().opencl_handle())
+ );
+
+ //
+ // Step 2: Exclusive scan on row_buffers:
+ //
+ viennacl::vector_base<unsigned int> wrapped_L_row_buffer(L.handle1(), A.size1() + 1, 0, 1);
+ viennacl::linalg::exclusive_scan(wrapped_L_row_buffer, wrapped_L_row_buffer);
+ L.reserve(wrapped_L_row_buffer[L.size1()], false);
+
+
+ //
+ // Step 3: Write entries
+ //
+ viennacl::ocl::kernel & k2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "extract_L_2");
+
+ viennacl::ocl::enqueue(k2(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.size1()),
+ L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle())
+ );
+
+ L.generate_row_block_information();
+
+} // extract_L
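The three steps follow the usual recipe for assembling a new CSR matrix on the device: count the entries of each row, turn the per-row counts into row offsets with an exclusive scan, then write column indices and values. A host-side sketch of the same pattern (the exact inclusion rule — e.g. whether the diagonal is kept — is decided by the extract_L kernels; here we keep entries with column <= row purely for illustration):

    #include <vector>
    #include <cstddef>

    struct csr
    {
      std::vector<std::size_t> row;   // row offsets, size n+1
      std::vector<std::size_t> col;   // column indices
      std::vector<double>      val;   // values
    };

    inline csr extract_lower(csr const & A, std::size_t n)
    {
      csr L;
      L.row.assign(n + 1, 0);

      // Step 1: count entries per row.
      for (std::size_t i = 0; i < n; ++i)
        for (std::size_t k = A.row[i]; k < A.row[i+1]; ++k)
          if (A.col[k] <= i)
            ++L.row[i];

      // Step 2: exclusive scan turns counts into row offsets.
      std::size_t sum = 0;
      for (std::size_t i = 0; i <= n; ++i)
      {
        std::size_t tmp = L.row[i];
        L.row[i] = sum;
        sum += tmp;
      }
      L.col.resize(sum);
      L.val.resize(sum);

      // Step 3: write column indices and values.
      for (std::size_t i = 0; i < n; ++i)
      {
        std::size_t pos = L.row[i];
        for (std::size_t k = A.row[i]; k < A.row[i+1]; ++k)
          if (A.col[k] <= i)
          {
            L.col[pos] = A.col[k];
            L.val[pos] = A.val[k];
            ++pos;
          }
      }
      return L;
    }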
+
+///////////////////////////////////////////////
+
+
+
+/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L and U accordingly. */
+template<typename NumericT>
+void icc_scale(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & L)
+{
+ viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A));
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx);
+
+ // fill D:
+ viennacl::ocl::kernel & k1 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "ilu_scale_kernel_1");
+ viennacl::ocl::enqueue(k1(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.size1()), D) );
+
+ // scale L:
+ viennacl::ocl::kernel & k2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "ilu_scale_kernel_2");
+ viennacl::ocl::enqueue(k2(L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(), cl_uint(A.size1()), D) );
+
+}
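The scaling is the usual symmetric rescaling to unit diagonal: with d_i = a_{ii}^{-1/2},

    (DAD)_{ij} = \frac{a_{ij}}{\sqrt{a_{ii}\, a_{jj}}}, \qquad (DAD)_{ii} = 1.

Our reading of ilu_scale_kernel_1 / ilu_scale_kernel_2 is that the first kernel fills D from the diagonal of A and the second rescales the stored entries of L accordingly; the kernel sources remain the authority on the details.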
+
+/////////////////////////////////////
+
+
+/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ILU using OpenCL (cf. Algorithm 2 in paper) */
+template<typename NumericT>
+void icc_chow_patel_sweep(compressed_matrix<NumericT> & L,
+ vector<NumericT> const & aij_L)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(L).context());
+ viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx);
+
+ viennacl::backend::mem_handle L_backup;
+ viennacl::backend::memory_create(L_backup, L.handle().raw_size(), viennacl::traits::context(L));
+ viennacl::backend::memory_copy(L.handle(), L_backup, 0, 0, L.handle().raw_size());
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "icc_chow_patel_sweep_kernel");
+ viennacl::ocl::enqueue(k(L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(), L_backup.opencl_handle(), cl_uint(L.size1()),
+ aij_L)
+ );
+
+}
+
+
+/////////////////////// ILU /////////////////////
+
+template<typename NumericT>
+void extract_LU(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & L,
+ compressed_matrix<NumericT> & U)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx);
+
+ //
+ // Step 1: Count elements in L and U:
+ //
+ viennacl::ocl::kernel & k1 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "extract_LU_1");
+
+ viennacl::ocl::enqueue(k1(A.handle1().opencl_handle(), A.handle2().opencl_handle(), cl_uint(A.size1()),
+ L.handle1().opencl_handle(),
+ U.handle1().opencl_handle())
+ );
+
+ //
+ // Step 2: Exclusive scan on row_buffers:
+ //
+ viennacl::vector_base<unsigned int> wrapped_L_row_buffer(L.handle1(), A.size1() + 1, 0, 1);
+ viennacl::linalg::exclusive_scan(wrapped_L_row_buffer, wrapped_L_row_buffer);
+ L.reserve(wrapped_L_row_buffer[L.size1()], false);
+
+ viennacl::vector_base<unsigned int> wrapped_U_row_buffer(U.handle1(), A.size1() + 1, 0, 1);
+ viennacl::linalg::exclusive_scan(wrapped_U_row_buffer, wrapped_U_row_buffer);
+ U.reserve(wrapped_U_row_buffer[U.size1()], false);
+
+ //
+ // Step 3: Write entries
+ //
+ viennacl::ocl::kernel & k2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "extract_LU_2");
+
+ viennacl::ocl::enqueue(k2(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.size1()),
+ L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(),
+ U.handle1().opencl_handle(), U.handle2().opencl_handle(), U.handle().opencl_handle())
+ );
+
+ L.generate_row_block_information();
+ // Note: block information for U will be generated after transposition
+
+} // extract_LU
+
+///////////////////////////////////////////////
+
+
+
+/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L and U accordingly. */
+template<typename NumericT>
+void ilu_scale(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & L,
+ compressed_matrix<NumericT> & U)
+{
+ viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A));
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx);
+
+ // fill D:
+ viennacl::ocl::kernel & k1 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "ilu_scale_kernel_1");
+ viennacl::ocl::enqueue(k1(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.size1()), D) );
+
+ // scale L:
+ viennacl::ocl::kernel & k2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "ilu_scale_kernel_2");
+ viennacl::ocl::enqueue(k2(L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(), cl_uint(A.size1()), D) );
+
+ // scale U:
+ viennacl::ocl::enqueue(k2(U.handle1().opencl_handle(), U.handle2().opencl_handle(), U.handle().opencl_handle(), cl_uint(A.size1()), D) );
+
+}
+
+/////////////////////////////////////
+
+
+/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ILU using OpenCL (cf. Algorithm 2 in paper) */
+template<typename NumericT>
+void ilu_chow_patel_sweep(compressed_matrix<NumericT> & L,
+ vector<NumericT> const & aij_L,
+ compressed_matrix<NumericT> & U_trans,
+ vector<NumericT> const & aij_U_trans)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(L).context());
+ viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx);
+
+ viennacl::backend::mem_handle L_backup;
+ viennacl::backend::memory_create(L_backup, L.handle().raw_size(), viennacl::traits::context(L));
+ viennacl::backend::memory_copy(L.handle(), L_backup, 0, 0, L.handle().raw_size());
+
+ viennacl::backend::mem_handle U_backup;
+ viennacl::backend::memory_create(U_backup, U_trans.handle().raw_size(), viennacl::traits::context(U_trans));
+ viennacl::backend::memory_copy(U_trans.handle(), U_backup, 0, 0, U_trans.handle().raw_size());
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "ilu_chow_patel_sweep_kernel");
+ viennacl::ocl::enqueue(k(L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(), L_backup.opencl_handle(), cl_uint(L.size1()),
+ aij_L,
+ U_trans.handle1().opencl_handle(), U_trans.handle2().opencl_handle(), U_trans.handle().opencl_handle(), U_backup.opencl_handle(),
+ aij_U_trans)
+ );
+
+}
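For reference, one sweep of the Chow-Patel fixed-point iteration (cf. Algorithm 2 in the cited paper) updates every stored entry of the factors as

    l_{ij} \leftarrow \frac{1}{u_{jj}} \Big( a_{ij} - \sum_{k<j} l_{ik}\, u_{kj} \Big) \quad (i > j), \qquad
    u_{ij} \leftarrow a_{ij} - \sum_{k<i} l_{ik}\, u_{kj} \quad (i \le j),

with all l and u values on the right-hand side read from the previous sweep. That is why the code snapshots L (and the transposed U) into L_backup / U_backup before launching the kernel, and the original values a_{ij} are presumably what the aij_L / aij_U_trans vectors carry.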
+
+//////////////////////////////////////
+
+
+
+template<typename NumericT>
+void ilu_form_neumann_matrix(compressed_matrix<NumericT> & R,
+ vector<NumericT> & diag_R)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(R).context());
+ viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "ilu_form_neumann_matrix_kernel");
+ viennacl::ocl::enqueue(k(R.handle1().opencl_handle(), R.handle2().opencl_handle(), R.handle().opencl_handle(), cl_uint(R.size1()),
+ diag_R)
+ );
+}
+
+} //namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas2_opencl.cu
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas2_opencl.cu b/native-viennaCL/src/main/cpp/libviennacl/src/blas2_opencl.cu
new file mode 100644
index 0000000..20c4994
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas2_opencl.cu
@@ -0,0 +1,219 @@
+/* =========================================================================
+ Copyright (c) 2010-2014, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+// xGEMV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSgemv(ViennaCLBackend backend,
+ ViennaCLOrder order, ViennaCLTranspose transA,
+ ViennaCLInt m, ViennaCLInt n, float alpha, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ float beta,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+
+ viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::vector_base<float> v2(y, size_type(m), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::matrix_base<float> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+ size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+ size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+ v2 *= beta;
+ if (transA == ViennaCLTrans)
+ v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+ else
+ v2 += alpha * viennacl::linalg::prod(mat, v1);
+
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDgemv(ViennaCLBackend backend,
+ ViennaCLOrder order, ViennaCLTranspose transA,
+ ViennaCLInt m, ViennaCLInt n, double alpha, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ double beta,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+
+ viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::vector_base<double> v2(y, size_type(m), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::matrix_base<double> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+ size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+ size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+ v2 *= beta;
+ if (transA == ViennaCLTrans)
+ v2 += alpha * viennacl::linalg::prod(viennacl::trans(mat), v1);
+ else
+ v2 += alpha * viennacl::linalg::prod(mat, v1);
+
+ return ViennaCLSuccess;
+}
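In BLAS terms both wrappers compute y <- alpha * op(A) * x + beta * y. A tiny serial reference of that operation (dense row-major, no offsets or strides — illustrative only, not the library's code path):

    #include <vector>
    #include <cstddef>

    // y = alpha * op(A) * x + beta * y, A stored row-major with dimensions m x n.
    inline void gemv_ref(bool transA, std::size_t m, std::size_t n, double alpha,
                         std::vector<double> const & A, std::vector<double> const & x,
                         double beta, std::vector<double> & y)
    {
      std::size_t rows = transA ? n : m;
      std::size_t cols = transA ? m : n;
      for (std::size_t i = 0; i < rows; ++i)
      {
        double acc = 0.0;
        for (std::size_t j = 0; j < cols; ++j)
          acc += (transA ? A[j * n + i] : A[i * n + j]) * x[j];
        y[i] = alpha * acc + beta * y[i];
      }
    }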
+
+
+
+// xTRSV
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLStrsv(ViennaCLBackend backend,
+ ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+ ViennaCLInt n, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+
+ viennacl::vector_base<float> v(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::matrix_base<float> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+ size_type(n), size_type(offA_row), difference_type(incA_row), size_type(n),
+ size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+ if (transA == ViennaCLTrans)
+ {
+ if (uplo == ViennaCLUpper)
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_upper_tag());
+ else
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+ else
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_lower_tag());
+ else
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+ }
+ else
+ {
+ if (uplo == ViennaCLUpper)
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_upper_tag());
+ else
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+ else
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_lower_tag());
+ else
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+ }
+
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDtrsv(ViennaCLBackend backend,
+ ViennaCLUplo uplo, ViennaCLOrder order, ViennaCLTranspose transA, ViennaCLDiag diag,
+ ViennaCLInt n, cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+
+ viennacl::vector_base<double> v(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::matrix_base<double> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+ size_type(n), size_type(offA_row), difference_type(incA_row), size_type(n),
+ size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+ if (transA == ViennaCLTrans)
+ {
+ if (uplo == ViennaCLUpper)
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_upper_tag());
+ else
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::upper_tag());
+ else
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::unit_lower_tag());
+ else
+ viennacl::linalg::inplace_solve(viennacl::trans(mat), v, viennacl::linalg::lower_tag());
+ }
+ else
+ {
+ if (uplo == ViennaCLUpper)
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_upper_tag());
+ else
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::upper_tag());
+ else
+ if (diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::unit_lower_tag());
+ else
+ viennacl::linalg::inplace_solve(mat, v, viennacl::linalg::lower_tag());
+ }
+
+ return ViennaCLSuccess;
+}
+
+
+
+// xGER
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSger(ViennaCLBackend backend,
+ ViennaCLOrder order,
+ ViennaCLInt m, ViennaCLInt n,
+ float alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+ cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+
+ viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::vector_base<float> v2(y, size_type(m), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::matrix_base<float> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+ size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+ size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+
+ mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDger(ViennaCLBackend backend,
+ ViennaCLOrder order,
+ ViennaCLInt m, ViennaCLInt n,
+ double alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+ cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+
+ viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::vector_base<double> v2(y, size_type(m), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::matrix_base<double> mat(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+ size_type(m), size_type(offA_row), difference_type(incA_row), size_type(m),
+ size_type(n), size_type(offA_col), difference_type(incA_col), size_type(lda), order == ViennaCLRowMajor);
+
+ mat += alpha * viennacl::linalg::outer_prod(v1, v2);
+
+ return ViennaCLSuccess;
+}
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas3.cpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas3.cpp b/native-viennaCL/src/main/cpp/libviennacl/src/blas3.cpp
new file mode 100644
index 0000000..bb6e03e
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas3.cpp
@@ -0,0 +1,272 @@
+/* =========================================================================
+ Copyright (c) 2010-2014, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "init_matrix.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+// xGEMM
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLgemm(ViennaCLHostScalar alpha, ViennaCLMatrix A, ViennaCLMatrix B, ViennaCLHostScalar beta, ViennaCLMatrix C)
+{
+ viennacl::backend::mem_handle A_handle;
+ viennacl::backend::mem_handle B_handle;
+ viennacl::backend::mem_handle C_handle;
+
+ if (init_matrix(A_handle, A) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ if (init_matrix(B_handle, B) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ if (init_matrix(C_handle, C) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ switch (A->precision)
+ {
+ case ViennaCLFloat:
+ {
+ typedef viennacl::matrix_base<float>::size_type size_type;
+ typedef viennacl::matrix_base<float>::size_type difference_type;
+
+ viennacl::matrix_base<float> mat_A(A_handle,
+ size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+ size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+ viennacl::matrix_base<float> mat_B(B_handle,
+ size_type(B->size1), size_type(B->start1), difference_type(B->stride1), size_type(B->internal_size1),
+ size_type(B->size2), size_type(B->start2), difference_type(B->stride2), size_type(B->internal_size2), B->order == ViennaCLRowMajor);
+ viennacl::matrix_base<float> mat_C(C_handle,
+ size_type(C->size1), size_type(C->start1), difference_type(C->stride1), size_type(C->internal_size1),
+ size_type(C->size2), size_type(C->start2), difference_type(C->stride2), size_type(C->internal_size2), C->order == ViennaCLRowMajor);
+
+ if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+ viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+ else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+ viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_float, beta->value_float);
+ else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+ viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+ else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+ viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_float, beta->value_float);
+ else
+ return ViennaCLGenericFailure;
+
+ return ViennaCLSuccess;
+ }
+
+ case ViennaCLDouble:
+ {
+ typedef viennacl::matrix_base<double>::size_type size_type;
+ typedef viennacl::matrix_base<double>::size_type difference_type;
+
+ viennacl::matrix_base<double> mat_A(A_handle,
+ size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+ size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+ viennacl::matrix_base<double> mat_B(B_handle,
+ size_type(B->size1), size_type(B->start1), difference_type(B->stride1), size_type(B->internal_size1),
+ size_type(B->size2), size_type(B->start2), difference_type(B->stride2), size_type(B->internal_size2), B->order == ViennaCLRowMajor);
+ viennacl::matrix_base<double> mat_C(C_handle,
+ size_type(C->size1), size_type(C->start1), difference_type(C->stride1), size_type(C->internal_size1),
+ size_type(C->size2), size_type(C->start2), difference_type(C->stride2), size_type(C->internal_size2), C->order == ViennaCLRowMajor);
+
+ if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+ viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+ else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+ viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_double, beta->value_double);
+ else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+ viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+ else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+ viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_double, beta->value_double);
+ else
+ return ViennaCLGenericFailure;
+
+ return ViennaCLSuccess;
+ }
+
+ default:
+ return ViennaCLGenericFailure;
+ }
+}
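ViennaCLgemm computes C <- alpha * op(A) * op(B) + beta * C, dispatching on precision, storage order and the transpose flags. A minimal serial reference for the no-transpose, row-major case (illustrative only):

    #include <vector>
    #include <cstddef>

    // C = alpha * A * B + beta * C, with A (m x k), B (k x n), C (m x n), all row-major.
    inline void gemm_ref(std::size_t m, std::size_t k, std::size_t n, double alpha,
                         std::vector<double> const & A, std::vector<double> const & B,
                         double beta, std::vector<double> & C)
    {
      for (std::size_t i = 0; i < m; ++i)
        for (std::size_t j = 0; j < n; ++j)
        {
          double acc = 0.0;
          for (std::size_t p = 0; p < k; ++p)
            acc += A[i * k + p] * B[p * n + j];
          C[i * n + j] = alpha * acc + beta * C[i * n + j];
        }
    }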
+
+
+// xTRSM
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLtrsm(ViennaCLMatrix A, ViennaCLUplo uplo, ViennaCLDiag diag, ViennaCLMatrix B)
+{
+ viennacl::backend::mem_handle A_handle;
+ viennacl::backend::mem_handle B_handle;
+
+ if (init_matrix(A_handle, A) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ if (init_matrix(B_handle, B) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ switch (A->precision)
+ {
+ case ViennaCLFloat:
+ {
+ typedef viennacl::matrix_base<float>::size_type size_type;
+ typedef viennacl::matrix_base<float>::size_type difference_type;
+
+ viennacl::matrix_base<float> mat_A(A_handle,
+ size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+ size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+ viennacl::matrix_base<float> mat_B(B_handle,
+ size_type(B->size1), size_type(B->start1), difference_type(B->stride1), size_type(B->internal_size1),
+ size_type(B->size2), size_type(B->start2), difference_type(B->stride2), size_type(B->internal_size2), B->order == ViennaCLRowMajor);
+
+ if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+ {
+ if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+ else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+ else
+ return ViennaCLGenericFailure;
+ }
+ else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+ {
+ if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::upper_tag());
+ else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_upper_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::lower_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_lower_tag());
+ else
+ return ViennaCLGenericFailure;
+ }
+ else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+ {
+ if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+ else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+ else
+ return ViennaCLGenericFailure;
+ }
+ else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+ {
+ if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::upper_tag());
+ else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_upper_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::lower_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_lower_tag());
+ else
+ return ViennaCLGenericFailure;
+ }
+
+ return ViennaCLSuccess;
+ }
+ case ViennaCLDouble:
+ {
+ typedef viennacl::matrix_base<double>::size_type size_type;
+ typedef viennacl::matrix_base<double>::size_type difference_type;
+
+ viennacl::matrix_base<double> mat_A(A_handle,
+ size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+ size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+ viennacl::matrix_base<double> mat_B(B_handle,
+ size_type(B->size1), size_type(B->start1), difference_type(B->stride1), size_type(B->internal_size1),
+ size_type(B->size2), size_type(B->start2), difference_type(B->stride2), size_type(B->internal_size2), B->order == ViennaCLRowMajor);
+
+ if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+ {
+ if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+ else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+ else
+ return ViennaCLGenericFailure;
+ }
+ else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+ {
+ if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::upper_tag());
+ else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_upper_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::lower_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_lower_tag());
+ else
+ return ViennaCLGenericFailure;
+ }
+ else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+ {
+ if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+ else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+ else
+ return ViennaCLGenericFailure;
+ }
+ else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+ {
+ if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::upper_tag());
+ else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_upper_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::lower_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_lower_tag());
+ else
+ return ViennaCLGenericFailure;
+ }
+
+ return ViennaCLSuccess;
+ }
+
+ default:
+ return ViennaCLGenericFailure;
+ }
+}
+
+
+
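ViennaCLtrsm above is the TRSM-style entry point: it maps the (precision, trans, uplo, diag) flags onto ViennaCL's tagged inplace_solve overloads and solves in place, overwriting B with the solution. Below is a minimal host-side sketch of the underlying call, using only headers this file already includes; the 4x4/4x2 sizes and fill values are illustrative, and the std-vector copy overload is an assumption about the surrounding ViennaCL API.

#include <cstddef>
#include <vector>
#include "viennacl/matrix.hpp"
#include "viennacl/linalg/direct_solve.hpp"

int main()
{
  // Illustrative upper-triangular system A * X = B with 4x4 A and 4x2 B
  // (mirrors the ViennaCLUpper / ViennaCLNonUnit / ViennaCLNoTrans branch above).
  std::vector<std::vector<float> > host_A(4, std::vector<float>(4, 0.0f));
  std::vector<std::vector<float> > host_B(4, std::vector<float>(2, 1.0f));
  for (std::size_t i = 0; i < 4; ++i)
    for (std::size_t j = i; j < 4; ++j)
      host_A[i][j] = float(i + j + 1);          // nonzero upper triangle, nonzero diagonal

  viennacl::matrix<float> A(4, 4), B(4, 2);
  viennacl::copy(host_A, A);                    // assumed: ViennaCL's std-vector copy overload
  viennacl::copy(host_B, B);

  // In-place solve: B is overwritten with X, exactly as the trsm wrapper above does.
  viennacl::linalg::inplace_solve(A, B, viennacl::linalg::upper_tag());

  // The transposed-A branches pass viennacl::trans(A) instead, e.g.:
  // viennacl::linalg::inplace_solve(viennacl::trans(A), B, viennacl::linalg::unit_upper_tag());
  return 0;
}
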
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas3.cu
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas3.cu b/native-viennaCL/src/main/cpp/libviennacl/src/blas3.cu
new file mode 100644
index 0000000..bb6e03e
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas3.cu
@@ -0,0 +1,272 @@
+/* =========================================================================
+ Copyright (c) 2010-2014, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "init_matrix.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+// xGEMM
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLgemm(ViennaCLHostScalar alpha, ViennaCLMatrix A, ViennaCLMatrix B, ViennaCLHostScalar beta, ViennaCLMatrix C)
+{
+ viennacl::backend::mem_handle A_handle;
+ viennacl::backend::mem_handle B_handle;
+ viennacl::backend::mem_handle C_handle;
+
+ if (init_matrix(A_handle, A) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ if (init_matrix(B_handle, B) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ if (init_matrix(C_handle, C) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ switch (A->precision)
+ {
+ case ViennaCLFloat:
+ {
+ typedef viennacl::matrix_base<float>::size_type size_type;
+ typedef viennacl::matrix_base<float>::size_type difference_type;
+
+ viennacl::matrix_base<float> mat_A(A_handle,
+ size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+ size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+ viennacl::matrix_base<float> mat_B(B_handle,
+ size_type(B->size1), size_type(B->start1), difference_type(B->stride1), size_type(B->internal_size1),
+ size_type(B->size2), size_type(B->start2), difference_type(B->stride2), size_type(B->internal_size2), B->order == ViennaCLRowMajor);
+ viennacl::matrix_base<float> mat_C(C_handle,
+ size_type(C->size1), size_type(C->start1), difference_type(C->stride1), size_type(C->internal_size1),
+ size_type(C->size2), size_type(C->start2), difference_type(C->stride2), size_type(C->internal_size2), C->order == ViennaCLRowMajor);
+
+ if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+ viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+ else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+ viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_float, beta->value_float);
+ else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+ viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
+ else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+ viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_float, beta->value_float);
+ else
+ return ViennaCLGenericFailure;
+
+ return ViennaCLSuccess;
+ }
+
+ case ViennaCLDouble:
+ {
+ typedef viennacl::matrix_base<double>::size_type size_type;
+ typedef viennacl::matrix_base<double>::size_type difference_type;
+
+ viennacl::matrix_base<double> mat_A(A_handle,
+ size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+ size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+ viennacl::matrix_base<double> mat_B(B_handle,
+ size_type(B->size1), size_type(B->start1), difference_type(B->stride1), size_type(B->internal_size1),
+ size_type(B->size2), size_type(B->start2), difference_type(B->stride2), size_type(B->internal_size2), B->order == ViennaCLRowMajor);
+ viennacl::matrix_base<double> mat_C(C_handle,
+ size_type(C->size1), size_type(C->start1), difference_type(C->stride1), size_type(C->internal_size1),
+ size_type(C->size2), size_type(C->start2), difference_type(C->stride2), size_type(C->internal_size2), C->order == ViennaCLRowMajor);
+
+ if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+ viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+ else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+ viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_double, beta->value_double);
+ else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+ viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
+ else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+ viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_double, beta->value_double);
+ else
+ return ViennaCLGenericFailure;
+
+ return ViennaCLSuccess;
+ }
+
+ default:
+ return ViennaCLGenericFailure;
+ }
+}
+
+
+// xTRSM
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLtrsm(ViennaCLMatrix A, ViennaCLUplo uplo, ViennaCLDiag diag, ViennaCLMatrix B)
+{
+ viennacl::backend::mem_handle A_handle;
+ viennacl::backend::mem_handle B_handle;
+
+ if (init_matrix(A_handle, A) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ if (init_matrix(B_handle, B) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ switch (A->precision)
+ {
+ case ViennaCLFloat:
+ {
+ typedef viennacl::matrix_base<float>::size_type size_type;
+ typedef viennacl::matrix_base<float>::size_type difference_type;
+
+ viennacl::matrix_base<float> mat_A(A_handle,
+ size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+ size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+ viennacl::matrix_base<float> mat_B(B_handle,
+ size_type(B->size1), size_type(B->start1), difference_type(B->stride1), size_type(B->internal_size1),
+ size_type(B->size2), size_type(B->start2), difference_type(B->stride2), size_type(B->internal_size2), B->order == ViennaCLRowMajor);
+
+ if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+ {
+ if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+ else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+ else
+ return ViennaCLGenericFailure;
+ }
+ else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+ {
+ if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::upper_tag());
+ else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_upper_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::lower_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_lower_tag());
+ else
+ return ViennaCLGenericFailure;
+ }
+ else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+ {
+ if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+ else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+ else
+ return ViennaCLGenericFailure;
+ }
+ else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+ {
+ if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::upper_tag());
+ else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_upper_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::lower_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_lower_tag());
+ else
+ return ViennaCLGenericFailure;
+ }
+
+ return ViennaCLSuccess;
+ }
+ case ViennaCLDouble:
+ {
+ typedef viennacl::matrix_base<double>::size_type size_type;
+ typedef viennacl::matrix_base<double>::size_type difference_type;
+
+ viennacl::matrix_base<double> mat_A(A_handle,
+ size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
+ size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
+ viennacl::matrix_base<double> mat_B(B_handle,
+ size_type(B->size1), size_type(B->start1), difference_type(B->stride1), size_type(B->internal_size1),
+ size_type(B->size2), size_type(B->start2), difference_type(B->stride2), size_type(B->internal_size2), B->order == ViennaCLRowMajor);
+
+ if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
+ {
+ if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+ else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+ else
+ return ViennaCLGenericFailure;
+ }
+ else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
+ {
+ if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::upper_tag());
+ else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_upper_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::lower_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_lower_tag());
+ else
+ return ViennaCLGenericFailure;
+ }
+ else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
+ {
+ if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::upper_tag());
+ else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::lower_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
+ else
+ return ViennaCLGenericFailure;
+ }
+ else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
+ {
+ if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::upper_tag());
+ else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_upper_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
+ viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::lower_tag());
+ else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
+ viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_lower_tag());
+ else
+ return ViennaCLGenericFailure;
+ }
+
+ return ViennaCLSuccess;
+ }
+
+ default:
+ return ViennaCLGenericFailure;
+ }
+}
+
+
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas3.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas3.hpp b/native-viennaCL/src/main/cpp/libviennacl/src/blas3.hpp
new file mode 100644
index 0000000..cfcc034
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas3.hpp
@@ -0,0 +1,60 @@
+#ifndef VIENNACL_SRC_BLAS3_HPP
+#define VIENNACL_SRC_BLAS3_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2014, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+namespace detail
+{
+ template <typename ScalarType, typename MatrixTypeA, typename MatrixTypeB, typename MatrixTypeC>
+ void gemm_dispatch(ScalarType alpha,
+ MatrixTypeA const & A, ViennaCLTranspose transA,
+ MatrixTypeB const & B, ViennaCLTranspose transB,
+ ScalarType beta,
+ MatrixTypeC & C)
+ {
+
+ if (transA == ViennaCLTrans && transB == ViennaCLTrans)
+ viennacl::linalg::prod_impl(viennacl::trans(A), viennacl::trans(B), C, alpha, beta);
+ else if (transA == ViennaCLTrans && transB == ViennaCLNoTrans)
+ viennacl::linalg::prod_impl(viennacl::trans(A), B, C, alpha, beta);
+ else if (transA == ViennaCLNoTrans && transB == ViennaCLTrans)
+ viennacl::linalg::prod_impl(A, viennacl::trans(B), C, alpha, beta);
+ else if (transA == ViennaCLNoTrans && transB == ViennaCLNoTrans)
+ viennacl::linalg::prod_impl(A, B, C, alpha, beta);
+ //else
+ // return ViennaCLGenericFailure;
+ }
+}
+
+
+#endif
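detail::gemm_dispatch above only selects among the four transpose combinations and forwards to viennacl::linalg::prod_impl; it performs no size checking of its own. The sketch below drives it directly with host-memory matrix_base wrappers, following the same constructor pattern the blas3_host translation unit below uses (dense row-major storage, zero offsets, unit strides); the sizes and the include path for blas3.hpp are illustrative assumptions about how this module is built.

#include <vector>
#include "viennacl/matrix.hpp"
#include "blas3.hpp"                 // this header; the include path depends on the build setup

int main()
{
  typedef viennacl::matrix_base<float>::size_type size_type;

  const size_type m = 4, n = 3, k = 5;
  std::vector<float> A(m * k, 1.0f), B(k * n, 2.0f), C(m * n, 0.0f);

  // Row-major, densely packed host wrappers: start = 0, stride = 1,
  // internal sizes equal to the logical sizes (cf. blas3_host.cpp below).
  viennacl::matrix_base<float> matA(&A[0], viennacl::MAIN_MEMORY, m, 0, 1, m, k, 0, 1, k, true);
  viennacl::matrix_base<float> matB(&B[0], viennacl::MAIN_MEMORY, k, 0, 1, k, n, 0, 1, n, true);
  viennacl::matrix_base<float> matC(&C[0], viennacl::MAIN_MEMORY, m, 0, 1, m, n, 0, 1, n, true);

  // C = 1 * A * B + 0 * C; with these fill values every entry of C becomes k * 1 * 2 = 10.
  detail::gemm_dispatch(1.0f, matA, ViennaCLNoTrans, matB, ViennaCLNoTrans, 0.0f, matC);
  return 0;
}
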
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas3_cuda.cu
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas3_cuda.cu b/native-viennaCL/src/main/cpp/libviennacl/src/blas3_cuda.cu
new file mode 100644
index 0000000..318593b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas3_cuda.cu
@@ -0,0 +1,133 @@
+/* =========================================================================
+ Copyright (c) 2010-2014, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "blas3.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+#ifdef VIENNACL_WITH_CUDA
+
+
+
+//
+// xGEMM
+//
+
+namespace detail
+{
+ template <typename NumericT>
+ ViennaCLStatus ViennaCLCUDAgemm_impl(ViennaCLBackend /*backend*/,
+ ViennaCLOrder orderA, ViennaCLTranspose transA,
+ ViennaCLOrder orderB, ViennaCLTranspose transB,
+ ViennaCLOrder orderC,
+ ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+ NumericT alpha,
+ NumericT *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ NumericT *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+ NumericT beta,
+ NumericT *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+ {
+ ViennaCLInt A_size1 = (transA == ViennaCLTrans) ? k : m;
+ ViennaCLInt A_size2 = (transA == ViennaCLTrans) ? m : k;
+
+ ViennaCLInt B_size1 = (transB == ViennaCLTrans) ? n : k;
+ ViennaCLInt B_size2 = (transB == ViennaCLTrans) ? k : n;
+
+ bool A_row_major = (orderA == ViennaCLRowMajor);
+ bool B_row_major = (orderB == ViennaCLRowMajor);
+ bool C_row_major = (orderC == ViennaCLRowMajor);
+
+ viennacl::matrix_base<NumericT> matA(A, viennacl::CUDA_MEMORY,
+ A_size1, offA_row, incA_row, A_row_major ? m : lda,
+ A_size2, offA_col, incA_col, A_row_major ? lda : k, A_row_major);
+
+ viennacl::matrix_base<NumericT> matB(B, viennacl::CUDA_MEMORY,
+ B_size1, offB_row, incB_row, B_row_major ? k : ldb,
+ B_size2, offB_col, incB_col, B_row_major ? ldb : n, B_row_major);
+
+ viennacl::matrix_base<NumericT> matC(C, viennacl::CUDA_MEMORY,
+ m, offC_row, incC_row, C_row_major ? m : ldc,
+ n, offC_col, incC_col, C_row_major ? ldc : n, C_row_major);
+
+ detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+
+ return ViennaCLSuccess;
+ }
+
+}
+
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASgemm(ViennaCLBackend backend,
+ ViennaCLOrder orderA, ViennaCLTranspose transA,
+ ViennaCLOrder orderB, ViennaCLTranspose transB,
+ ViennaCLOrder orderC,
+ ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+ float alpha,
+ float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ float *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+ float beta,
+ float *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+{
+ return detail::ViennaCLCUDAgemm_impl<float>(backend,
+ orderA, transA,
+ orderB, transB,
+ orderC,
+ m, n, k,
+ alpha,
+ A, offA_row, offA_col, incA_row, incA_col, lda,
+ B, offB_row, offB_col, incB_row, incB_col, ldb,
+ beta,
+ C, offC_row, offC_col, incC_row, incC_col, ldc);
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADgemm(ViennaCLBackend backend,
+ ViennaCLOrder orderA, ViennaCLTranspose transA,
+ ViennaCLOrder orderB, ViennaCLTranspose transB,
+ ViennaCLOrder orderC,
+ ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+ double alpha,
+ double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ double *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+ double beta,
+ double *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+{
+ return detail::ViennaCLCUDAgemm_impl<double>(backend,
+ orderA, transA,
+ orderB, transB,
+ orderC,
+ m, n, k,
+ alpha,
+ A, offA_row, offA_col, incA_row, incA_col, lda,
+ B, offB_row, offB_col, incB_row, incB_col, ldb,
+ beta,
+ C, offC_row, offC_col, incC_row, incC_col, ldc);
+}
+
+
+#endif
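The CUDA entry points above wrap raw device pointers via viennacl::CUDA_MEMORY; note that ViennaCLCUDAgemm_impl never touches its backend argument (the parameter name is commented out), so the sketch below simply passes NULL for it. A minimal single-precision example with 2x2 row-major operands; the shapes, fill values and dense leading dimension of 2 are illustrative.

#ifdef VIENNACL_WITH_CUDA
#include <cuda_runtime.h>
#include "viennacl.hpp"              // public libviennacl header, as included by this file

int main()
{
  // Row-major 2x2 operands: A = [1 2; 3 4], B = [5 6; 7 8], expected A*B = [19 22; 43 50].
  float hA[4] = {1, 2, 3, 4};
  float hB[4] = {5, 6, 7, 8};
  float hC[4] = {0, 0, 0, 0};

  float *dA = 0, *dB = 0, *dC = 0;
  cudaMalloc((void**)&dA, sizeof(hA));
  cudaMalloc((void**)&dB, sizeof(hB));
  cudaMalloc((void**)&dC, sizeof(hC));
  cudaMemcpy(dA, hA, sizeof(hA), cudaMemcpyHostToDevice);
  cudaMemcpy(dB, hB, sizeof(hB), cudaMemcpyHostToDevice);
  cudaMemcpy(dC, hC, sizeof(hC), cudaMemcpyHostToDevice);

  // C = 1 * A * B + 0 * C; offsets 0, unit increments, leading dimension 2 for all operands.
  ViennaCLCUDASgemm(NULL,
                    ViennaCLRowMajor, ViennaCLNoTrans,
                    ViennaCLRowMajor, ViennaCLNoTrans,
                    ViennaCLRowMajor,
                    2, 2, 2,
                    1.0f,
                    dA, 0, 0, 1, 1, 2,
                    dB, 0, 0, 1, 1, 2,
                    0.0f,
                    dC, 0, 0, 1, 1, 2);

  cudaMemcpy(hC, dC, sizeof(hC), cudaMemcpyDeviceToHost);
  cudaFree(dA); cudaFree(dB); cudaFree(dC);
  return 0;
}
#endif
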
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas3_host.cpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas3_host.cpp b/native-viennaCL/src/main/cpp/libviennacl/src/blas3_host.cpp
new file mode 100644
index 0000000..16ef310
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas3_host.cpp
@@ -0,0 +1,131 @@
+/* =========================================================================
+ Copyright (c) 2010-2014, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "blas3.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+//
+// xGEMM
+//
+
+namespace detail
+{
+ template <typename NumericT>
+ ViennaCLStatus ViennaCLHostgemm_impl(ViennaCLBackend /*backend*/,
+ ViennaCLOrder orderA, ViennaCLTranspose transA,
+ ViennaCLOrder orderB, ViennaCLTranspose transB,
+ ViennaCLOrder orderC,
+ ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+ NumericT alpha,
+ NumericT *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ NumericT *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+ NumericT beta,
+ NumericT *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+ {
+ typedef typename viennacl::matrix_base<NumericT>::size_type size_type;
+ typedef typename viennacl::matrix_base<NumericT>::size_type difference_type;
+
+ size_type A_size1 = static_cast<size_type>((transA == ViennaCLTrans) ? k : m);
+ size_type A_size2 = static_cast<size_type>((transA == ViennaCLTrans) ? m : k);
+
+ size_type B_size1 = static_cast<size_type>((transB == ViennaCLTrans) ? n : k);
+ size_type B_size2 = static_cast<size_type>((transB == ViennaCLTrans) ? k : n);
+
+ bool A_row_major = (orderA == ViennaCLRowMajor);
+ bool B_row_major = (orderB == ViennaCLRowMajor);
+ bool C_row_major = (orderC == ViennaCLRowMajor);
+
+ viennacl::matrix_base<NumericT> matA(A, viennacl::MAIN_MEMORY,
+ A_size1, size_type(offA_row), difference_type(incA_row), size_type(A_row_major ? m : lda),
+ A_size2, size_type(offA_col), difference_type(incA_col), size_type(A_row_major ? lda : k), A_row_major);
+
+ viennacl::matrix_base<NumericT> matB(B, viennacl::MAIN_MEMORY,
+ B_size1, size_type(offB_row), difference_type(incB_row), size_type(B_row_major ? k : ldb),
+ B_size2, size_type(offB_col), difference_type(incB_col), size_type(B_row_major ? ldb : n), B_row_major);
+
+ viennacl::matrix_base<NumericT> matC(C, viennacl::MAIN_MEMORY,
+ size_type(m), size_type(offC_row), difference_type(incC_row), size_type(C_row_major ? m : ldc),
+ size_type(n), size_type(offC_col), difference_type(incC_col), size_type(C_row_major ? ldc : n), C_row_major);
+
+ detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+
+ return ViennaCLSuccess;
+ }
+
+}
+
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSgemm(ViennaCLBackend backend,
+ ViennaCLOrder orderA, ViennaCLTranspose transA,
+ ViennaCLOrder orderB, ViennaCLTranspose transB,
+ ViennaCLOrder orderC,
+ ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+ float alpha,
+ float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ float *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+ float beta,
+ float *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+{
+ return detail::ViennaCLHostgemm_impl<float>(backend,
+ orderA, transA,
+ orderB, transB,
+ orderC,
+ m, n, k,
+ alpha,
+ A, offA_row, offA_col, incA_row, incA_col, lda,
+ B, offB_row, offB_col, incB_row, incB_col, ldb,
+ beta,
+ C, offC_row, offC_col, incC_row, incC_col, ldc);
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDgemm(ViennaCLBackend backend,
+ ViennaCLOrder orderA, ViennaCLTranspose transA,
+ ViennaCLOrder orderB, ViennaCLTranspose transB,
+ ViennaCLOrder orderC,
+ ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+ double alpha,
+ double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ double *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+ double beta,
+ double *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+{
+ return detail::ViennaCLHostgemm_impl<double>(backend,
+ orderA, transA,
+ orderB, transB,
+ orderC,
+ m, n, k,
+ alpha,
+ A, offA_row, offA_col, incA_row, incA_col, lda,
+ B, offB_row, offB_col, incB_row, incB_col, ldb,
+ beta,
+ C, offC_row, offC_col, incC_row, incC_col, ldc);
+}
+
+
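The host entry points operate directly on caller-owned arrays (viennacl::MAIN_MEMORY) and, like the CUDA path, ignore their backend argument, so NULL is passed below. This sketch also shows that the leading dimensions are independent of the logical sizes: the 2x2 operands live in the top-left corner of row-major 3x3 buffers (lda = ldb = ldc = 3); all values are illustrative.

#include "viennacl.hpp"              // public libviennacl header, as included by this file

int main()
{
  // Row-major 3x3 storage; only the top-left 2x2 blocks take part in the product.
  float A[9] = {1, 2, 0,
                3, 4, 0,
                0, 0, 0};
  float B[9] = {5, 6, 0,
                7, 8, 0,
                0, 0, 0};
  float C[9] = {0};

  // C(0:2,0:2) = 1 * A(0:2,0:2) * B(0:2,0:2) + 0 * C(0:2,0:2)
  // -> C[0] = 19, C[1] = 22, C[3] = 43, C[4] = 50 (row-major with ldc = 3).
  ViennaCLHostSgemm(NULL,
                    ViennaCLRowMajor, ViennaCLNoTrans,
                    ViennaCLRowMajor, ViennaCLNoTrans,
                    ViennaCLRowMajor,
                    2, 2, 2,
                    1.0f,
                    A, 0, 0, 1, 1, 3,
                    B, 0, 0, 1, 1, 3,
                    0.0f,
                    C, 0, 0, 1, 1, 3);
  return 0;
}
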
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas3_host.cu
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas3_host.cu b/native-viennaCL/src/main/cpp/libviennacl/src/blas3_host.cu
new file mode 100644
index 0000000..16ef310
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas3_host.cu
@@ -0,0 +1,131 @@
+/* =========================================================================
+ Copyright (c) 2010-2014, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "blas3.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+//
+// xGEMM
+//
+
+namespace detail
+{
+ template <typename NumericT>
+ ViennaCLStatus ViennaCLHostgemm_impl(ViennaCLBackend /*backend*/,
+ ViennaCLOrder orderA, ViennaCLTranspose transA,
+ ViennaCLOrder orderB, ViennaCLTranspose transB,
+ ViennaCLOrder orderC,
+ ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+ NumericT alpha,
+ NumericT *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ NumericT *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+ NumericT beta,
+ NumericT *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+ {
+ typedef typename viennacl::matrix_base<NumericT>::size_type size_type;
+ typedef typename viennacl::matrix_base<NumericT>::size_type difference_type;
+
+ size_type A_size1 = static_cast<size_type>((transA == ViennaCLTrans) ? k : m);
+ size_type A_size2 = static_cast<size_type>((transA == ViennaCLTrans) ? m : k);
+
+ size_type B_size1 = static_cast<size_type>((transB == ViennaCLTrans) ? n : k);
+ size_type B_size2 = static_cast<size_type>((transB == ViennaCLTrans) ? k : n);
+
+ bool A_row_major = (orderA == ViennaCLRowMajor);
+ bool B_row_major = (orderB == ViennaCLRowMajor);
+ bool C_row_major = (orderC == ViennaCLRowMajor);
+
+ viennacl::matrix_base<NumericT> matA(A, viennacl::MAIN_MEMORY,
+ A_size1, size_type(offA_row), difference_type(incA_row), size_type(A_row_major ? m : lda),
+ A_size2, size_type(offA_col), difference_type(incA_col), size_type(A_row_major ? lda : k), A_row_major);
+
+ viennacl::matrix_base<NumericT> matB(B, viennacl::MAIN_MEMORY,
+ B_size1, size_type(offB_row), difference_type(incB_row), size_type(B_row_major ? k : ldb),
+ B_size2, size_type(offB_col), difference_type(incB_col), size_type(B_row_major ? ldb : n), B_row_major);
+
+ viennacl::matrix_base<NumericT> matC(C, viennacl::MAIN_MEMORY,
+ size_type(m), size_type(offC_row), difference_type(incC_row), size_type(C_row_major ? m : ldc),
+ size_type(n), size_type(offC_col), difference_type(incC_col), size_type(C_row_major ? ldc : n), C_row_major);
+
+ detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+
+ return ViennaCLSuccess;
+ }
+
+}
+
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSgemm(ViennaCLBackend backend,
+ ViennaCLOrder orderA, ViennaCLTranspose transA,
+ ViennaCLOrder orderB, ViennaCLTranspose transB,
+ ViennaCLOrder orderC,
+ ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+ float alpha,
+ float *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ float *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+ float beta,
+ float *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+{
+ return detail::ViennaCLHostgemm_impl<float>(backend,
+ orderA, transA,
+ orderB, transB,
+ orderC,
+ m, n, k,
+ alpha,
+ A, offA_row, offA_col, incA_row, incA_col, lda,
+ B, offB_row, offB_col, incB_row, incB_col, ldb,
+ beta,
+ C, offC_row, offC_col, incC_row, incC_col, ldc);
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDgemm(ViennaCLBackend backend,
+ ViennaCLOrder orderA, ViennaCLTranspose transA,
+ ViennaCLOrder orderB, ViennaCLTranspose transB,
+ ViennaCLOrder orderC,
+ ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+ double alpha,
+ double *A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ double *B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+ double beta,
+ double *C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+{
+ return detail::ViennaCLHostgemm_impl<double>(backend,
+ orderA, transA,
+ orderB, transB,
+ orderC,
+ m, n, k,
+ alpha,
+ A, offA_row, offA_col, incA_row, incA_col, lda,
+ B, offB_row, offB_col, incB_row, incB_col, ldb,
+ beta,
+ C, offC_row, offC_col, incC_row, incC_col, ldc);
+}
+
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas3_opencl.cpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas3_opencl.cpp b/native-viennaCL/src/main/cpp/libviennacl/src/blas3_opencl.cpp
new file mode 100644
index 0000000..d5e5c1e
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas3_opencl.cpp
@@ -0,0 +1,136 @@
+/* =========================================================================
+ Copyright (c) 2010-2014, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "blas3.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+#ifdef VIENNACL_WITH_OPENCL
+
+
+
+//
+// xGEMM
+//
+
+namespace detail
+{
+ template <typename NumericT>
+ ViennaCLStatus ViennaCLOpenCLgemm_impl(ViennaCLBackend backend,
+ ViennaCLOrder orderA, ViennaCLTranspose transA,
+ ViennaCLOrder orderB, ViennaCLTranspose transB,
+ ViennaCLOrder orderC,
+ ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+ NumericT alpha,
+ cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ cl_mem B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+ NumericT beta,
+ cl_mem C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+ {
+ typedef typename viennacl::matrix_base<NumericT>::size_type size_type;
+ typedef typename viennacl::matrix_base<NumericT>::size_type difference_type;
+
+ size_type A_size1 = static_cast<size_type>((transA == ViennaCLTrans) ? k : m);
+ size_type A_size2 = static_cast<size_type>((transA == ViennaCLTrans) ? m : k);
+
+ size_type B_size1 = static_cast<size_type>((transB == ViennaCLTrans) ? n : k);
+ size_type B_size2 = static_cast<size_type>((transB == ViennaCLTrans) ? k : n);
+
+ bool A_row_major = (orderA == ViennaCLRowMajor);
+ bool B_row_major = (orderB == ViennaCLRowMajor);
+ bool C_row_major = (orderC == ViennaCLRowMajor);
+
+ viennacl::matrix_base<NumericT> matA(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+ A_size1, size_type(offA_row), difference_type(incA_row), size_type(A_row_major ? m : lda),
+ A_size2, size_type(offA_col), difference_type(incA_col), size_type(A_row_major ? lda : k), A_row_major);
+
+ viennacl::matrix_base<NumericT> matB(B, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+ B_size1, size_type(offB_row), difference_type(incB_row), size_type(B_row_major ? k : ldb),
+ B_size2, size_type(offB_col), difference_type(incB_col), size_type(B_row_major ? ldb : n), B_row_major);
+
+ viennacl::matrix_base<NumericT> matC(C, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+ size_type(m), size_type(offC_row), difference_type(incC_row), size_type(C_row_major ? m : ldc),
+ size_type(n), size_type(offC_col), difference_type(incC_col), size_type(C_row_major ? ldc : n), C_row_major);
+
+ detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+
+ return ViennaCLSuccess;
+ }
+
+}
+
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSgemm(ViennaCLBackend backend,
+ ViennaCLOrder orderA, ViennaCLTranspose transA,
+ ViennaCLOrder orderB, ViennaCLTranspose transB,
+ ViennaCLOrder orderC,
+ ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+ float alpha,
+ cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ cl_mem B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+ float beta,
+ cl_mem C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+{
+ return detail::ViennaCLOpenCLgemm_impl<float>(backend,
+ orderA, transA,
+ orderB, transB,
+ orderC,
+ m, n, k,
+ alpha,
+ A, offA_row, offA_col, incA_row, incA_col, lda,
+ B, offB_row, offB_col, incB_row, incB_col, ldb,
+ beta,
+ C, offC_row, offC_col, incC_row, incC_col, ldc);
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDgemm(ViennaCLBackend backend,
+ ViennaCLOrder orderA, ViennaCLTranspose transA,
+ ViennaCLOrder orderB, ViennaCLTranspose transB,
+ ViennaCLOrder orderC,
+ ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+ double alpha,
+ cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ cl_mem B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+ double beta,
+ cl_mem C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+{
+ return detail::ViennaCLOpenCLgemm_impl<double>(backend,
+ orderA, transA,
+ orderB, transB,
+ orderC,
+ m, n, k,
+ alpha,
+ A, offA_row, offA_col, incA_row, incA_col, lda,
+ B, offB_row, offB_col, incB_row, incB_col, ldb,
+ beta,
+ C, offC_row, offC_col, incC_row, incC_col, ldc);
+}
+
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas3_opencl.cu
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas3_opencl.cu b/native-viennaCL/src/main/cpp/libviennacl/src/blas3_opencl.cu
new file mode 100644
index 0000000..d5e5c1e
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas3_opencl.cu
@@ -0,0 +1,136 @@
+/* =========================================================================
+ Copyright (c) 2010-2014, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "blas3.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+
+#ifdef VIENNACL_WITH_OPENCL
+
+
+
+//
+// xGEMM
+//
+
+namespace detail
+{
+ template <typename NumericT>
+ ViennaCLStatus ViennaCLOpenCLgemm_impl(ViennaCLBackend backend,
+ ViennaCLOrder orderA, ViennaCLTranspose transA,
+ ViennaCLOrder orderB, ViennaCLTranspose transB,
+ ViennaCLOrder orderC,
+ ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+ NumericT alpha,
+ cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ cl_mem B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+ NumericT beta,
+ cl_mem C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+ {
+ typedef typename viennacl::matrix_base<NumericT>::size_type size_type;
+ typedef typename viennacl::matrix_base<NumericT>::size_type difference_type;
+
+ size_type A_size1 = static_cast<size_type>((transA == ViennaCLTrans) ? k : m);
+ size_type A_size2 = static_cast<size_type>((transA == ViennaCLTrans) ? m : k);
+
+ size_type B_size1 = static_cast<size_type>((transB == ViennaCLTrans) ? n : k);
+ size_type B_size2 = static_cast<size_type>((transB == ViennaCLTrans) ? k : n);
+
+ bool A_row_major = (orderA == ViennaCLRowMajor);
+ bool B_row_major = (orderB == ViennaCLRowMajor);
+ bool C_row_major = (orderC == ViennaCLRowMajor);
+
+ viennacl::matrix_base<NumericT> matA(A, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+ A_size1, size_type(offA_row), difference_type(incA_row), size_type(A_row_major ? m : lda),
+ A_size2, size_type(offA_col), difference_type(incA_col), size_type(A_row_major ? lda : k), A_row_major);
+
+ viennacl::matrix_base<NumericT> matB(B, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+ B_size1, size_type(offB_row), difference_type(incB_row), size_type(B_row_major ? k : ldb),
+ B_size2, size_type(offB_col), difference_type(incB_col), size_type(B_row_major ? ldb : n), B_row_major);
+
+ viennacl::matrix_base<NumericT> matC(C, viennacl::ocl::get_context(backend->opencl_backend.context_id),
+ size_type(m), size_type(offC_row), difference_type(incC_row), size_type(C_row_major ? m : ldc),
+ size_type(n), size_type(offC_col), difference_type(incC_col), size_type(C_row_major ? ldc : n), C_row_major);
+
+ detail::gemm_dispatch(alpha, matA, transA, matB, transB, beta, matC);
+
+ return ViennaCLSuccess;
+ }
+
+}
+
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSgemm(ViennaCLBackend backend,
+ ViennaCLOrder orderA, ViennaCLTranspose transA,
+ ViennaCLOrder orderB, ViennaCLTranspose transB,
+ ViennaCLOrder orderC,
+ ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+ float alpha,
+ cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ cl_mem B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+ float beta,
+ cl_mem C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+{
+ return detail::ViennaCLOpenCLgemm_impl<float>(backend,
+ orderA, transA,
+ orderB, transB,
+ orderC,
+ m, n, k,
+ alpha,
+ A, offA_row, offA_col, incA_row, incA_col, lda,
+ B, offB_row, offB_col, incB_row, incB_col, ldb,
+ beta,
+ C, offC_row, offC_col, incC_row, incC_col, ldc);
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDgemm(ViennaCLBackend backend,
+ ViennaCLOrder orderA, ViennaCLTranspose transA,
+ ViennaCLOrder orderB, ViennaCLTranspose transB,
+ ViennaCLOrder orderC,
+ ViennaCLInt m, ViennaCLInt n, ViennaCLInt k,
+ double alpha,
+ cl_mem A, ViennaCLInt offA_row, ViennaCLInt offA_col, ViennaCLInt incA_row, ViennaCLInt incA_col, ViennaCLInt lda,
+ cl_mem B, ViennaCLInt offB_row, ViennaCLInt offB_col, ViennaCLInt incB_row, ViennaCLInt incB_col, ViennaCLInt ldb,
+ double beta,
+ cl_mem C, ViennaCLInt offC_row, ViennaCLInt offC_col, ViennaCLInt incC_row, ViennaCLInt incC_col, ViennaCLInt ldc)
+{
+ return detail::ViennaCLOpenCLgemm_impl<double>(backend,
+ orderA, transA,
+ orderB, transB,
+ orderC,
+ m, n, k,
+ alpha,
+ A, offA_row, offA_col, incA_row, incA_col, lda,
+ B, offB_row, offB_col, incB_row, incB_col, ldb,
+ beta,
+ C, offC_row, offC_col, incC_row, incC_col, ldc);
+}
+
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/southern_islands/tahiti.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/southern_islands/tahiti.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/southern_islands/tahiti.hpp
new file mode 100644
index 0000000..aec9043
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/southern_islands/tahiti.hpp
@@ -0,0 +1,84 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_SOUTHERN_ISLANDS_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_SOUTHERN_ISLANDS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace amd{
+namespace southern_islands{
+namespace tahiti{
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+ db.add_8B(amd_id, CL_DEVICE_TYPE_GPU, ocl::southern_islands, "Tahiti", matrix_product_template::parameters_type(1,32,16,8,1,1,16,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,16));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+ db.add_8B(amd_id, CL_DEVICE_TYPE_GPU, ocl::southern_islands, "Tahiti", matrix_product_template::parameters_type(1,16,16,16,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,4,64));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+ db.add_8B(amd_id, CL_DEVICE_TYPE_GPU, ocl::southern_islands, "Tahiti", matrix_product_template::parameters_type(2,8,2,16,4,2,4,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+ db.add_8B(amd_id, CL_DEVICE_TYPE_GPU, ocl::southern_islands, "Tahiti", matrix_product_template::parameters_type(1,16,4,4,4,2,4,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_LOCAL,4,16));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+ db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::southern_islands, "Tahiti", matrix_product_template::parameters_type(1,8,32,32,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,32,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+ db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::southern_islands, "Tahiti", matrix_product_template::parameters_type(1,8,32,32,4,1,2,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,32,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+ db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::southern_islands, "Tahiti", matrix_product_template::parameters_type(1,64,32,4,4,2,4,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_LOCAL,16,16));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+ db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::southern_islands, "Tahiti", matrix_product_template::parameters_type(1,128,32,2,2,1,8,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_LOCAL,32,8));
+}
+
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif
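Each add_4B / add_8B overload in this header registers one tuned matrix-product profile, keyed by vendor id, device type, architecture and device name, for one element size (4-byte or 8-byte scalars) and one (transA, transB) kernel variant selected by the char_to_type tags. Below is a purely hypothetical sketch of what an entry for an additional Southern-Islands device would look like; the device name "MyGPU" is invented, and the parameter values are copied from the Tahiti NN entry above only as placeholders, not as tuned numbers.

// Hypothetical: such an entry would live in its own namespace (e.g. ...::southern_islands::mygpu)
// alongside the tahiti namespace above.
inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
{
  db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::southern_islands, "MyGPU", matrix_product_template::parameters_type(1,128,32,2,2,1,8,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_LOCAL,32,8));
}
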
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/volcanic_islands/hawaii.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/volcanic_islands/hawaii.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/volcanic_islands/hawaii.hpp
new file mode 100644
index 0000000..c2674f0
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/amd/volcanic_islands/hawaii.hpp
@@ -0,0 +1,84 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_VOLCANIC_ISLANDS_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_AMD_VOLCANIC_ISLANDS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace amd{
+namespace volcanic_islands{
+namespace hawaii{
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+ db.add_8B(amd_id, CL_DEVICE_TYPE_GPU, ocl::volcanic_islands, "Hawaii", matrix_product_template::parameters_type(1,8,16,32,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,16));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+ db.add_8B(amd_id, CL_DEVICE_TYPE_GPU, ocl::volcanic_islands, "Hawaii", matrix_product_template::parameters_type(1,8,16,32,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,16));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+ db.add_8B(amd_id, CL_DEVICE_TYPE_GPU, ocl::volcanic_islands, "Hawaii", matrix_product_template::parameters_type(2,8,8,8,6,1,6,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+ db.add_8B(amd_id, CL_DEVICE_TYPE_GPU, ocl::volcanic_islands, "Hawaii", matrix_product_template::parameters_type(1,16,16,16,2,1,8,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,16));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+ db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::volcanic_islands, "Hawaii", matrix_product_template::parameters_type(1,1,2,64,8,2,4,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+ db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::volcanic_islands, "Hawaii", matrix_product_template::parameters_type(1,16,16,16,2,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,16));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+ db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::volcanic_islands, "Hawaii", matrix_product_template::parameters_type(2,16,16,16,6,1,6,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,16));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+ db.add_4B(amd_id, CL_DEVICE_TYPE_GPU, ocl::volcanic_islands, "Hawaii", matrix_product_template::parameters_type(1,64,64,4,2,4,8,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_LOCAL,64,4));
+}
+
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/fallback.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/fallback.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/fallback.hpp
new file mode 100644
index 0000000..ff307f3
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/fallback.hpp
@@ -0,0 +1,84 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_FALLBACK_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_FALLBACK_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace fallback{
+
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+ db.add_4B(unknown_id, CL_DEVICE_TYPE_GPU, unknown, "", matrix_product_template::parameters_type(1,8,8,8,4,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+ db.add_4B(unknown_id, CL_DEVICE_TYPE_GPU, unknown, "", matrix_product_template::parameters_type(1,8,8,8,4,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+ db.add_4B(unknown_id, CL_DEVICE_TYPE_GPU, unknown, "", matrix_product_template::parameters_type(1,8,8,8,4,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+ db.add_4B(unknown_id, CL_DEVICE_TYPE_GPU, unknown, "", matrix_product_template::parameters_type(1,8,8,8,4,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+ db.add_8B(unknown_id, CL_DEVICE_TYPE_GPU, unknown, "", matrix_product_template::parameters_type(1,8,8,8,4,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+ db.add_8B(unknown_id, CL_DEVICE_TYPE_GPU, unknown, "", matrix_product_template::parameters_type(1,8,8,8,4,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+ db.add_8B(unknown_id, CL_DEVICE_TYPE_GPU, unknown, "", matrix_product_template::parameters_type(1,8,8,8,4,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+ db.add_8B(unknown_id, CL_DEVICE_TYPE_GPU, unknown, "", matrix_product_template::parameters_type(1,8,8,8,4,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,8,8));
+}
+
+
+}
+}
+}
+}
+}
+}
+
+
+#endif
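
All of the device files in this hunk, including the generic GPU fallback above, select their add_4B/add_8B overload through the char_to_type<'N'> / char_to_type<'T'> tag arguments. char_to_type itself is defined in viennacl/device_specific/builtin_database/common.hpp, which is not part of this hunk; the standalone sketch below only illustrates the tag-dispatch idiom and assumes a minimal definition of that template (register_profile is a made-up stand-in for the add_4B overload set, not a ViennaCL function).

#include <iostream>

// Assumed minimal form of the tag template (the real one lives in common.hpp).
template<char C>
struct char_to_type {};

// One overload per (A-layout, B-layout) pair; the empty tags carry the choice at compile time.
inline void register_profile(char_to_type<'N'>, char_to_type<'N'>) { std::cout << "NN profile\n"; }
inline void register_profile(char_to_type<'T'>, char_to_type<'N'>) { std::cout << "TN profile\n"; }

int main()
{
  register_profile(char_to_type<'N'>(), char_to_type<'N'>());  // resolves to the NN overload
  register_profile(char_to_type<'T'>(), char_to_type<'N'>());  // resolves to the TN overload
  return 0;
}
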
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gt_540m.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gt_540m.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gt_540m.hpp
new file mode 100644
index 0000000..24c02b7
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gt_540m.hpp
@@ -0,0 +1,59 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_NVIDIA_FERMI_GT540M_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_NVIDIA_FERMI_GT540M_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace nvidia{
+namespace fermi{
+namespace geforce_gt_540m{
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+ db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GT 540M", matrix_product_template::parameters_type(1, 16, 16, 8, 4, 1, 8, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 16, 8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+ db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GT 540M", matrix_product_template::parameters_type(1, 16, 16, 16, 8, 1, 4, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 32, 8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+ db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GT 540M", matrix_product_template::parameters_type(1, 8, 16, 16, 8, 1, 4, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 16, 8));
+}
+
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gtx_470.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gtx_470.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gtx_470.hpp
new file mode 100644
index 0000000..31a329b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gtx_470.hpp
@@ -0,0 +1,83 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_NVIDIA_FERMI_GEFORCE_GTX_470_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_NVIDIA_FERMI_GEFORCE_GTX_470_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace nvidia{
+namespace fermi{
+namespace geforce_gtx_470{
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+ db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 470", matrix_product_template::parameters_type(1,2,32,32,4,1,2,FETCH_FROM_LOCAL,FETCH_FROM_GLOBAL_STRIDED,32,2));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+ db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 470", matrix_product_template::parameters_type(1,8,16,8,2,2,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,4));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+ db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 470", matrix_product_template::parameters_type(1,128,32,1,2,1,8,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+ db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 470", matrix_product_template::parameters_type(1,16,32,4,4,1,4,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+ db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 470", matrix_product_template::parameters_type(1,2,16,64,8,1,2,FETCH_FROM_LOCAL,FETCH_FROM_GLOBAL_CONTIGUOUS,16,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+ db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 470", matrix_product_template::parameters_type(1,32,32,16,2,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,32,16));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+ db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 470", matrix_product_template::parameters_type(1,8,16,32,8,2,2,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,32,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+ db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 470", matrix_product_template::parameters_type(1,16,32,16,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,32,8));
+}
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gtx_580.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gtx_580.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gtx_580.hpp
new file mode 100644
index 0000000..7015ea5
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gtx_580.hpp
@@ -0,0 +1,84 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_NVIDIA_FERMI_GTX580_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_NVIDIA_FERMI_GTX580_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace nvidia{
+namespace fermi{
+namespace geforce_gtx_580{
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+ db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 580", matrix_product_template::parameters_type(1,2,1,128,4,1,4,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+ db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 580", matrix_product_template::parameters_type(1,16,128,32,2,4,2,FETCH_FROM_LOCAL,FETCH_FROM_GLOBAL_STRIDED,32,16));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+ db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 580", matrix_product_template::parameters_type(1,4,64,128,4,1,2,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+ db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 580", matrix_product_template::parameters_type(1,128,32,1,1,1,8,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_LOCAL,32,4));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+ db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 580", matrix_product_template::parameters_type(1,2,32,32,8,1,2,FETCH_FROM_LOCAL,FETCH_FROM_GLOBAL_STRIDED,32,2));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+ db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 580", matrix_product_template::parameters_type(1,16,32,16,4,4,2,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,32,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+ db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 580", matrix_product_template::parameters_type(2,16,16,16,4,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,16));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+ db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "GeForce GTX 580", matrix_product_template::parameters_type(1,128,16,2,4,1,8,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/tesla_c2050.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/tesla_c2050.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/tesla_c2050.hpp
new file mode 100644
index 0000000..f430d6c
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/tesla_c2050.hpp
@@ -0,0 +1,84 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_NVIDIA_FERMI_TESLA_C2050_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_NVIDIA_FERMI_TESLA_C2050_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace nvidia{
+namespace fermi{
+namespace tesla_c2050{
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+ db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "Tesla C2050", matrix_product_template::parameters_type(1,4,32,32,8,1,2,FETCH_FROM_LOCAL,FETCH_FROM_GLOBAL_STRIDED,16,8));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+ db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "Tesla C2050", matrix_product_template::parameters_type(1,16,32,32,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_GLOBAL_STRIDED,32,16));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+ db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "Tesla C2050", matrix_product_template::parameters_type(1,4,128,64,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_GLOBAL_CONTIGUOUS,16,16));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+ db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "Tesla C2050", matrix_product_template::parameters_type(1,16,16,16,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,16));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+ db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "Tesla C2050", matrix_product_template::parameters_type(1,16,32,16,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,32,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+ db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "Tesla C2050", matrix_product_template::parameters_type(1,16,32,16,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,32,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+ db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "Tesla C2050", matrix_product_template::parameters_type(1,16,16,16,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,32,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+ db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::fermi, "Tesla C2050", matrix_product_template::parameters_type(1,16,32,16,4,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,32,8));
+}
+
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/kepler/tesla_k20m.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/kepler/tesla_k20m.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/kepler/tesla_k20m.hpp
new file mode 100644
index 0000000..73a62fc
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/kepler/tesla_k20m.hpp
@@ -0,0 +1,84 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_NVIDIA_KEPLER_K20M_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_NVIDIA_KEPLER_K20M_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace nvidia{
+namespace kepler{
+namespace tesla_k20m{
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+ db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::kepler, "Tesla K20m", matrix_product_template::parameters_type(1,2,8,32,8,2,4,FETCH_FROM_LOCAL,FETCH_FROM_GLOBAL_STRIDED,4,16));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+ db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::kepler, "Tesla K20m", matrix_product_template::parameters_type(1,16,16,32,2,1,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,32));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+ db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::kepler, "Tesla K20m", matrix_product_template::parameters_type(1,2,8,64,16,1,2,FETCH_FROM_LOCAL,FETCH_FROM_GLOBAL_STRIDED,32,4));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+ db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::kepler, "Tesla K20m", matrix_product_template::parameters_type(1,128,32,1,1,1,16,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_LOCAL,16,8));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+ db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::kepler, "Tesla K20m", matrix_product_template::parameters_type(1,8,32,16,4,8,4,FETCH_FROM_LOCAL,FETCH_FROM_GLOBAL_STRIDED,8,16));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+ db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::kepler, "Tesla K20m", matrix_product_template::parameters_type(1,32,16,32,8,2,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,64));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+ db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::kepler, "Tesla K20m", matrix_product_template::parameters_type(4,8,2,4,8,2,8,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+ db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::kepler, "Tesla K20m", matrix_product_template::parameters_type(1,128,64,1,4,2,16,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_LOCAL,16,8));
+}
+
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/maxwell/geforce_gtx_750_ti.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/maxwell/geforce_gtx_750_ti.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/maxwell/geforce_gtx_750_ti.hpp
new file mode 100644
index 0000000..2c3f080
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/maxwell/geforce_gtx_750_ti.hpp
@@ -0,0 +1,85 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_NVIDIA_3_GEFORCE_GTX_750_TI_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_NVIDIA_3_GEFORCE_GTX_750_TI_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace nvidia{
+namespace maxwell{
+namespace geforce_gtx_750_ti{
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+ db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::maxwell, "GeForce GTX 750 Ti", matrix_product_template::parameters_type(1,2,8,128,1,1,2,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+ db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::maxwell, "GeForce GTX 750 Ti", matrix_product_template::parameters_type(1,8,32,32,2,1,2,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+ db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::maxwell, "GeForce GTX 750 Ti", matrix_product_template::parameters_type(1,16,8,32,1,8,2,FETCH_FROM_GLOBAL_CONTIGUOUS,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+ db.add_8B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::maxwell, "GeForce GTX 750 Ti", matrix_product_template::parameters_type(1,16,8,32,1,2,2,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_STRIDED,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+ db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::maxwell, "GeForce GTX 750 Ti", matrix_product_template::parameters_type(1,4,32,16,8,2,4,FETCH_FROM_LOCAL,FETCH_FROM_GLOBAL_STRIDED,16,4));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+ db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::maxwell, "GeForce GTX 750 Ti", matrix_product_template::parameters_type(1,16,16,16,4,2,8,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,16));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+ db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::maxwell, "GeForce GTX 750 Ti", matrix_product_template::parameters_type(1,16,16,32,4,8,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,32,16));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+ db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::maxwell, "GeForce GTX 750 Ti", matrix_product_template::parameters_type(1,16,16,16,8,4,4,FETCH_FROM_LOCAL,FETCH_FROM_LOCAL,16,16));
+}
+
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/tesla/geforce_gtx_260.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/tesla/geforce_gtx_260.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/tesla/geforce_gtx_260.hpp
new file mode 100644
index 0000000..88dd596
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/devices/gpu/nvidia/tesla/geforce_gtx_260.hpp
@@ -0,0 +1,84 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_NVIDIA_TESLA_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_DEVICES_GPU_NVIDIA_TESLA_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/templates/matrix_product_template.hpp"
+
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/builtin_database/common.hpp"
+
+namespace viennacl{
+namespace device_specific{
+namespace builtin_database{
+namespace devices{
+namespace gpu{
+namespace nvidia{
+namespace tesla{
+namespace geforce_gtx_260{
+
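+// Note: the add_8B overloads below register their profiles through db.add_4B,
+// i.e. into the 4-byte (single-precision) table.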
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+ db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::tesla, "GeForce GTX 260", matrix_product_template::parameters_type(1,32,2,16,1,1,4,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+ db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::tesla, "GeForce GTX 260", matrix_product_template::parameters_type(1,32,2,16,1,1,4,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+ db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::tesla, "GeForce GTX 260", matrix_product_template::parameters_type(1,32,2,16,1,1,4,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_8B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+ db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::tesla, "GeForce GTX 260", matrix_product_template::parameters_type(1,32,2,16,1,1,4,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'T'>)
+{
+ db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::tesla, "GeForce GTX 260", matrix_product_template::parameters_type(1,16,2,16,1,1,4,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'T'>, char_to_type<'N'>)
+{
+ db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::tesla, "GeForce GTX 260", matrix_product_template::parameters_type(1,16,2,16,1,1,4,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'T'>)
+{
+ db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::tesla, "GeForce GTX 260", matrix_product_template::parameters_type(1,16,2,16,1,1,4,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+inline void add_4B(database_type<matrix_product_template::parameters_type> & db, char_to_type<'N'>, char_to_type<'N'>)
+{
+ db.add_4B(nvidia_id, CL_DEVICE_TYPE_GPU, ocl::tesla, "GeForce GTX 260", matrix_product_template::parameters_type(1,16,2,16,1,1,4,FETCH_FROM_GLOBAL_STRIDED,FETCH_FROM_GLOBAL_CONTIGUOUS,0,0));
+}
+
+
+}
+}
+}
+}
+}
+}
+}
+}
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/matrix_product.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/matrix_product.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/matrix_product.hpp
new file mode 100644
index 0000000..4437956
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/builtin_database/matrix_product.hpp
@@ -0,0 +1,244 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_MATRIX_PRODUCT_HPP_
+#define VIENNACL_DEVICE_SPECIFIC_BUILTIN_DATABASE_MATRIX_PRODUCT_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/barts.hpp"
+
+#include "viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/tesla_c2050.hpp"
+
+#include "viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gtx_470.hpp"
+
+#include "viennacl/device_specific/builtin_database/devices/gpu/nvidia/maxwell/geforce_gtx_750_ti.hpp"
+
+#include "viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/scrapper.hpp"
+
+#include "viennacl/device_specific/builtin_database/devices/gpu/nvidia/tesla/geforce_gtx_260.hpp"
+
+#include "viennacl/device_specific/builtin_database/devices/gpu/amd/southern_islands/tahiti.hpp"
+#include "viennacl/device_specific/builtin_database/devices/gpu/amd/northern_islands/devastator.hpp"
+
+#include "viennacl/device_specific/builtin_database/devices/gpu/nvidia/kepler/tesla_k20m.hpp"
+#include "viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gtx_580.hpp"
+
+#include "viennacl/device_specific/builtin_database/devices/gpu/amd/volcanic_islands/hawaii.hpp"
+
+#include "viennacl/device_specific/builtin_database/devices/gpu/amd/evergreen/cypress.hpp"
+#include "viennacl/device_specific/builtin_database/devices/gpu/amd/evergreen/cedar.hpp"
+
+
+#include "viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gt_540m.hpp"
+
+
+#include "viennacl/ocl/device_utils.hpp"
+#include "viennacl/scheduler/forwards.h"
+
+#include "viennacl/device_specific/builtin_database/devices/accelerator/fallback.hpp"
+#include "viennacl/device_specific/builtin_database/devices/cpu/fallback.hpp"
+#include "viennacl/device_specific/builtin_database/devices/gpu/fallback.hpp"
+#include "viennacl/device_specific/builtin_database/devices/gpu/nvidia/fermi/geforce_gt_540m.hpp"
+
+/** @file viennacl/device_specific/builtin_database/matrix_product.hpp
+*
+* Initializes the device database with the provided profiles. Updated semi-automatically.
+*/
+
+namespace viennacl
+{
+namespace device_specific
+{
+namespace builtin_database
+{
+
+inline database_type<matrix_product_template::parameters_type> init_matrix_product_N_N()
+{
+ database_type<matrix_product_template::parameters_type> result;
+
+ devices::accelerator::fallback::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+ devices::accelerator::fallback::add_8B(result, char_to_type<'N'>(), char_to_type<'N'>());
+
+ devices::cpu::fallback::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+ devices::cpu::fallback::add_8B(result, char_to_type<'N'>(), char_to_type<'N'>());
+
+ devices::gpu::fallback::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+ devices::gpu::fallback::add_8B(result, char_to_type<'N'>(), char_to_type<'N'>());
+
+ devices::gpu::amd::evergreen::cedar::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+ devices::gpu::amd::evergreen::cypress::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+ devices::gpu::amd::volcanic_islands::hawaii::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+ devices::gpu::amd::volcanic_islands::hawaii::add_8B(result, char_to_type<'N'>(), char_to_type<'N'>());
+ devices::gpu::nvidia::fermi::geforce_gtx_580::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+ devices::gpu::nvidia::fermi::geforce_gtx_580::add_8B(result, char_to_type<'N'>(), char_to_type<'N'>());
+ devices::gpu::nvidia::kepler::tesla_k20m::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+ devices::gpu::nvidia::kepler::tesla_k20m::add_8B(result, char_to_type<'N'>(), char_to_type<'N'>());
+ devices::gpu::amd::southern_islands::tahiti::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+ devices::gpu::amd::southern_islands::tahiti::add_8B(result, char_to_type<'N'>(), char_to_type<'N'>());
+ devices::gpu::amd::northern_islands::devastator::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+ devices::gpu::nvidia::tesla::geforce_gtx_260::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+ devices::gpu::amd::northern_islands::scrapper::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+ devices::gpu::nvidia::maxwell::geforce_gtx_750_ti::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+ devices::gpu::nvidia::maxwell::geforce_gtx_750_ti::add_8B(result, char_to_type<'N'>(), char_to_type<'N'>());
+ devices::gpu::nvidia::fermi::geforce_gtx_470::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+ devices::gpu::nvidia::fermi::geforce_gtx_470::add_8B(result, char_to_type<'N'>(), char_to_type<'N'>());
+ devices::gpu::nvidia::fermi::tesla_c2050::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+ devices::gpu::nvidia::fermi::tesla_c2050::add_8B(result, char_to_type<'N'>(), char_to_type<'N'>());
+ devices::gpu::amd::northern_islands::barts::add_4B(result, char_to_type<'N'>(), char_to_type<'N'>());
+
+ return result;
+}
+
+inline database_type<matrix_product_template::parameters_type> init_matrix_product_T_N()
+{
+ database_type<matrix_product_template::parameters_type> result;
+
+ devices::accelerator::fallback::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+ devices::accelerator::fallback::add_8B(result, char_to_type<'T'>(), char_to_type<'N'>());
+
+ devices::cpu::fallback::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+ devices::cpu::fallback::add_8B(result, char_to_type<'T'>(), char_to_type<'N'>());
+
+ devices::gpu::fallback::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+ devices::gpu::fallback::add_8B(result, char_to_type<'T'>(), char_to_type<'N'>());
+ devices::gpu::nvidia::fermi::geforce_gt_540m::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+ devices::gpu::amd::evergreen::cedar::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+ devices::gpu::amd::evergreen::cypress::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+ devices::gpu::amd::volcanic_islands::hawaii::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+ devices::gpu::amd::volcanic_islands::hawaii::add_8B(result, char_to_type<'T'>(), char_to_type<'N'>());
+ devices::gpu::nvidia::fermi::geforce_gtx_580::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+ devices::gpu::nvidia::fermi::geforce_gtx_580::add_8B(result, char_to_type<'T'>(), char_to_type<'N'>());
+ devices::gpu::nvidia::kepler::tesla_k20m::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+ devices::gpu::nvidia::kepler::tesla_k20m::add_8B(result, char_to_type<'T'>(), char_to_type<'N'>());
+ devices::gpu::amd::southern_islands::tahiti::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+ devices::gpu::amd::southern_islands::tahiti::add_8B(result, char_to_type<'T'>(), char_to_type<'N'>());
+ devices::gpu::amd::northern_islands::devastator::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+ devices::gpu::nvidia::tesla::geforce_gtx_260::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+ devices::gpu::amd::northern_islands::scrapper::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+ devices::gpu::nvidia::maxwell::geforce_gtx_750_ti::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+ devices::gpu::nvidia::maxwell::geforce_gtx_750_ti::add_8B(result, char_to_type<'T'>(), char_to_type<'N'>());
+ devices::gpu::nvidia::fermi::geforce_gtx_470::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+ devices::gpu::nvidia::fermi::geforce_gtx_470::add_8B(result, char_to_type<'T'>(), char_to_type<'N'>());
+ devices::gpu::nvidia::fermi::tesla_c2050::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+ devices::gpu::nvidia::fermi::tesla_c2050::add_8B(result, char_to_type<'T'>(), char_to_type<'N'>());
+ devices::gpu::amd::northern_islands::barts::add_4B(result, char_to_type<'T'>(), char_to_type<'N'>());
+
+ return result;
+}
+
+inline database_type<matrix_product_template::parameters_type> init_matrix_product_N_T()
+{
+ database_type<matrix_product_template::parameters_type> result;
+
+ devices::accelerator::fallback::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+ devices::accelerator::fallback::add_8B(result, char_to_type<'N'>(), char_to_type<'T'>());
+
+ devices::cpu::fallback::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+ devices::cpu::fallback::add_8B(result, char_to_type<'N'>(), char_to_type<'T'>());
+
+ devices::gpu::fallback::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+ devices::gpu::fallback::add_8B(result, char_to_type<'N'>(), char_to_type<'T'>());
+
+ devices::gpu::nvidia::fermi::geforce_gt_540m::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+ devices::gpu::amd::evergreen::cedar::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+ devices::gpu::amd::evergreen::cypress::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+ devices::gpu::amd::volcanic_islands::hawaii::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+ devices::gpu::amd::volcanic_islands::hawaii::add_8B(result, char_to_type<'N'>(), char_to_type<'T'>());
+ devices::gpu::nvidia::fermi::geforce_gtx_580::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+ devices::gpu::nvidia::fermi::geforce_gtx_580::add_8B(result, char_to_type<'N'>(), char_to_type<'T'>());
+ devices::gpu::nvidia::kepler::tesla_k20m::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+ devices::gpu::nvidia::kepler::tesla_k20m::add_8B(result, char_to_type<'N'>(), char_to_type<'T'>());
+ devices::gpu::amd::southern_islands::tahiti::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+ devices::gpu::amd::southern_islands::tahiti::add_8B(result, char_to_type<'N'>(), char_to_type<'T'>());
+ devices::gpu::amd::northern_islands::devastator::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+ devices::gpu::nvidia::tesla::geforce_gtx_260::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+
+
+ devices::gpu::amd::northern_islands::scrapper::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+ devices::gpu::nvidia::maxwell::geforce_gtx_750_ti::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+ devices::gpu::nvidia::maxwell::geforce_gtx_750_ti::add_8B(result, char_to_type<'N'>(), char_to_type<'T'>());
+ devices::gpu::nvidia::fermi::geforce_gtx_470::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+ devices::gpu::nvidia::fermi::geforce_gtx_470::add_8B(result, char_to_type<'N'>(), char_to_type<'T'>());
+ devices::gpu::nvidia::fermi::tesla_c2050::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+ devices::gpu::nvidia::fermi::tesla_c2050::add_8B(result, char_to_type<'N'>(), char_to_type<'T'>());
+ devices::gpu::amd::northern_islands::barts::add_4B(result, char_to_type<'N'>(), char_to_type<'T'>());
+
+ return result;
+}
+
+inline database_type<matrix_product_template::parameters_type> init_matrix_product_T_T()
+{
+ database_type<matrix_product_template::parameters_type> result;
+
+ devices::accelerator::fallback::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+ devices::accelerator::fallback::add_8B(result, char_to_type<'T'>(), char_to_type<'T'>());
+
+ devices::cpu::fallback::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+ devices::cpu::fallback::add_8B(result, char_to_type<'T'>(), char_to_type<'T'>());
+
+ devices::gpu::fallback::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+ devices::gpu::fallback::add_8B(result, char_to_type<'T'>(), char_to_type<'T'>());
+ devices::gpu::amd::evergreen::cedar::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+ devices::gpu::amd::evergreen::cypress::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+ devices::gpu::amd::volcanic_islands::hawaii::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+ devices::gpu::amd::volcanic_islands::hawaii::add_8B(result, char_to_type<'T'>(), char_to_type<'T'>());
+ devices::gpu::nvidia::fermi::geforce_gtx_580::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+ devices::gpu::nvidia::fermi::geforce_gtx_580::add_8B(result, char_to_type<'T'>(), char_to_type<'T'>());
+ devices::gpu::nvidia::kepler::tesla_k20m::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+ devices::gpu::nvidia::kepler::tesla_k20m::add_8B(result, char_to_type<'T'>(), char_to_type<'T'>());
+ devices::gpu::amd::southern_islands::tahiti::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+ devices::gpu::amd::southern_islands::tahiti::add_8B(result, char_to_type<'T'>(), char_to_type<'T'>());
+ devices::gpu::amd::northern_islands::devastator::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+ devices::gpu::nvidia::tesla::geforce_gtx_260::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+ devices::gpu::amd::northern_islands::scrapper::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+ devices::gpu::nvidia::maxwell::geforce_gtx_750_ti::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+ devices::gpu::nvidia::maxwell::geforce_gtx_750_ti::add_8B(result, char_to_type<'T'>(), char_to_type<'T'>());
+ devices::gpu::nvidia::fermi::geforce_gtx_470::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+ devices::gpu::nvidia::fermi::geforce_gtx_470::add_8B(result, char_to_type<'T'>(), char_to_type<'T'>());
+ devices::gpu::nvidia::fermi::tesla_c2050::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+ devices::gpu::nvidia::fermi::tesla_c2050::add_8B(result, char_to_type<'T'>(), char_to_type<'T'>());
+ devices::gpu::amd::northern_islands::barts::add_4B(result, char_to_type<'T'>(), char_to_type<'T'>());
+
+ return result;
+}
+
+static database_type<matrix_product_template::parameters_type> matrix_product_N_N = init_matrix_product_N_N();
+static database_type<matrix_product_template::parameters_type> matrix_product_T_N = init_matrix_product_T_N();
+static database_type<matrix_product_template::parameters_type> matrix_product_N_T = init_matrix_product_N_T();
+static database_type<matrix_product_template::parameters_type> matrix_product_T_T = init_matrix_product_T_T();
+
+template<class NumericT>
+matrix_product_template::parameters_type const & matrix_product_params(ocl::device const & device, char A_trans, char B_trans)
+{
+ assert(A_trans=='N' || A_trans=='T');
+ assert(B_trans=='N' || B_trans=='T');
+ database_type<matrix_product_template::parameters_type> * db;
+ if (A_trans=='N' && B_trans=='N')
+ db = &matrix_product_N_N;
+ else if (A_trans=='T' && B_trans=='N')
+ db = &matrix_product_T_N;
+ else if (A_trans=='N' && B_trans=='T')
+ db = &matrix_product_N_T;
+ else
+ db = &matrix_product_T_T;
+ return get_parameters<NumericT>(*db, device);
+}
+
+
+}
+}
+}
+#endif
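
As a usage sketch (not part of the commit): the matrix_product_params entry point above picks one of the four static databases from the transposition flags and defers to get_parameters<NumericT>. Assuming ViennaCL is built with VIENNACL_WITH_OPENCL and that viennacl::ocl::current_device() is available as usual, a caller could query a profile roughly as follows; example() and prm are illustrative names only.

#include "viennacl/ocl/backend.hpp"
#include "viennacl/device_specific/builtin_database/matrix_product.hpp"

// Query the GEMM profile for single precision, A non-transposed, B transposed.
void example()
{
  viennacl::ocl::device const & dev = viennacl::ocl::current_device();
  viennacl::device_specific::matrix_product_template::parameters_type const & prm =
      viennacl::device_specific::builtin_database::matrix_product_params<float>(dev, 'N', 'T');
  (void)prm;  // would normally be handed to a matrix_product_template for kernel generation
}
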
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/execute.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/execute.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/execute.hpp
new file mode 100644
index 0000000..2f4960a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/execute.hpp
@@ -0,0 +1,55 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_EXECUTE_HPP
+#define VIENNACL_DEVICE_SPECIFIC_EXECUTE_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/device_specific/execute.hpp
+ @brief The user interface for the code generator

+*/
+
+#include <cstring>
+#include <vector>
+#include <typeinfo>
+
+#include "viennacl/scheduler/forwards.h"
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/templates/template_base.hpp"
+#include "viennacl/device_specific/tree_parsing.hpp"
+#include "viennacl/device_specific/execution_handler.hpp"
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/tools/timer.hpp"
+
+namespace viennacl
+{
+namespace device_specific
+{
+
+inline void execute(template_base const & T, statements_container const & statements, viennacl::ocl::context & ctx = viennacl::ocl::current_context(), bool force_compilation = false)
+{
+ //Generate program name
+ std::string program_name = tree_parsing::statements_representation(statements, BIND_TO_HANDLE);
+ execution_handler handler(program_name, ctx, ctx.current_device(), force_compilation);
+ handler.add(program_name, T, statements);
+ handler.execute(program_name, statements);
+}
+
+}
+}
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/execution_handler.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/execution_handler.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/execution_handler.hpp
new file mode 100644
index 0000000..8f725fd
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/execution_handler.hpp
@@ -0,0 +1,102 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_EXECUTION_HANDLER_HPP
+#define VIENNACL_DEVICE_SPECIFIC_EXECUTION_HANDLER_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/device_specific/execution_handler.hpp
+ @brief Helper for handling fallbacks, lazy compilation, input-dependent kernels, etc.
+*/
+
+#include <map>
+
+#include "viennacl/tools/shared_ptr.hpp"
+
+#include "viennacl/device_specific/lazy_program_compiler.hpp"
+#include "viennacl/device_specific/templates/template_base.hpp"
+#include "viennacl/device_specific/utils.hpp"
+
+namespace viennacl
+{
+namespace device_specific
+{
+
+class execution_handler
+{
+public:
+ typedef std::map< std::string, tools::shared_ptr<template_base> > container_type;
+
+private:
+ std::string append_prefix(std::string const & str)
+ {
+ return "_" + str;
+ }
+
+ std::string define_extension(std::string const & ext)
+ {
+ // Note: On devices without double precision support, 'ext' is an empty string.
+ return (ext.length() > 1) ? std::string("#pragma OPENCL EXTENSION " + ext + " : enable\n") : std::string("\n");
+ }
+
+ void init_program_compiler(std::string const & name, bool force_recompilation)
+ {
+ lazy_programs_.push_back(lazy_program_compiler(&ctx_, name, force_recompilation));
+ lazy_programs_.back().add(define_extension(device_.double_support_extension()));
+ }
+
+public:
+ execution_handler(std::string const & program_name_base, viennacl::ocl::context & ctx, viennacl::ocl::device const & device, bool force_recompilation = false) : ctx_(ctx), device_(device), program_names_(2)
+ {
+ lazy_programs_.reserve(2);
+ init_program_compiler(program_name_base + "_0", force_recompilation);
+ init_program_compiler(program_name_base + "_1", force_recompilation);
+ }
+
+ void add(std::string const & key, template_base const & T, statements_container const & statements)
+ {
+ if (kernels_.insert(container_type::value_type(key, T.clone())).second)
+ {
+ std::vector<std::string> sources = at(kernels_, key)->generate(append_prefix(key), statements, device_);
+ assert(sources.size()<=2);
+ for (unsigned int i = 0; i < sources.size(); ++i)
+ lazy_programs_[i].add(sources[i]);
+ }
+ }
+
+ template_base * template_of(std::string const & key)
+ {
+ return at(kernels_, key).get();
+ }
+
+ void execute(container_type::key_type const & key, statements_container const & statements)
+ {
+ tools::shared_ptr<template_base> & template_pointer = at(kernels_, key);
+ template_pointer->enqueue(append_prefix(key), lazy_programs_, statements);
+ }
+
+private:
+ viennacl::ocl::context & ctx_;
+ viennacl::ocl::device const & device_;
+ container_type kernels_;
+ std::vector<std::string> program_names_;
+ std::vector<lazy_program_compiler> lazy_programs_;
+};
+
+}
+}
+#endif
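
For orientation, a minimal sketch of how execution_handler is meant to be driven (not part of the committed file): a handler is created once per program/context/device, kernels are registered under a string key, and later calls reuse the lazily compiled programs. The template object `tpl` (a template_base) and the statement bundle `stmts` (a statements_container) are assumed to have been built elsewhere.

    // hedged usage sketch -- 'tpl' and 'stmts' are placeholders
    viennacl::ocl::context & ctx = viennacl::ocl::current_context();
    viennacl::device_specific::execution_handler handler("my_program", ctx, ctx.current_device());

    handler.add("axpy_key", tpl, stmts);     // generates the sources and queues them for lazy compilation
    handler.execute("axpy_key", stmts);      // compiles on first use, then enqueues the kernel
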
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/forwards.h
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/forwards.h b/native-viennaCL/src/main/cpp/viennacl/device_specific/forwards.h
new file mode 100644
index 0000000..590ed1f
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/forwards.h
@@ -0,0 +1,294 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_FORWARDS_H
+#define VIENNACL_DEVICE_SPECIFIC_FORWARDS_H
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/device_specific/forwards.h
+ @brief Forward declarations
+*/
+
+#include <list>
+#include <map>
+#include <set>
+#include <stdexcept>
+
+#include "viennacl/scheduler/io.hpp"
+
+#include "viennacl/ocl/forwards.h"
+#include "viennacl/tools/shared_ptr.hpp"
+#include "viennacl/scheduler/forwards.h"
+
+#include "viennacl/backend/mem_handle.hpp"
+
+namespace viennacl
+{
+namespace device_specific
+{
+
+//Error codes
+static const int TEMPLATE_VALID = 0;
+static const int TEMPLATE_LOCAL_MEMORY_OVERFLOW = -1;
+static const int TEMPLATE_WORK_GROUP_SIZE_OVERFLOW = -2;
+static const int TEMPLATE_LOCAL_SIZE_0_OVERFLOW = -3;
+static const int TEMPLATE_LOCAL_SIZE_1_OVERFLOW = -4;
+static const int TEMPLATE_LOCAL_SIZE_2_OVERFLOW = -5;
+static const int TEMPLATE_LOCAL_SIZE_NOT_WARP_MULTIPLE = -6;
+static const int TEMPLATE_INVALID_SIMD_WIDTH = -7;
+static const int TEMPLATE_INVALID_FETCHING_POLICY_TYPE = -9;
+
+static const int TEMPLATE_GLOBAL_MEMORY_REQUIRES_ZERO_LOCAL_FETCH = -10;
+static const int TEMPLATE_MS_NS_MUST_BE_SIMD_WIDTH_MULTIPLE = -11;
+static const int TEMPLATE_KS_MUST_BE_SMALLER_THAN_KL = -12;
+static const int TEMPLATE_SIMD_WIDTH_MUST_BE_ONE = -13;
+static const int TEMPLATE_LOCAL_FETCH_PRODUCT_MUST_MATCH_LOCAL_SIZE_PRODUCT = -14;
+static const int TEMPLATE_LOCAL_FETCH_0_MUST_BE_KL_MULTIPLE = -15;
+static const int TEMPLATE_LOCAL_FETCH_0_MUST_BE_NL_MULTIPLE = -16;
+static const int TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE = -17;
+static const int TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE = -18;
+
+struct index_tuple
+{
+ index_tuple(std::string const & _i, std::string const & _bound0) : i(_i), bound0(_bound0), j(""), bound1(""){ }
+ index_tuple(std::string const & _i, std::string const & _bound0, std::string const & _j, std::string const & _bound1) : i(_i), bound0(_bound0), j(_j), bound1(_bound1){ }
+ std::string i;
+ std::string bound0;
+ std::string j;
+ std::string bound1;
+};
+
+inline bool is_scalar_reduction(scheduler::statement_node const & node)
+{
+ return node.op.type==scheduler::OPERATION_BINARY_INNER_PROD_TYPE || node.op.type_family==scheduler::OPERATION_VECTOR_REDUCTION_TYPE_FAMILY;
+}
+
+inline bool is_vector_reduction(scheduler::statement_node const & node)
+{
+ return node.op.type==scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE
+ || node.op.type_family==scheduler::OPERATION_ROWS_REDUCTION_TYPE_FAMILY
+ || node.op.type_family==scheduler::OPERATION_COLUMNS_REDUCTION_TYPE_FAMILY;
+}
+
+inline scheduler::statement_node const & lhs_most(scheduler::statement::container_type const & array, vcl_size_t root)
+{
+ scheduler::statement_node const * current = &array[root];
+ while (current->lhs.type_family==scheduler::COMPOSITE_OPERATION_FAMILY)
+ current = &array[current->lhs.node_index];
+ return *current;
+}
+
+enum expression_type
+{
+ SCALAR_AXPY_TYPE,
+ VECTOR_AXPY_TYPE,
+ MATRIX_AXPY_TYPE,
+ REDUCTION_TYPE,
+ ROW_WISE_REDUCTION_Nx_TYPE,
+ ROW_WISE_REDUCTION_Tx_TYPE,
+ MATRIX_PRODUCT_NN_TYPE,
+ MATRIX_PRODUCT_TN_TYPE,
+ MATRIX_PRODUCT_NT_TYPE,
+ MATRIX_PRODUCT_TT_TYPE,
+ INVALID_EXPRESSION_TYPE
+};
+
+inline const char * expression_type_to_string(expression_type type)
+{
+ switch (type)
+ {
+ case SCALAR_AXPY_TYPE : return "Scalar AXPY";
+ case VECTOR_AXPY_TYPE : return "Vector AXPY";
+ case MATRIX_AXPY_TYPE : return "Matrix AXPY";
+ case REDUCTION_TYPE : return "Reduction";
+ case ROW_WISE_REDUCTION_Nx_TYPE : return "Row-wise reduction: Ax";
+ case ROW_WISE_REDUCTION_Tx_TYPE : return "Row-wise reduction: Tx";
+ case MATRIX_PRODUCT_NN_TYPE : return "Matrix-Matrix Product: AA";
+ case MATRIX_PRODUCT_TN_TYPE : return "Matrix-Matrix Product: TA";
+ case MATRIX_PRODUCT_NT_TYPE : return "Matrix-Matrix Product: AT";
+ case MATRIX_PRODUCT_TT_TYPE : return "Matrix-Matrix Product: TT";
+ default : return "INVALID EXPRESSION";
+ }
+}
+
+/** @brief generate the string for a value (pass-by-value) kernel argument */
+static std::string generate_value_kernel_argument(std::string const & scalartype, std::string const & name)
+{
+ return scalartype + ' ' + name + ",";
+}
+
+/** @brief generate the string for a pointer kernel argument */
+static std::string generate_pointer_kernel_argument(std::string const & address_space, std::string const & scalartype, std::string const & name)
+{
+ return address_space + " " + scalartype + "* " + name + ",";
+}
+
+/** @brief Emulation of C++11's .at() member for std::map<>, const-version */
+template<typename KeyT, typename ValueT>
+ValueT const & at(std::map<KeyT, ValueT> const & map, KeyT const & key)
+{
+ typename std::map<KeyT, ValueT>::const_iterator it = map.find(key);
+ if (it != map.end())
+ return it->second;
+
+ throw std::out_of_range("Generator: Key not found in map");
+}
+
+/** @brief Emulation of C++11's .at() member for std::map<>, non-const version */
+template<typename KeyT, typename ValueT>
+ValueT & at(std::map<KeyT, ValueT> & map, KeyT const & key)
+{
+ typename std::map<KeyT, ValueT>::iterator it = map.find(key);
+ if (it != map.end())
+ return it->second;
+
+ throw std::out_of_range("Generator: Key not found in map");
+}
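
These two at() overloads simply emulate std::map::at() for pre-C++11 compilers: a missing key throws std::out_of_range instead of silently inserting a default value the way operator[] would. A small self-contained illustration (names are made up for this sketch):

    std::map<std::string, int> widths;
    widths["float"] = 4;

    int w = viennacl::device_specific::at(widths, std::string("float"));   // returns 4
    // viennacl::device_specific::at(widths, std::string("half"));         // would throw std::out_of_range
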
+
+/** @brief Exception for the case the generator is unable to deal with the operation */
+class generator_not_supported_exception : public std::exception
+{
+public:
+ generator_not_supported_exception() : message_() {}
+ generator_not_supported_exception(std::string message) : message_("ViennaCL: Internal error: The generator cannot handle the statement provided: " + message) {}
+ virtual const char* what() const throw() { return message_.c_str(); }
+ virtual ~generator_not_supported_exception() throw() {}
+private:
+ std::string message_;
+};
+
+namespace utils
+{
+ class kernel_generation_stream;
+}
+
+
+enum leaf_t
+{
+ LHS_NODE_TYPE,
+ PARENT_NODE_TYPE,
+ RHS_NODE_TYPE
+};
+
+class mapped_object;
+class template_base;
+
+typedef std::pair<vcl_size_t, leaf_t> mapping_key;
+typedef std::map<mapping_key, tools::shared_ptr<mapped_object> > mapping_type;
+
+
+namespace tree_parsing
+{
+
+ template<class Fun>
+ inline void traverse(scheduler::statement const & statement, vcl_size_t root_idx, Fun const & fun, bool inspect);
+
+ inline void process(utils::kernel_generation_stream & stream, leaf_t leaf, std::string const & type_key, std::string const & to_process,
+ scheduler::statement const & statement, vcl_size_t root_idx, mapping_type const & mapping, std::set<std::string> & already_processed);
+ inline std::string evaluate(leaf_t leaf, std::map<std::string, std::string> const & accessors, scheduler::statement const & statement, vcl_size_t root_idx, mapping_type const & mapping);
+}
+
+using scheduler::INT_TYPE;
+using scheduler::UINT_TYPE;
+using scheduler::ULONG_TYPE;
+using scheduler::LONG_TYPE;
+using scheduler::FLOAT_TYPE;
+using scheduler::DOUBLE_TYPE;
+
+typedef cl_uint vendor_id_type;
+typedef cl_device_type device_type;
+typedef std::string device_name_type;
+
+class symbolic_binder
+{
+public:
+ virtual ~symbolic_binder(){ }
+ virtual bool bind(viennacl::backend::mem_handle const * ph) = 0;
+ virtual unsigned int get(viennacl::backend::mem_handle const * ph) = 0;
+};
+
+class bind_to_handle : public symbolic_binder
+{
+public:
+ bind_to_handle() : current_arg_(0){ }
+ bool bind(viennacl::backend::mem_handle const * ph) {return (ph==NULL)?true:memory.insert(std::make_pair((void*)ph, current_arg_)).second; }
+ unsigned int get(viennacl::backend::mem_handle const * ph){ return bind(ph) ? current_arg_++ : at(memory, (void*)ph); }
+private:
+ unsigned int current_arg_;
+ std::map<void*,unsigned int> memory;
+};
+
+class bind_all_unique : public symbolic_binder
+{
+public:
+ bind_all_unique() : current_arg_(0){ }
+ bool bind(viennacl::backend::mem_handle const *) {return true; }
+ unsigned int get(viennacl::backend::mem_handle const *){ return current_arg_++; }
+private:
+ unsigned int current_arg_;
+ std::map<void*,unsigned int> memory;
+};
+
+enum binding_policy_t{
+ BIND_ALL_UNIQUE,
+ BIND_TO_HANDLE
+};
+
+inline tools::shared_ptr<symbolic_binder> make_binder(binding_policy_t policy)
+{
+ if (policy==BIND_TO_HANDLE)
+ return tools::shared_ptr<symbolic_binder>(new bind_to_handle());
+ else
+ return tools::shared_ptr<symbolic_binder>(new bind_all_unique());
+}
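
The binding policy decides how device buffers map to kernel argument slots: bind_to_handle hands out one slot per distinct memory handle, whereas bind_all_unique always advances to a fresh slot. A hedged illustration, assuming two viennacl::backend::mem_handle objects h1 and h2 exist:

    namespace ds = viennacl::device_specific;

    viennacl::tools::shared_ptr<ds::symbolic_binder> binder = ds::make_binder(ds::BIND_TO_HANDLE);
    unsigned int a0 = binder->get(&h1);   // 0 -- first time this handle is seen
    unsigned int a1 = binder->get(&h2);   // 1 -- a different handle gets the next slot
    unsigned int a2 = binder->get(&h1);   // 0 again -- same handle, same argument slot
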
+
+template<char C>
+struct char_to_type{ };
+
+class statements_container
+{
+public:
+ typedef std::list<scheduler::statement> data_type;
+ enum order_type { SEQUENTIAL, INDEPENDENT };
+
+ statements_container(data_type const & data, order_type order) : data_(data), order_(order)
+ { }
+
+ statements_container(scheduler::statement const & s0) : order_(INDEPENDENT)
+ {
+ data_.push_back(s0);
+ }
+
+ statements_container(scheduler::statement const & s0, scheduler::statement const & s1, order_type order) : order_(order)
+ {
+ data_.push_back(s0);
+ data_.push_back(s1);
+ }
+
+ std::list<scheduler::statement> const & data() const { return data_; }
+
+ order_type order() const { return order_; }
+
+private:
+ std::list<scheduler::statement> data_;
+ order_type order_;
+};
+
+}
+
+}
+#endif
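
A statements_container is the unit of work handed to the generator: one or more scheduler statements plus a flag stating whether they are independent or must run sequentially. A hedged sketch of how such a container is typically filled, assuming vectors x, y, z of type viennacl::vector<float> already exist and that the usual scheduler front end (viennacl::scheduler::statement with viennacl::op_assign()) is available:

    viennacl::scheduler::statement s(x, viennacl::op_assign(), y + z);   // encodes x = y + z

    // a single statement ...
    viennacl::device_specific::statements_container single(s);

    // ... or several independent statements fused into one launch
    viennacl::device_specific::statements_container::data_type list;
    list.push_back(s);
    viennacl::device_specific::statements_container batch(list, viennacl::device_specific::statements_container::INDEPENDENT);
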
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/lazy_program_compiler.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/lazy_program_compiler.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/lazy_program_compiler.hpp
new file mode 100644
index 0000000..3e75b9b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/lazy_program_compiler.hpp
@@ -0,0 +1,74 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_LAZY_PROGRAM_COMPILER_HPP
+#define VIENNACL_DEVICE_SPECIFIC_LAZY_PROGRAM_COMPILER_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/device_specific/lazy_program_compiler.hpp
+ @brief Helper for compiling a program lazily
+*/
+
+#include <map>
+
+#include "viennacl/ocl/context.hpp"
+
+namespace viennacl
+{
+
+namespace device_specific
+{
+
+ class lazy_program_compiler
+ {
+ public:
+
+ lazy_program_compiler(viennacl::ocl::context * ctx, std::string const & name, std::string const & src, bool force_recompilation) : ctx_(ctx), name_(name), src_(src), force_recompilation_(force_recompilation){ }
+ lazy_program_compiler(viennacl::ocl::context * ctx, std::string const & name, bool force_recompilation) : ctx_(ctx), name_(name), force_recompilation_(force_recompilation){ }
+
+ void add(std::string const & src) { src_+=src; }
+
+ std::string const & src() const { return src_; }
+
+ viennacl::ocl::program & program()
+ {
+ if (force_recompilation_ && ctx_->has_program(name_))
+ ctx_->delete_program(name_);
+ if (!ctx_->has_program(name_))
+ {
+#ifdef VIENNACL_BUILD_INFO
+ std::cerr << "Creating program " << program_name << std::endl;
+#endif
+ ctx_->add_program(src_, name_);
+#ifdef VIENNACL_BUILD_INFO
+ std::cerr << "Done creating program " << program_name << std::endl;
+#endif
+ }
+ return ctx_->get_program(name_);
+ }
+
+ private:
+ viennacl::ocl::context * ctx_;
+ std::string name_;
+ std::string src_;
+ bool force_recompilation_;
+ };
+
+}
+
+}
+#endif
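
The compiler object only accumulates source strings until program() is first requested; at that point the concatenated source is built for the associated context (and rebuilt on every call if force_recompilation is set). A minimal hedged sketch, assuming an active OpenCL context and a string kernel_src holding valid OpenCL C source:

    viennacl::ocl::context & ctx = viennacl::ocl::current_context();

    viennacl::device_specific::lazy_program_compiler lazy(&ctx, "my_program", false);
    lazy.add(kernel_src);                              // only appends to the stored source
    viennacl::ocl::program & prog = lazy.program();    // compilation happens here, on first access
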
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/matrix_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/matrix_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/matrix_operations.hpp
new file mode 100644
index 0000000..bd5116d
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/matrix_operations.hpp
@@ -0,0 +1,1019 @@
+#ifndef VIENNACL_LINALG_OPENCL_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/opencl/matrix_operations.hpp
+ @brief Implementations of dense matrix related operations, including matrix-vector products, using OpenCL.
+*/
+
+#include "viennacl/forwards.h"
+
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/result_of.hpp"
+
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+
+#include "viennacl/linalg/opencl/common.hpp"
+#include "viennacl/linalg/opencl/kernels/svd.hpp"
+#include "viennacl/linalg/opencl/kernels/vector.hpp"
+#include "viennacl/linalg/opencl/kernels/matrix.hpp"
+#include "viennacl/linalg/opencl/kernels/matrix_element.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+
+namespace detail
+{
+
+ template<typename NumericT>
+ viennacl::ocl::kernel & kernel_for_matrix(matrix_base<NumericT> const & M, std::string const & kernel_name)
+ {
+ viennacl::ocl::context & ctx = traits::opencl_context(M);
+ viennacl::ocl::program * program;
+ if (M.row_major())
+ {
+ typedef viennacl::linalg::opencl::kernels::matrix<NumericT, row_major> KernelClass;
+ KernelClass::init(ctx);
+ program = &ctx.get_program(KernelClass::program_name());
+ }
+ else
+ {
+ typedef viennacl::linalg::opencl::kernels::matrix<NumericT, column_major> KernelClass;
+ KernelClass::init(ctx);
+ program = &ctx.get_program(KernelClass::program_name());
+ }
+ return program->get_kernel(kernel_name);
+ }
+
+ template<typename NumericT>
+ viennacl::ocl::kernel & element_kernel_for_matrix(matrix_base<NumericT> const & M, std::string const & kernel_name)
+ {
+ viennacl::ocl::context & ctx = traits::opencl_context(M);
+ viennacl::ocl::program * program;
+ if (M.row_major())
+ {
+ typedef viennacl::linalg::opencl::kernels::matrix_element<NumericT, row_major> KernelClass;
+ KernelClass::init(ctx);
+ program = &ctx.get_program(KernelClass::program_name());
+ }
+ else
+ {
+ typedef viennacl::linalg::opencl::kernels::matrix_element<NumericT, column_major> KernelClass;
+ KernelClass::init(ctx);
+ program = &ctx.get_program(KernelClass::program_name());
+ }
+ return program->get_kernel(kernel_name);
+ }
+
+ template<typename NumericT>
+ viennacl::ocl::kernel & legacy_kernel_for_matrix(matrix_base<NumericT> const & M, std::string const & kernel_name)
+ {
+ viennacl::ocl::context & ctx = traits::opencl_context(M);
+ viennacl::ocl::program * program;
+ if (M.row_major())
+ {
+ typedef viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, row_major> KernelClass;
+ KernelClass::init(ctx);
+ program = &ctx.get_program(KernelClass::program_name());
+ }
+ else
+ {
+ typedef viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, column_major> KernelClass;
+ KernelClass::init(ctx);
+ program = &ctx.get_program(KernelClass::program_name());
+ }
+ return program->get_kernel(kernel_name);
+ }
+
+}
+
+//
+// Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here!
+//
+
+const std::string SVD_BIDIAG_PACK_KERNEL = "bidiag_pack";
+const std::string SVD_HOUSEHOLDER_UPDATE_A_LEFT_KERNEL = "house_update_A_left";
+const std::string SVD_HOUSEHOLDER_UPDATE_A_RIGHT_KERNEL = "house_update_A_right";
+const std::string SVD_HOUSEHOLDER_UPDATE_QL_KERNEL = "house_update_QL";
+const std::string SVD_GIVENS_NEXT_KERNEL = "givens_next";
+const std::string SVD_COPY_COL_KERNEL = "copy_col";
+const std::string SVD_COPY_ROW_KERNEL = "copy_row";
+
+template<typename DestNumericT, typename SrcNumericT>
+void convert(matrix_base<DestNumericT> & dest, matrix_base<SrcNumericT> const & src)
+{
+ assert(dest.row_major() == src.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
+
+ assert(viennacl::traits::opencl_handle(dest).context() == viennacl::traits::opencl_handle(src).context() && bool("Matrices do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+ std::string kernel_name("convert_");
+ kernel_name += dest.row_major() ? "row_" : "col_";
+ kernel_name += viennacl::ocl::type_to_string<DestNumericT>::apply();
+ kernel_name += "_";
+ kernel_name += viennacl::ocl::type_to_string<SrcNumericT>::apply();
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(dest).context());
+ viennacl::linalg::opencl::kernels::matrix_convert::init(ctx);
+ viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::matrix_convert::program_name(), kernel_name);
+
+ viennacl::ocl::enqueue(k( dest, cl_uint(dest.start1()), cl_uint(dest.stride1()), cl_uint(dest.size1()), cl_uint(dest.internal_size1()), cl_uint(dest.start2()), cl_uint(dest.stride2()), cl_uint(dest.size2()), cl_uint(dest.internal_size2()),
+ src, cl_uint( src.start1()), cl_uint( src.stride1()), cl_uint( src.size1()), cl_uint( src.internal_size1()), cl_uint( src.start2()), cl_uint( src.stride2()), cl_uint( src.size2()), cl_uint( src.internal_size2())
+ ) );
+}
+
+//
+// Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here!
+//
+
+template <typename NumericT,
+ typename ScalarT1>
+void am(matrix_base<NumericT> & mat1,
+ matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
+{
+ viennacl::ocl::kernel & k= detail::kernel_for_matrix(mat1, (viennacl::is_cpu_scalar<ScalarT1>::value ? "am_cpu" : "am_gpu"));
+
+ cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat1),
+ cl_uint(viennacl::traits::start1(mat1)), cl_uint(viennacl::traits::start2(mat1)),
+ cl_uint(viennacl::traits::stride1(mat1)), cl_uint(viennacl::traits::stride2(mat1)),
+ cl_uint(viennacl::traits::size1(mat1)), cl_uint(viennacl::traits::size2(mat1)),
+ cl_uint(viennacl::traits::internal_size1(mat1)), cl_uint(viennacl::traits::internal_size2(mat1)),
+
+ viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(alpha)),
+ options_alpha,
+ viennacl::traits::opencl_handle(mat2),
+ cl_uint(viennacl::traits::start1(mat2)), cl_uint(viennacl::traits::start2(mat2)),
+ cl_uint(viennacl::traits::stride1(mat2)), cl_uint(viennacl::traits::stride2(mat2)),
+ cl_uint(viennacl::traits::internal_size1(mat2)), cl_uint(viennacl::traits::internal_size2(mat2))
+ )
+ );
+}
+
+
+template <typename NumericT,
+ typename ScalarT1, typename ScalarT2>
+void ambm(matrix_base<NumericT> & mat1,
+ matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+ matrix_base<NumericT> const & mat3, ScalarT2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
+{
+ std::string kernel_name;
+ if ( viennacl::is_cpu_scalar<ScalarT1>::value && viennacl::is_cpu_scalar<ScalarT2>::value)
+ kernel_name = "ambm_cpu_cpu";
+ else if ( viennacl::is_cpu_scalar<ScalarT1>::value && !viennacl::is_cpu_scalar<ScalarT2>::value)
+ kernel_name = "ambm_cpu_gpu";
+ else if (!viennacl::is_cpu_scalar<ScalarT1>::value && viennacl::is_cpu_scalar<ScalarT2>::value)
+ kernel_name = "ambm_gpu_cpu";
+ else
+ kernel_name = "ambm_gpu_gpu";
+
+ viennacl::ocl::kernel & k = detail::kernel_for_matrix(mat1, kernel_name);
+
+ cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+ cl_uint options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);
+
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat1),
+ cl_uint(viennacl::traits::start1(mat1)), cl_uint(viennacl::traits::start2(mat1)),
+ cl_uint(viennacl::traits::stride1(mat1)), cl_uint(viennacl::traits::stride2(mat1)),
+ cl_uint(viennacl::traits::size1(mat1)), cl_uint(viennacl::traits::size2(mat1)),
+ cl_uint(viennacl::traits::internal_size1(mat1)), cl_uint(viennacl::traits::internal_size2(mat1)),
+
+ viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(alpha)),
+ options_alpha,
+ viennacl::traits::opencl_handle(mat2),
+ cl_uint(viennacl::traits::start1(mat2)), cl_uint(viennacl::traits::start2(mat2)),
+ cl_uint(viennacl::traits::stride1(mat2)), cl_uint(viennacl::traits::stride2(mat2)),
+ cl_uint(viennacl::traits::internal_size1(mat2)), cl_uint(viennacl::traits::internal_size2(mat2)),
+
+ viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(beta)),
+ options_beta,
+ viennacl::traits::opencl_handle(mat3),
+ cl_uint(viennacl::traits::start1(mat3)), cl_uint(viennacl::traits::start2(mat3)),
+ cl_uint(viennacl::traits::stride1(mat3)), cl_uint(viennacl::traits::stride2(mat3)),
+ cl_uint(viennacl::traits::internal_size1(mat3)), cl_uint(viennacl::traits::internal_size2(mat3))
+ )
+ );
+}
+
+
+template <typename NumericT,
+ typename ScalarT1, typename ScalarT2>
+void ambm_m(matrix_base<NumericT> & mat1,
+ matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+ matrix_base<NumericT> const & mat3, ScalarT2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
+{
+ std::string kernel_name;
+ if ( viennacl::is_cpu_scalar<ScalarT1>::value && viennacl::is_cpu_scalar<ScalarT2>::value)
+ kernel_name = "ambm_m_cpu_cpu";
+ else if ( viennacl::is_cpu_scalar<ScalarT1>::value && !viennacl::is_cpu_scalar<ScalarT2>::value)
+ kernel_name = "ambm_m_cpu_gpu";
+ else if (!viennacl::is_cpu_scalar<ScalarT1>::value && viennacl::is_cpu_scalar<ScalarT2>::value)
+ kernel_name = "ambm_m_gpu_cpu";
+ else
+ kernel_name = "ambm_m_gpu_gpu";
+
+ viennacl::ocl::kernel & k = detail::kernel_for_matrix(mat1, kernel_name);
+
+ cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+ cl_uint options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);
+
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat1),
+ cl_uint(viennacl::traits::start1(mat1)), cl_uint(viennacl::traits::start2(mat1)),
+ cl_uint(viennacl::traits::stride1(mat1)), cl_uint(viennacl::traits::stride2(mat1)),
+ cl_uint(viennacl::traits::size1(mat1)), cl_uint(viennacl::traits::size2(mat1)),
+ cl_uint(viennacl::traits::internal_size1(mat1)), cl_uint(viennacl::traits::internal_size2(mat1)),
+
+ viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(alpha)),
+ options_alpha,
+ viennacl::traits::opencl_handle(mat2),
+ cl_uint(viennacl::traits::start1(mat2)), cl_uint(viennacl::traits::start2(mat2)),
+ cl_uint(viennacl::traits::stride1(mat2)), cl_uint(viennacl::traits::stride2(mat2)),
+ cl_uint(viennacl::traits::internal_size1(mat2)), cl_uint(viennacl::traits::internal_size2(mat2)),
+
+ viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(beta)),
+ options_beta,
+ viennacl::traits::opencl_handle(mat3),
+ cl_uint(viennacl::traits::start1(mat3)), cl_uint(viennacl::traits::start2(mat3)),
+ cl_uint(viennacl::traits::stride1(mat3)), cl_uint(viennacl::traits::stride2(mat3)),
+ cl_uint(viennacl::traits::internal_size1(mat3)), cl_uint(viennacl::traits::internal_size2(mat3))
+ )
+ );
+}
+
+template<typename NumericT,
+ typename SizeT, typename DistanceT>
+void trans(const matrix_expression<const matrix_base<NumericT, SizeT, DistanceT>,const matrix_base<NumericT, SizeT, DistanceT>, op_trans> & proxy,
+ matrix_base<NumericT> & temp_trans)
+{
+ std::string kernel_name("trans_kernel");
+ viennacl::ocl::kernel& kernel = detail::legacy_kernel_for_matrix(proxy.lhs(),kernel_name);
+ viennacl::ocl::enqueue(kernel(proxy.lhs(),
+ static_cast<cl_uint>(proxy.lhs().start1()), static_cast<cl_uint>(proxy.lhs().start2()),
+ static_cast<cl_uint>(proxy.lhs().internal_size1()), static_cast<cl_uint>(proxy.lhs().internal_size2()),
+ static_cast<cl_uint>(proxy.lhs().size1()), static_cast<cl_uint>(proxy.lhs().size2()),
+ static_cast<cl_uint>(proxy.lhs().stride1()), static_cast<cl_uint>(proxy.lhs().stride2()),
+
+ temp_trans,
+ static_cast<cl_uint>(temp_trans.start1()), static_cast<cl_uint>(temp_trans.start2()),
+ static_cast<cl_uint>(temp_trans.internal_size1()), static_cast<cl_uint>(temp_trans.internal_size2()),
+ static_cast<cl_uint>(temp_trans.stride1()), static_cast<cl_uint>(temp_trans.stride2())));
+}
+
+template <typename NumericT>
+void matrix_assign(matrix_base<NumericT> & mat, NumericT s, bool clear = false)
+{
+ cl_uint s1 = clear ? cl_uint(viennacl::traits::internal_size1(mat)) : cl_uint(viennacl::traits::size1(mat));
+ cl_uint s2 = clear ? cl_uint(viennacl::traits::internal_size2(mat)) : cl_uint(viennacl::traits::size2(mat));
+
+ viennacl::ocl::kernel & k = detail::kernel_for_matrix(mat, "assign_cpu");
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat),
+ cl_uint(viennacl::traits::start1(mat)), cl_uint(viennacl::traits::start2(mat)),
+ cl_uint(viennacl::traits::stride1(mat)), cl_uint(viennacl::traits::stride2(mat)),
+ s1, s2,
+ cl_uint(viennacl::traits::internal_size1(mat)), cl_uint(viennacl::traits::internal_size2(mat)),
+ viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(s))
+ )
+ );
+}
+
+template <typename NumericT>
+void matrix_diagonal_assign(matrix_base<NumericT> & mat, NumericT s)
+{
+ viennacl::ocl::kernel & k = detail::kernel_for_matrix(mat, "diagonal_assign_cpu");
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat),
+ cl_uint(viennacl::traits::start1(mat)), cl_uint(viennacl::traits::start2(mat)),
+ cl_uint(viennacl::traits::stride1(mat)), cl_uint(viennacl::traits::stride2(mat)),
+ cl_uint(viennacl::traits::size1(mat)), cl_uint(viennacl::traits::size2(mat)),
+ cl_uint(viennacl::traits::internal_size1(mat)), cl_uint(viennacl::traits::internal_size2(mat)),
+ viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(s))
+ )
+ );
+}
+
+template <typename NumericT>
+void matrix_diag_from_vector(const vector_base<NumericT> & vec, int k, matrix_base<NumericT> & mat)
+{
+ // Step 1: set everything to zero
+ matrix_assign(mat, NumericT(0));
+
+ // Step 2: set the diagonal:
+
+ // reuse vector ambm kernel for assigning the elements:
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
+ typedef viennacl::linalg::opencl::kernels::vector<NumericT> KernelClass;
+ KernelClass::init(ctx);
+
+ cl_uint options_alpha = 0;
+ viennacl::ocl::packed_cl_uint size_mat;
+ if (mat.row_major())
+ {
+ vcl_size_t first_row_index = 0;
+ vcl_size_t first_col_index = 0;
+ if (k < 0)
+ first_row_index = vcl_size_t(-k);
+ else
+ first_col_index = vcl_size_t(k);
+ size_mat.start = cl_uint( (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat)
+ + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat));
+ size_mat.stride = cl_uint(viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat));
+ size_mat.size = cl_uint(viennacl::traits::size(vec));
+ size_mat.internal_size = cl_uint(viennacl::traits::internal_size(vec));
+ }
+ else
+ {
+ vcl_size_t first_row_index = 0;
+ vcl_size_t first_col_index = 0;
+ if (k < 0)
+ first_row_index = vcl_size_t(-k);
+ else
+ first_col_index = vcl_size_t(k);
+ size_mat.start = cl_uint( viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)
+ + (viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat));
+ size_mat.stride = cl_uint(viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat));
+ size_mat.size = cl_uint(viennacl::traits::size(vec));
+ size_mat.internal_size = cl_uint(viennacl::traits::internal_size(vec));
+ }
+
+ viennacl::ocl::packed_cl_uint size_vec;
+ size_vec.start = cl_uint(viennacl::traits::start(vec));
+ size_vec.stride = cl_uint(viennacl::traits::stride(vec));
+ size_vec.size = cl_uint(viennacl::traits::size(vec));
+ size_vec.internal_size = cl_uint(viennacl::traits::internal_size(vec));
+
+ viennacl::ocl::kernel & kern = ctx.get_kernel(KernelClass::program_name(), "av_cpu");
+ viennacl::ocl::enqueue(kern(viennacl::traits::opencl_handle(mat),
+ size_mat,
+
+ viennacl::traits::opencl_handle(NumericT(1)),
+ options_alpha,
+ viennacl::traits::opencl_handle(vec),
+ size_vec)
+ );
+}
+
+template <typename NumericT>
+void matrix_diag_to_vector(const matrix_base<NumericT> & mat, int k, vector_base<NumericT> & vec)
+{
+ // reuse vector ambm kernel for assigning the elements:
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
+ typedef viennacl::linalg::opencl::kernels::vector<NumericT> KernelClass;
+ KernelClass::init(ctx);
+
+ cl_uint options_alpha = 0;
+ viennacl::ocl::packed_cl_uint size_mat;
+ if (mat.row_major())
+ {
+ vcl_size_t first_row_index = 0;
+ vcl_size_t first_col_index = 0;
+ if (k < 0)
+ first_row_index = vcl_size_t(-k);
+ else
+ first_col_index = vcl_size_t(k);
+ size_mat.start = cl_uint( (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat)
+ + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat));
+ size_mat.stride = cl_uint(viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat));
+ size_mat.size = cl_uint(viennacl::traits::size(vec));
+ size_mat.internal_size = cl_uint(viennacl::traits::internal_size(vec));
+ }
+ else
+ {
+ vcl_size_t first_row_index = 0;
+ vcl_size_t first_col_index = 0;
+ if (k < 0)
+ first_row_index = vcl_size_t(-k);
+ else
+ first_col_index = vcl_size_t(k);
+ size_mat.start = cl_uint( viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)
+ + (viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat));
+ size_mat.stride = cl_uint(viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat));
+ size_mat.size = cl_uint(viennacl::traits::size(vec));
+ size_mat.internal_size = cl_uint(viennacl::traits::internal_size(vec));
+ }
+
+ viennacl::ocl::packed_cl_uint size_vec;
+ size_vec.start = cl_uint(viennacl::traits::start(vec));
+ size_vec.stride = cl_uint(viennacl::traits::stride(vec));
+ size_vec.size = cl_uint(viennacl::traits::size(vec));
+ size_vec.internal_size = cl_uint(viennacl::traits::internal_size(vec));
+
+
+ viennacl::ocl::kernel & kern = ctx.get_kernel(KernelClass::program_name(), "av_cpu");
+ viennacl::ocl::enqueue(kern(viennacl::traits::opencl_handle(vec),
+ size_vec,
+
+ viennacl::traits::opencl_handle(NumericT(1)),
+ options_alpha,
+ viennacl::traits::opencl_handle(mat),
+ size_mat)
+ );
+}
+
+template <typename NumericT>
+void matrix_row(matrix_base<NumericT> const & mat, unsigned int i, vector_base<NumericT> & vec)
+{
+ // reuse vector ambm kernel for assigning the elements:
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
+ typedef viennacl::linalg::opencl::kernels::vector<NumericT> KernelClass;
+ KernelClass::init(ctx);
+
+ cl_uint options_alpha = 0;
+ viennacl::ocl::packed_cl_uint size_mat;
+ if (mat.row_major())
+ {
+ size_mat.start = cl_uint((viennacl::traits::start1(mat) + i * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat) + viennacl::traits::start2(mat));
+ size_mat.stride = cl_uint(viennacl::traits::stride2(mat));
+ size_mat.size = cl_uint(viennacl::traits::size(vec));
+ size_mat.internal_size = cl_uint(viennacl::traits::internal_size(vec));
+ }
+ else
+ {
+ size_mat.start = cl_uint((viennacl::traits::start1(mat) + i * viennacl::traits::stride1(mat)) + viennacl::traits::start2(mat) * viennacl::traits::internal_size1(mat));
+ size_mat.stride = cl_uint(viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat));
+ size_mat.size = cl_uint(viennacl::traits::size(vec));
+ size_mat.internal_size = cl_uint(viennacl::traits::internal_size(vec));
+ }
+
+ viennacl::ocl::packed_cl_uint size_vec;
+ size_vec.start = cl_uint(viennacl::traits::start(vec));
+ size_vec.stride = cl_uint(viennacl::traits::stride(vec));
+ size_vec.size = cl_uint(viennacl::traits::size(vec));
+ size_vec.internal_size = cl_uint(viennacl::traits::internal_size(vec));
+
+
+ viennacl::ocl::kernel & kern = ctx.get_kernel(KernelClass::program_name(), "av_cpu");
+ viennacl::ocl::enqueue(kern(viennacl::traits::opencl_handle(vec),
+ size_vec,
+
+ viennacl::traits::opencl_handle(NumericT(1)),
+ options_alpha,
+ viennacl::traits::opencl_handle(mat),
+ size_mat)
+ );
+}
+
+template <typename NumericT>
+void matrix_column(const matrix_base<NumericT> & mat, unsigned int j, vector_base<NumericT> & vec)
+{
+ // reuse vector ambm kernel for assigning the elements:
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
+ typedef viennacl::linalg::opencl::kernels::vector<NumericT> KernelClass;
+ KernelClass::init(ctx);
+
+ cl_uint options_alpha = 0;
+ viennacl::ocl::packed_cl_uint size_mat;
+ if (mat.row_major())
+ {
+ size_mat.start = cl_uint(viennacl::traits::start1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::start2(mat) + j * viennacl::traits::stride2(mat));
+ size_mat.stride = cl_uint(viennacl::traits::stride2(mat) * viennacl::traits::internal_size2(mat));
+ size_mat.size = cl_uint(viennacl::traits::size(vec));
+ size_mat.internal_size = cl_uint(viennacl::traits::internal_size(vec));
+ }
+ else
+ {
+ size_mat.start = cl_uint(viennacl::traits::start1(mat) + (viennacl::traits::start2(mat) + j * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat));
+ size_mat.stride = cl_uint(viennacl::traits::stride2(mat));
+ size_mat.size = cl_uint(viennacl::traits::size(vec));
+ size_mat.internal_size = cl_uint(viennacl::traits::internal_size(vec));
+ }
+
+ viennacl::ocl::packed_cl_uint size_vec;
+ size_vec.start = cl_uint(viennacl::traits::start(vec));
+ size_vec.stride = cl_uint(viennacl::traits::stride(vec));
+ size_vec.size = cl_uint(viennacl::traits::size(vec));
+ size_vec.internal_size = cl_uint(viennacl::traits::internal_size(vec));
+
+
+ viennacl::ocl::kernel & kern = ctx.get_kernel(KernelClass::program_name(), "av_cpu");
+ viennacl::ocl::enqueue(kern(viennacl::traits::opencl_handle(vec),
+ size_vec,
+
+ viennacl::traits::opencl_handle(NumericT(1)),
+ options_alpha,
+ viennacl::traits::opencl_handle(mat),
+ size_mat)
+ );
+}
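
The four helpers above (matrix_diag_from_vector, matrix_diag_to_vector, matrix_row, matrix_column) sit behind ViennaCL's diag()/row()/column() convenience wrappers; a hedged sketch of the user-level calls that end up here (types and sizes are illustrative only):

    viennacl::vector<float> v(5);
    viennacl::matrix<float> A(5, 5);

    A = viennacl::diag(v, 0);                             // v becomes the main diagonal of A
    viennacl::vector<float> d = viennacl::diag(A, 1);     // extract the first superdiagonal
    viennacl::vector<float> r = viennacl::row(A, 2u);     // extract row 2
    viennacl::vector<float> c = viennacl::column(A, 3u);  // extract column 3
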
+
+
+//
+///////////////////////// Element-wise operation //////////////////////////////////
+//
+
+// Binary operations A = B .* C and A = B ./ C
+/** @brief Implementation of binary element-wise operations A = OP(B,C)
+*
+* @param A The result matrix (or -range, or -slice)
+* @param proxy The proxy object holding B, C, and the operation
+*/
+template <typename T, typename OP>
+void element_op(matrix_base<T> & A,
+ matrix_expression<const matrix_base<T>, const matrix_base<T>, op_element_binary<OP> > const & proxy)
+{
+ assert(viennacl::traits::opencl_handle(A).context() == viennacl::traits::opencl_handle(proxy.lhs()).context() && bool("Matrices do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+ assert(viennacl::traits::opencl_handle(A).context() == viennacl::traits::opencl_handle(proxy.rhs()).context() && bool("Matrices do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+ viennacl::ocl::kernel & k = detail::kernel_for_matrix(A, "element_op");
+
+ cl_uint op_type = 2; //0: product, 1: division, 2: power
+ if (viennacl::is_division<OP>::value)
+ op_type = 1;
+ else if (viennacl::is_product<OP>::value)
+ op_type = 0;
+
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(A),
+ cl_uint(viennacl::traits::start1(A)), cl_uint(viennacl::traits::start2(A)),
+ cl_uint(viennacl::traits::stride1(A)), cl_uint(viennacl::traits::stride2(A)),
+ cl_uint(viennacl::traits::size1(A)), cl_uint(viennacl::traits::size2(A)),
+ cl_uint(viennacl::traits::internal_size1(A)), cl_uint(viennacl::traits::internal_size2(A)),
+
+ viennacl::traits::opencl_handle(proxy.lhs()),
+ cl_uint(viennacl::traits::start1(proxy.lhs())), cl_uint(viennacl::traits::start2(proxy.lhs())),
+ cl_uint(viennacl::traits::stride1(proxy.lhs())), cl_uint(viennacl::traits::stride2(proxy.lhs())),
+ cl_uint(viennacl::traits::internal_size1(proxy.lhs())), cl_uint(viennacl::traits::internal_size2(proxy.lhs())),
+
+ viennacl::traits::opencl_handle(proxy.rhs()),
+ cl_uint(viennacl::traits::start1(proxy.rhs())), cl_uint(viennacl::traits::start2(proxy.rhs())),
+ cl_uint(viennacl::traits::stride1(proxy.rhs())), cl_uint(viennacl::traits::stride2(proxy.rhs())),
+ cl_uint(viennacl::traits::internal_size1(proxy.rhs())), cl_uint(viennacl::traits::internal_size2(proxy.rhs())),
+
+ op_type)
+ );
+}
+
+
+// Unary operations
+
+/** @brief Implementation of unary element-wise operations A = OP(B)
+*
+* @param A The result matrix (or -range, or -slice)
+* @param proxy The proxy object holding B and the operation
+*/
+template <typename T, typename OP>
+void element_op(matrix_base<T> & A,
+ matrix_expression<const matrix_base<T>, const matrix_base<T>, op_element_unary<OP> > const & proxy)
+{
+ assert(viennacl::traits::opencl_handle(A).context() == viennacl::traits::opencl_handle(proxy.lhs()).context() && bool("Matrices do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+ assert(viennacl::traits::opencl_handle(A).context() == viennacl::traits::opencl_handle(proxy.rhs()).context() && bool("Matrices do not reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+ viennacl::ocl::kernel & k = detail::element_kernel_for_matrix(A, detail::op_to_string(OP()) + "_assign");
+
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(A),
+ cl_uint(viennacl::traits::start1(A)), cl_uint(viennacl::traits::start2(A)),
+ cl_uint(viennacl::traits::stride1(A)), cl_uint(viennacl::traits::stride2(A)),
+ cl_uint(viennacl::traits::size1(A)), cl_uint(viennacl::traits::size2(A)),
+ cl_uint(viennacl::traits::internal_size1(A)), cl_uint(viennacl::traits::internal_size2(A)),
+
+ viennacl::traits::opencl_handle(proxy.lhs()),
+ cl_uint(viennacl::traits::start1(proxy.lhs())), cl_uint(viennacl::traits::start2(proxy.lhs())),
+ cl_uint(viennacl::traits::stride1(proxy.lhs())), cl_uint(viennacl::traits::stride2(proxy.lhs())),
+ cl_uint(viennacl::traits::internal_size1(proxy.lhs())), cl_uint(viennacl::traits::internal_size2(proxy.lhs())))
+ );
+}
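
Both element_op() overloads are reached through the element-wise convenience functions of the front end; a hedged sketch, assuming B and C already hold data:

    viennacl::matrix<float> A(64, 64), B(64, 64), C(64, 64);

    A = viennacl::linalg::element_prod(B, C);   // binary: A = B .* C
    A = viennacl::linalg::element_div(B, C);    // binary: A = B ./ C
    A = viennacl::linalg::element_exp(B);       // unary:  A = exp(B), applied elementwise
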
+
+
+//
+///////////////////////// matrix-vector products /////////////////////////////////
+//
+
+// A * x
+
+/** @brief Carries out matrix-vector multiplication
+*
+* Implementation of the convenience expression result = prod(mat, vec);
+*
+* @param mat     The matrix
+* @param trans_A Whether mat is used in transposed form
+* @param vec     The vector
+* @param result  The result vector
+*/
+template <typename NumericT>
+void prod_impl(const matrix_base<NumericT> & mat, bool trans_A,
+ const vector_base<NumericT> & vec,
+ vector_base<NumericT> & result)
+{
+ assert(viennacl::traits::handle(vec) != viennacl::traits::handle(result) && bool("No direct inplace transposed matrix-vector product possible. Introduce a temporary!"));
+
+ viennacl::ocl::kernel & k = detail::kernel_for_matrix(mat, trans_A ? "trans_vec_mul" : "vec_mul");
+
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat),
+ cl_uint(viennacl::traits::start1(mat)), cl_uint(viennacl::traits::start2(mat)),
+ cl_uint(viennacl::traits::stride1(mat)), cl_uint(viennacl::traits::stride2(mat)),
+ cl_uint(viennacl::traits::size1(mat)), cl_uint(viennacl::traits::size2(mat)),
+ cl_uint(viennacl::traits::internal_size1(mat)), cl_uint(viennacl::traits::internal_size2(mat)),
+
+ viennacl::traits::opencl_handle(vec),
+ cl_uint(viennacl::traits::start(vec)),
+ cl_uint(viennacl::traits::stride(vec)),
+ cl_uint(viennacl::traits::size(vec)),
+
+ viennacl::traits::opencl_handle(result),
+ cl_uint(viennacl::traits::start(result)),
+ cl_uint(viennacl::traits::stride(result)),
+ cl_uint(viennacl::traits::size(result)),
+
+ viennacl::ocl::local_mem(sizeof(NumericT) * k.local_work_size())
+ ) );
+}
+
+
+//
+
+
+/** @brief Carries out matrix-matrix multiplication
+*
+* Implementation of C = prod(A, B);
+*
+*/
+template<typename NumericT, typename ScalarType >
+void prod_impl(matrix_base<NumericT> const & A, bool A_trans,
+ matrix_base<NumericT> const & B, bool B_trans,
+ matrix_base<NumericT> & C,
+ ScalarType alpha,
+ ScalarType beta)
+{
+ bool effective_A_trans = A_trans ^ A.row_major();
+ bool effective_B_trans = B_trans ^ B.row_major();
+
+ char cAt = effective_A_trans ? 'T' : 'N';
+ char cBt = effective_B_trans ? 'T' : 'N';
+
+ std::string kernel_prefix("prod_");
+ kernel_prefix+=cAt;
+ kernel_prefix+=cBt;
+
+ scheduler::statement statement = scheduler::preset::mat_mat_prod(alpha, &A, effective_A_trans, &B, effective_B_trans, beta, &C);
+ kernels::matrix_prod<NumericT>::execution_handler(C.row_major(), viennacl::traits::opencl_context(C)).execute(kernel_prefix, statement);
+}
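
Both prod_impl() overloads above are dispatched from the usual prod() front end; a hedged sketch of calls that end up here (dimensions are illustrative):

    viennacl::matrix<float> A(128, 64), B(64, 32), C(128, 32);
    viennacl::vector<float> x(64), y(128);

    y = viennacl::linalg::prod(A, x);                      // matrix-vector product
    x = viennacl::linalg::prod(viennacl::trans(A), y);     // transposed matrix-vector product
    C = viennacl::linalg::prod(A, B);                      // matrix-matrix product
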
+
+//
+///////////////////////// miscellaneous operations /////////////////////////////////
+//
+
+
+/** @brief The implementation of the operation mat += alpha * vec1 * vec2^T, i.e. a scaled rank 1 update
+*
+* Implementation of the convenience expression result += alpha * outer_prod(vec1, vec2);
+*
+* @param A The matrix to be updated
+* @param alpha The scaling factor (either a viennacl::scalar<>, float, or double)
+* @param len_alpha Length of the buffer for an eventual final reduction step (currently always '1')
+* @param reciprocal_alpha Use 1/alpha instead of alpha
+* @param flip_sign_alpha Use -alpha instead of alpha
+* @param vec1 The first vector
+* @param vec2 The second vector
+*/
+template<typename NumericT, typename ScalarT1>
+void scaled_rank_1_update(matrix_base<NumericT> & A,
+ ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+ const vector_base<NumericT> & vec1,
+ const vector_base<NumericT> & vec2)
+{
+ assert( (viennacl::traits::size1(A) == viennacl::traits::size(vec1)) && bool("Size mismatch in scaled_rank_1_update: size1(A) != size(v1)"));
+ assert( (viennacl::traits::size2(A) == viennacl::traits::size(vec2)) && bool("Size mismatch in scaled_rank_1_update: size2(A) != size(v2)"));
+
+ cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+ bool is_cpu = viennacl::is_cpu_scalar<ScalarT1>::value;
+ viennacl::ocl::kernel& kernel= detail::legacy_kernel_for_matrix(A, is_cpu ? "scaled_rank1_update_cpu" : "scaled_rank1_update_gpu");
+
+ viennacl::ocl::enqueue(kernel(viennacl::traits::opencl_handle(A),
+ cl_uint(viennacl::traits::start1(A)), cl_uint(viennacl::traits::start2(A)),
+ cl_uint(viennacl::traits::stride1(A)), cl_uint(viennacl::traits::stride2(A)),
+ cl_uint(viennacl::traits::size1(A)), cl_uint(viennacl::traits::size2(A)),
+ cl_uint(viennacl::traits::internal_size1(A)), cl_uint(viennacl::traits::internal_size2(A)),
+
+ viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(alpha)),
+ options_alpha,
+
+ viennacl::traits::opencl_handle(vec1),
+ cl_uint(viennacl::traits::start(vec1)),
+ cl_uint(viennacl::traits::stride(vec1)),
+ cl_uint(viennacl::traits::size(vec1)),
+
+ viennacl::traits::opencl_handle(vec2),
+ cl_uint(viennacl::traits::start(vec2)),
+ cl_uint(viennacl::traits::stride(vec2)),
+ cl_uint(viennacl::traits::size(vec2))
+ )
+ );
+}
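
User code reaches scaled_rank_1_update() through outer_prod(), exactly as the comment above states; a hedged sketch:

    viennacl::matrix<float> A(100, 80);
    viennacl::vector<float> v1(100), v2(80);
    float alpha = 0.5f;

    A += alpha * viennacl::linalg::outer_prod(v1, v2);   // scaled rank-1 update of A
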
+
+//
+template <typename SCALARTYPE, typename VectorType>
+void bidiag_pack_svd(viennacl::matrix<SCALARTYPE>& A,
+ VectorType & dh,
+ VectorType & sh
+ )
+{
+ viennacl::vector<SCALARTYPE> D(dh.size());
+ viennacl::vector<SCALARTYPE> S(sh.size());
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_BIDIAG_PACK_KERNEL);
+
+ viennacl::ocl::enqueue(kernel(
+ A,
+ D,
+ S,
+ static_cast<cl_uint>(A.size1()),
+ static_cast<cl_uint>(A.size2()),
+ static_cast<cl_uint>(A.internal_size2())
+ ));
+
+ fast_copy(D, dh);
+ fast_copy(S, sh);
+}
+
+
+template <typename NumericT>
+void bidiag_pack(matrix_base<NumericT> & A,
+ viennacl::vector<NumericT> & dh,
+ viennacl::vector<NumericT> & sh
+ )
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+
+ if(A.row_major())
+ {
+ viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::init(ctx);
+ viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::program_name(), SVD_BIDIAG_PACK_KERNEL);
+
+ viennacl::ocl::enqueue(kernel(
+ A,
+ dh,
+ sh,
+ cl_uint(viennacl::traits::size1(A)),
+ cl_uint(viennacl::traits::size2(A)),
+ cl_uint(viennacl::traits::internal_size2(A))
+ ));
+ }
+ else
+ {
+ viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::init(ctx);
+ viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::program_name(), SVD_BIDIAG_PACK_KERNEL);
+
+ viennacl::ocl::enqueue(kernel(
+ A,
+ dh,
+ sh,
+ cl_uint(viennacl::traits::size1(A)),
+ cl_uint(viennacl::traits::size2(A)),
+ cl_uint(viennacl::traits::internal_size2(A))
+ ));
+ }
+}
+
+
+template <typename NumericT>
+void house_update_A_left(matrix_base<NumericT> & A,
+ vector_base<NumericT> & D,
+ vcl_size_t start)
+{
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ if(A.row_major())
+ {
+ viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::init(ctx);
+ viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::program_name(), SVD_HOUSEHOLDER_UPDATE_A_LEFT_KERNEL);
+ viennacl::ocl::enqueue(kernel(
+ A,
+ D,
+ static_cast<cl_uint>(start + 1),
+ static_cast<cl_uint>(start),
+ cl_uint(viennacl::traits::size1(A)),
+ cl_uint(viennacl::traits::size2(A)),
+ cl_uint(viennacl::traits::internal_size2(A)),
+ viennacl::ocl::local_mem(static_cast<cl_uint>(128 * 4))
+ ));
+ }
+ else
+ {
+ viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::init(ctx);
+ viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::program_name(), SVD_HOUSEHOLDER_UPDATE_A_LEFT_KERNEL);
+ viennacl::ocl::enqueue(kernel(
+ A,
+ D,
+ static_cast<cl_uint>(start + 1),
+ static_cast<cl_uint>(start),
+ cl_uint(viennacl::traits::size1(A)),
+ cl_uint(viennacl::traits::size2(A)),
+ cl_uint(viennacl::traits::internal_size2(A)),
+ viennacl::ocl::local_mem(static_cast<cl_uint>(128 * 4))
+ ));
+ }
+
+}
+
+template <typename NumericT>
+void house_update_A_right(matrix_base<NumericT> & A,
+ vector_base<NumericT> & D)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+
+ if(A.row_major())
+ {
+ viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::init(ctx);
+ viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::program_name(), SVD_HOUSEHOLDER_UPDATE_A_RIGHT_KERNEL);
+
+ viennacl::ocl::enqueue(kernel(
+ A,
+ D,
+ static_cast<cl_uint>(0),
+ static_cast<cl_uint>(0),
+ cl_uint(viennacl::traits::size1(A)),
+ cl_uint(viennacl::traits::size2(A)),
+ cl_uint(viennacl::traits::internal_size2(A)),
+ viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(NumericT)))
+ ));
+ }
+ else
+ {
+ viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::init(ctx);
+ viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::program_name(), SVD_HOUSEHOLDER_UPDATE_A_RIGHT_KERNEL);
+
+ viennacl::ocl::enqueue(kernel(
+ A,
+ D,
+ static_cast<cl_uint>(0),
+ static_cast<cl_uint>(0),
+ cl_uint(viennacl::traits::size1(A)),
+ cl_uint(viennacl::traits::size2(A)),
+ cl_uint(viennacl::traits::internal_size2(A)),
+ viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(NumericT)))
+ ));
+ }
+
+
+}
+
+
+
+template <typename NumericT>
+void house_update_QL(matrix_base<NumericT> & Q,
+ vector_base<NumericT> & D,
+ vcl_size_t A_size1)
+
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(Q).context());
+
+ if(Q.row_major())
+ {
+ viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::init(ctx);
+ viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::program_name(), SVD_HOUSEHOLDER_UPDATE_QL_KERNEL);
+
+ viennacl::ocl::enqueue(kernel(
+ Q,
+ D,
+ cl_uint(A_size1),
+ cl_uint(viennacl::traits::internal_size2(Q)),
+ viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(NumericT)))
+ ));
+ }
+ else
+ {
+ viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::init(ctx);
+ viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::program_name(), SVD_HOUSEHOLDER_UPDATE_QL_KERNEL);
+
+ viennacl::ocl::enqueue(kernel(
+ Q,
+ D,
+ cl_uint(A_size1),
+ cl_uint(viennacl::traits::internal_size2(Q)),
+ viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(NumericT)))
+ ));
+ }
+
+}
+
+
+template<typename NumericT>
+ void givens_next(matrix_base<NumericT> & matrix,
+ vector_base<NumericT>& tmp1,
+ vector_base<NumericT>& tmp2,
+ int l,
+ int m
+ )
+ {
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(matrix).context());
+
+ if(matrix.row_major())
+ {
+ viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::init(ctx);
+ viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::program_name(), SVD_GIVENS_NEXT_KERNEL);
+ kernel.global_work_size(0, viennacl::tools::align_to_multiple<cl_uint>(cl_uint(viennacl::traits::size1(matrix)), 256));
+ kernel.local_work_size(0, 256);
+
+ viennacl::ocl::enqueue(kernel(
+ matrix,
+ tmp1,
+ tmp2,
+ cl_uint(viennacl::traits::size1(matrix)),
+ cl_uint(viennacl::traits::internal_size2(matrix)),
+ static_cast<cl_uint>(l),
+ static_cast<cl_uint>(m - 1)
+ ));
+ }
+ else
+ {
+ viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::init(ctx);
+ viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::program_name(), SVD_GIVENS_NEXT_KERNEL);
+ kernel.global_work_size(0, viennacl::tools::align_to_multiple<cl_uint>(cl_uint(viennacl::traits::size1(matrix)), 256));
+ kernel.local_work_size(0, 256);
+
+ viennacl::ocl::enqueue(kernel(
+ matrix,
+ tmp1,
+ tmp2,
+ cl_uint(viennacl::traits::size1(matrix)),
+ cl_uint(viennacl::traits::internal_size2(matrix)),
+ static_cast<cl_uint>(l),
+ static_cast<cl_uint>(m - 1)
+ ));
+ }
+
+
+ }
+
+ template <typename NumericT>
+ void copy_vec(matrix_base<NumericT>& A,
+ vector_base<NumericT> & V,
+ vcl_size_t row_start,
+ vcl_size_t col_start,
+ bool copy_col
+ )
+ {
+ std::string kernel_name = copy_col ? SVD_COPY_COL_KERNEL : SVD_COPY_ROW_KERNEL;
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+
+ if(A.row_major())
+ {
+ viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::init(ctx);
+ viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::program_name(), kernel_name);
+
+ viennacl::ocl::enqueue(kernel(
+ A,
+ V,
+ static_cast<cl_uint>(row_start),
+ static_cast<cl_uint>(col_start),
+ copy_col ? cl_uint(viennacl::traits::size1(A))
+ : cl_uint(viennacl::traits::size2(A)),
+ static_cast<cl_uint>(A.internal_size2())
+ ));
+ }
+ else
+ {
+ viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::init(ctx);
+ viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::program_name(), kernel_name);
+
+ viennacl::ocl::enqueue(kernel(
+ A,
+ V,
+ static_cast<cl_uint>(row_start),
+ static_cast<cl_uint>(col_start),
+ copy_col ? cl_uint(viennacl::traits::size1(A))
+ : cl_uint(viennacl::traits::size2(A)),
+ static_cast<cl_uint>(A.internal_size2())
+ ));
+ }
+
+
+ }
+
+} // namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/misc_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/misc_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/misc_operations.hpp
new file mode 100644
index 0000000..83a3db7
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/misc_operations.hpp
@@ -0,0 +1,69 @@
+#ifndef VIENNACL_LINALG_OPENCL_MISC_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_MISC_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/opencl/misc_operations.hpp
+ @brief Implementations of operations using compressed_matrix and OpenCL
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/opencl/kernels/ilu.hpp"
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace detail
+{
+
+template<typename NumericT>
+void level_scheduling_substitute(vector<NumericT> & x,
+ viennacl::backend::mem_handle const & row_index_array,
+ viennacl::backend::mem_handle const & row_buffer,
+ viennacl::backend::mem_handle const & col_buffer,
+ viennacl::backend::mem_handle const & element_buffer,
+ vcl_size_t num_rows
+ )
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(x).context());
+
+ viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx);
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "level_scheduling_substitute");
+
+ viennacl::ocl::enqueue(k(row_index_array.opencl_handle(), row_buffer.opencl_handle(), col_buffer.opencl_handle(), element_buffer.opencl_handle(),
+ x,
+ static_cast<cl_uint>(num_rows)));
+}
+
+} //namespace detail
+} // namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
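For context, the level_scheduling_substitute() wrapper above launches one parallel sweep of an ILU level-scheduling substitution: all rows inside one level are mutually independent, so the kernel updates them concurrently, while the caller steps through the levels in order. The host-side C++ sketch below is illustrative only (not part of this patch or of ViennaCL; the names levels/row_ptr/cols/vals are invented) and shows the same idea for a unit-lower-triangular CSR factor:

// Illustrative host-side sketch of level-scheduled forward substitution for a
// unit lower triangular CSR matrix (strictly lower part stored). Names are
// hypothetical and not part of ViennaCL.
#include <vector>
#include <cstddef>

void level_scheduled_substitute(std::vector<std::vector<std::size_t> > const & levels, // rows grouped by level
                                std::vector<std::size_t> const & row_ptr,              // CSR row pointers
                                std::vector<std::size_t> const & cols,                 // CSR column indices
                                std::vector<double> const & vals,                      // CSR values (strict lower part)
                                std::vector<double> & x)                               // right-hand side, overwritten in place
{
  for (std::size_t lvl = 0; lvl < levels.size(); ++lvl)   // levels must run in order
  {
    // every row within one level depends only on rows of earlier levels,
    // so this inner loop is what the GPU kernel executes in parallel:
    for (std::size_t k = 0; k < levels[lvl].size(); ++k)
    {
      std::size_t row = levels[lvl][k];
      double s = x[row];
      for (std::size_t j = row_ptr[row]; j < row_ptr[row + 1]; ++j)
        s -= vals[j] * x[cols[j]];
      x[row] = s;                                         // unit diagonal: no division needed
    }
  }
}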
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/nmf_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/nmf_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/nmf_operations.hpp
new file mode 100644
index 0000000..5daf297
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/nmf_operations.hpp
@@ -0,0 +1,139 @@
+#ifndef VIENNACL_LINALG_OPENCL_NMF_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_NMF_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+
+/** @file viennacl/linalg/opencl/nmf_operations.hpp
+ @brief Implementations of NMF operations using OpenCL
+ */
+
+#include "viennacl/linalg/opencl/kernels/nmf.hpp"
+#include "viennacl/linalg/opencl/nmf_operations.hpp"
+
+#include "viennacl/linalg/host_based/nmf_operations.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+
+/** @brief The nonnegative matrix factorization (approximation) algorithm as suggested by Lee and Seung. Factorizes a matrix V with nonnegative entries into matrices W and H such that ||V - W*H|| is minimized.
+ *
+ * @param V Input matrix
+ * @param W First factor
+ * @param H Second factor
+ * @param conf A configuration object holding tolerances and the like
+ */
+template<typename NumericT>
+void nmf(viennacl::matrix_base<NumericT> const & V,
+ viennacl::matrix_base<NumericT> & W,
+ viennacl::matrix_base<NumericT> & H,
+ viennacl::linalg::nmf_config const & conf)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(V).context());
+
+ const std::string NMF_MUL_DIV_KERNEL = "el_wise_mul_div";
+
+ viennacl::linalg::opencl::kernels::nmf<NumericT>::init(ctx);
+
+ vcl_size_t k = W.size2();
+ conf.iters_ = 0;
+
+ if (viennacl::linalg::norm_frobenius(W) <= 0)
+ W = viennacl::scalar_matrix<NumericT>(W.size1(), W.size2(), NumericT(1), ctx);
+
+ if (viennacl::linalg::norm_frobenius(H) <= 0)
+ H = viennacl::scalar_matrix<NumericT>(H.size1(), H.size2(), NumericT(1), ctx);
+
+ viennacl::matrix_base<NumericT> wn(V.size1(), k, W.row_major(), ctx);
+ viennacl::matrix_base<NumericT> wd(V.size1(), k, W.row_major(), ctx);
+ viennacl::matrix_base<NumericT> wtmp(V.size1(), V.size2(), W.row_major(), ctx);
+
+ viennacl::matrix_base<NumericT> hn(k, V.size2(), H.row_major(), ctx);
+ viennacl::matrix_base<NumericT> hd(k, V.size2(), H.row_major(), ctx);
+ viennacl::matrix_base<NumericT> htmp(k, k, H.row_major(), ctx);
+
+ viennacl::matrix_base<NumericT> appr(V.size1(), V.size2(), V.row_major(), ctx);
+
+ NumericT last_diff = 0;
+ NumericT diff_init = 0;
+ bool stagnation_flag = false;
+
+ for (vcl_size_t i = 0; i < conf.max_iterations(); i++)
+ {
+ conf.iters_ = i + 1;
+ {
+ hn = viennacl::linalg::prod(trans(W), V);
+ htmp = viennacl::linalg::prod(trans(W), W);
+ hd = viennacl::linalg::prod(htmp, H);
+
+ viennacl::ocl::kernel & mul_div_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::nmf<NumericT>::program_name(), NMF_MUL_DIV_KERNEL);
+ viennacl::ocl::enqueue(mul_div_kernel(H, hn, hd, cl_uint(H.internal_size1() * H.internal_size2())));
+ }
+ {
+ wn = viennacl::linalg::prod(V, trans(H));
+ wtmp = viennacl::linalg::prod(W, H);
+ wd = viennacl::linalg::prod(wtmp, trans(H));
+
+ viennacl::ocl::kernel & mul_div_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::nmf<NumericT>::program_name(), NMF_MUL_DIV_KERNEL);
+
+ viennacl::ocl::enqueue(mul_div_kernel(W, wn, wd, cl_uint(W.internal_size1() * W.internal_size2())));
+ }
+
+ if (i % conf.check_after_steps() == 0) //check for convergence
+ {
+ appr = viennacl::linalg::prod(W, H);
+
+ appr -= V;
+ NumericT diff_val = viennacl::linalg::norm_frobenius(appr);
+
+ if (i == 0)
+ diff_init = diff_val;
+
+ if (conf.print_relative_error())
+ std::cout << diff_val / diff_init << std::endl;
+
+ // Approximation check
+ if (diff_val / diff_init < conf.tolerance())
+ break;
+
+ // Stagnation check
+ if (std::fabs(diff_val - last_diff) / (diff_val * NumericT(conf.check_after_steps())) < conf.stagnation_tolerance()) //avoid situations where convergence stagnates
+ {
+ if (stagnation_flag) // iteration stagnates (two iterates with no notable progress)
+ break;
+ else
+ // record stagnation in this iteration
+ stagnation_flag = true;
+ } else
+ // good progress in this iteration, so unset stagnation flag
+ stagnation_flag = false;
+
+ // prepare for next iterate:
+ last_diff = diff_val;
+ }
+ }
+}
+
+} //namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+#endif /* VIENNACL_LINALG_OPENCL_NMF_OPERATIONS_HPP_ */
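The opencl::nmf() backend above is normally reached through the public entry point viennacl::linalg::nmf(). A minimal usage sketch, assuming ViennaCL's documented matrix copy() overload for std::vector<std::vector<T> > and the nmf_config defaults; sizes and values below are invented for illustration:

#include <vector>
#include "viennacl/matrix.hpp"
#include "viennacl/linalg/nmf.hpp"

int main()
{
  typedef float ScalarType;

  // small nonnegative 4x3 input matrix, factorized with inner dimension k = 2
  std::vector<std::vector<ScalarType> > v_cpu(4, std::vector<ScalarType>(3, ScalarType(1)));
  v_cpu[0][1] = 2.0f; v_cpu[2][2] = 3.0f;

  viennacl::matrix<ScalarType> V(4, 3), W(4, 2), H(2, 3);
  viennacl::copy(v_cpu, V);

  // W and H may be left zero-initialized: the backend replaces an all-zero
  // factor with an all-ones matrix before iterating (see the code above).
  viennacl::linalg::nmf_config conf;     // default tolerances and iteration limits
  viennacl::linalg::nmf(V, W, H, conf);  // afterwards V is approximately W * H
  return 0;
}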
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/scalar_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/scalar_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/scalar_operations.hpp
new file mode 100644
index 0000000..a94681f
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/scalar_operations.hpp
@@ -0,0 +1,205 @@
+#ifndef VIENNACL_LINALG_OPENCL_SCALAR_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_SCALAR_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/opencl/scalar_operations.hpp
+ @brief Implementations of scalar operations using OpenCL
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/opencl/kernels/scalar.hpp"
+#include "viennacl/linalg/opencl/common.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+#include "viennacl/traits/handle.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+
+template<typename ScalarT1,
+ typename ScalarT2, typename NumericT>
+typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
+ && viennacl::is_scalar<ScalarT2>::value
+ && viennacl::is_any_scalar<NumericT>::value
+ >::type
+as(ScalarT1 & s1,
+ ScalarT2 const & s2, NumericT const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
+{
+ assert( &viennacl::traits::opencl_handle(s1).context() == &viennacl::traits::opencl_handle(s2).context() && bool("Operands not in the same OpenCL context!"));
+
+ typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type value_type;
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(s1).context());
+ viennacl::linalg::opencl::kernels::scalar<value_type>::init(ctx);
+
+ cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+ bool is_cpu = viennacl::is_cpu_scalar<NumericT>::value;
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::scalar<value_type>::program_name(), is_cpu ? "as_cpu" : "as_gpu");
+ k.local_work_size(0, 1);
+ k.global_work_size(0, 1);
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(s1),
+ viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(alpha)),
+ options_alpha,
+ viennacl::traits::opencl_handle(s2) )
+ );
+}
+
+
+template<typename ScalarT1,
+ typename ScalarT2, typename NumericT2,
+ typename ScalarT3, typename NumericT3>
+typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
+ && viennacl::is_scalar<ScalarT2>::value
+ && viennacl::is_scalar<ScalarT3>::value
+ && viennacl::is_any_scalar<NumericT2>::value
+ && viennacl::is_any_scalar<NumericT3>::value
+ >::type
+asbs(ScalarT1 & s1,
+ ScalarT2 const & s2, NumericT2 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+ ScalarT3 const & s3, NumericT3 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
+{
+ assert( &viennacl::traits::opencl_handle(s1).context() == &viennacl::traits::opencl_handle(s2).context() && bool("Operands not in the same OpenCL context!"));
+ assert( &viennacl::traits::opencl_handle(s2).context() == &viennacl::traits::opencl_handle(s3).context() && bool("Operands not in the same OpenCL context!"));
+
+ typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type value_type;
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(s1).context());
+ viennacl::linalg::opencl::kernels::scalar<value_type>::init(ctx);
+
+ std::string kernel_name;
+ bool is_cpu_2 = viennacl::is_cpu_scalar<NumericT2>::value;
+ bool is_cpu_3 = viennacl::is_cpu_scalar<NumericT3>::value;
+ if (is_cpu_2 && is_cpu_3)
+ kernel_name = "asbs_cpu_cpu";
+ else if (is_cpu_2 && !is_cpu_3)
+ kernel_name = "asbs_cpu_gpu";
+ else if (!is_cpu_2 && is_cpu_3)
+ kernel_name = "asbs_gpu_cpu";
+ else
+ kernel_name = "asbs_gpu_gpu";
+
+ cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+ cl_uint options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::scalar<value_type>::program_name(), kernel_name);
+ k.local_work_size(0, 1);
+ k.global_work_size(0, 1);
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(s1),
+ viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(alpha)),
+ options_alpha,
+ viennacl::traits::opencl_handle(s2),
+ viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(beta)),
+ options_beta,
+ viennacl::traits::opencl_handle(s3) )
+ );
+}
+
+
+template<typename ScalarT1,
+ typename ScalarT2, typename NumericT2,
+ typename ScalarT3, typename NumericT3>
+typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
+ && viennacl::is_scalar<ScalarT2>::value
+ && viennacl::is_scalar<ScalarT3>::value
+ && viennacl::is_any_scalar<NumericT2>::value
+ && viennacl::is_any_scalar<NumericT3>::value
+ >::type
+asbs_s(ScalarT1 & s1,
+ ScalarT2 const & s2, NumericT2 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+ ScalarT3 const & s3, NumericT3 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
+{
+ assert( &viennacl::traits::opencl_handle(s1).context() == &viennacl::traits::opencl_handle(s2).context() && bool("Operands not in the same OpenCL context!"));
+ assert( &viennacl::traits::opencl_handle(s2).context() == &viennacl::traits::opencl_handle(s3).context() && bool("Operands not in the same OpenCL context!"));
+
+ typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type value_type;
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(s1).context());
+ viennacl::linalg::opencl::kernels::scalar<value_type>::init(ctx);
+
+ std::string kernel_name;
+ if (viennacl::is_cpu_scalar<NumericT2>::value && viennacl::is_cpu_scalar<NumericT3>::value)
+ kernel_name = "asbs_s_cpu_cpu";
+ else if (viennacl::is_cpu_scalar<NumericT2>::value && !viennacl::is_cpu_scalar<NumericT3>::value)
+ kernel_name = "asbs_s_cpu_gpu";
+ else if (!viennacl::is_cpu_scalar<NumericT2>::value && viennacl::is_cpu_scalar<NumericT3>::value)
+ kernel_name = "asbs_s_gpu_cpu";
+ else
+ kernel_name = "asbs_s_gpu_gpu";
+
+ cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+ cl_uint options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::scalar<value_type>::program_name(), kernel_name);
+ k.local_work_size(0, 1);
+ k.global_work_size(0, 1);
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(s1),
+ viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(alpha)),
+ options_alpha,
+ viennacl::traits::opencl_handle(s2),
+ viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(beta)),
+ options_beta,
+ viennacl::traits::opencl_handle(s3) )
+ );
+}
+
+
+/** @brief Swaps the contents of two scalars, data is copied
+*
+* @param s1 The first scalar
+* @param s2 The second scalar
+*/
+template<typename ScalarT1, typename ScalarT2>
+typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
+ && viennacl::is_scalar<ScalarT2>::value
+ >::type
+swap(ScalarT1 & s1, ScalarT2 & s2)
+{
+ assert( &viennacl::traits::opencl_handle(s1).context() == &viennacl::traits::opencl_handle(s2).context() && bool("Operands not in the same OpenCL context!"));
+
+ typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type value_type;
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(s1).context());
+ viennacl::linalg::opencl::kernels::scalar<value_type>::init(ctx);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::scalar<value_type>::program_name(), "swap");
+ k.local_work_size(0, 1);
+ k.global_work_size(0, 1);
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(s1),
+ viennacl::traits::opencl_handle(s2))
+ );
+}
+
+
+
+} //namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
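The as()/asbs()/asbs_s() overloads above serve ordinary expressions on viennacl::scalar objects, with the _cpu/_gpu kernel variant chosen by whether each coefficient lives on the host or on the device. Below is a hedged sketch of the kind of expressions involved, assuming the operator overloads documented for viennacl::scalar; the kernel names in the comments are indicative only:

#include "viennacl/scalar.hpp"

int main()
{
  viennacl::scalar<float> s1 = 0.0f, s2 = 2.0f, s3 = 3.0f;
  float alpha = 1.5f;                    // host ("cpu") coefficient
  viennacl::scalar<float> beta = 0.5f;   // device ("gpu") coefficient

  s1 = alpha * s2;               // single scaled assignment -> as(...), "as_cpu"-style kernel
  s1 = alpha * s2 + beta * s3;   // two scaled operands      -> asbs(...), "asbs_cpu_gpu"-style kernel
  s1 += alpha * s2 + beta * s3;  // accumulating variant     -> asbs_s(...)
  return 0;
}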
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/sparse_matrix_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/sparse_matrix_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/sparse_matrix_operations.hpp
new file mode 100644
index 0000000..51d99e1
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/sparse_matrix_operations.hpp
@@ -0,0 +1,2809 @@
+#ifndef VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/sparse_matrix_operations.hpp
+ @brief Implementations of operations using sparse matrices using CUDA
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/cuda/common.hpp"
+#include "viennacl/linalg/cuda/vector_operations.hpp"
+
+#include "viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp"
+
+//#ifdef VIENNACL_WITH_SPGEMM_RMERGE
+ #include "viennacl/linalg/cuda/spgemm_rmerge.hpp"
+//#else
+// #include "viennacl/linalg/cuda/spgemm.hpp"
+//#endif
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+//
+// Compressed matrix
+//
+
+namespace detail
+{
+
+ template<typename NumericT>
+ __global__ void csr_row_info_extractor_kernel(
+ const unsigned int * row_indices,
+ const unsigned int * column_indices,
+ const NumericT * elements,
+ NumericT * result,
+ unsigned int size,
+ unsigned int option)
+ {
+ for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
+ row < size;
+ row += gridDim.x * blockDim.x)
+ {
+ NumericT value = 0;
+ unsigned int row_end = row_indices[row+1];
+
+ switch (option)
+ {
+ case 0: //inf-norm
+ for (unsigned int i = row_indices[row]; i < row_end; ++i)
+ value = max(value, fabs(elements[i]));
+ break;
+
+ case 1: //1-norm
+ for (unsigned int i = row_indices[row]; i < row_end; ++i)
+ value += fabs(elements[i]);
+ break;
+
+ case 2: //2-norm
+ for (unsigned int i = row_indices[row]; i < row_end; ++i)
+ value += elements[i] * elements[i];
+ value = sqrt(value);
+ break;
+
+ case 3: //diagonal entry
+ for (unsigned int i = row_indices[row]; i < row_end; ++i)
+ {
+ if (column_indices[i] == row)
+ {
+ value = elements[i];
+ break;
+ }
+ }
+ break;
+
+ default:
+ break;
+ }
+ result[row] = value;
+ }
+ }
+
+
+  template<typename NumericT, unsigned int AlignmentV>
+  void row_info(compressed_matrix<NumericT, AlignmentV> const & mat,
+ vector_base<NumericT> & vec,
+ viennacl::linalg::detail::row_info_types info_selector)
+ {
+ csr_row_info_extractor_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
+ viennacl::cuda_arg<unsigned int>(mat.handle2()),
+ viennacl::cuda_arg<NumericT>(mat.handle()),
+ viennacl::cuda_arg(vec),
+ static_cast<unsigned int>(mat.size1()),
+ static_cast<unsigned int>(info_selector)
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("csr_row_info_extractor_kernel");
+ }
+
+ struct spmv_pure
+ {
+ template<typename NumericT>
+ __device__ static void apply(NumericT & result, NumericT alpha, NumericT Ax, NumericT beta) { result = Ax; }
+ };
+
+ struct spmv_alpha_beta
+ {
+ template<typename NumericT>
+ __device__ static void apply(NumericT & result, NumericT alpha, NumericT Ax, NumericT beta) { result = alpha * Ax + ((beta != 0) ? beta * result : 0); }
+ };
+
+} //namespace detail
+
+
+
+template<unsigned int SubWarpSizeV, typename AlphaBetaHandlerT, typename NumericT>
+__global__ void compressed_matrix_vec_mul_kernel(
+ const unsigned int * row_indices,
+ const unsigned int * column_indices,
+ const NumericT * elements,
+ const NumericT * x,
+ unsigned int start_x,
+ unsigned int inc_x,
+ NumericT alpha,
+ NumericT * result,
+ unsigned int start_result,
+ unsigned int inc_result,
+ unsigned int size_result,
+ NumericT beta)
+{
+ __shared__ NumericT shared_elements[512];
+
+ const unsigned int id_in_row = threadIdx.x % SubWarpSizeV;
+ const unsigned int block_increment = blockDim.x * ((size_result - 1) / (gridDim.x * blockDim.x) + 1);
+ const unsigned int block_start = blockIdx.x * block_increment;
+ const unsigned int block_stop = min(block_start + block_increment, size_result);
+
+ for (unsigned int row = block_start + threadIdx.x / SubWarpSizeV;
+ row < block_stop;
+ row += blockDim.x / SubWarpSizeV)
+ {
+ NumericT dot_prod = NumericT(0);
+ unsigned int row_end = row_indices[row+1];
+ for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += SubWarpSizeV)
+ dot_prod += elements[i] * x[column_indices[i] * inc_x + start_x];
+
+ shared_elements[threadIdx.x] = dot_prod;
+ if (1 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 1];
+ if (2 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 2];
+ if (4 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 4];
+ if (8 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 8];
+ if (16 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 16];
+
+ if (id_in_row == 0)
+ AlphaBetaHandlerT::apply(result[row * inc_result + start_result], alpha, shared_elements[threadIdx.x], beta);
+ }
+}
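The chain of threadIdx.x ^ 1 / ^ 2 / ... / ^ 16 additions in compressed_matrix_vec_mul_kernel is a butterfly reduction within each sub-warp: after log2(SubWarpSizeV) steps every lane holds the complete per-row sum, so lane 0 can write the result. A tiny host-side simulation of the pattern (illustrative only, not ViennaCL code):

#include <cstdio>

int main()
{
  const int SubWarpSize = 8;
  double lane[SubWarpSize] = {1, 2, 3, 4, 5, 6, 7, 8};  // partial dot products, one per lane

  // same XOR-butterfly as in compressed_matrix_vec_mul_kernel, with all
  // lanes stepping together (tmp[] models the lockstep read-before-write):
  for (int stride = 1; stride < SubWarpSize; stride *= 2)
  {
    double tmp[SubWarpSize];
    for (int t = 0; t < SubWarpSize; ++t)
      tmp[t] = lane[t] + lane[t ^ stride];
    for (int t = 0; t < SubWarpSize; ++t)
      lane[t] = tmp[t];
  }

  std::printf("%f\n", lane[0]);  // every lane now holds 36, the full row sum
  return 0;
}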
+
+
+template<typename AlphaBetaHandlerT, typename NumericT>
+__global__ void compressed_matrix_vec_mul_adaptive_kernel(
+ const unsigned int * row_indices,
+ const unsigned int * column_indices,
+ const unsigned int * row_blocks,
+ const NumericT * elements,
+ unsigned int num_blocks,
+ const NumericT * x,
+ unsigned int start_x,
+ unsigned int inc_x,
+ NumericT alpha,
+ NumericT * result,
+ unsigned int start_result,
+ unsigned int inc_result,
+ unsigned int size_result,
+ NumericT beta)
+{
+ __shared__ NumericT shared_elements[1024];
+
+ for (unsigned int block_id = blockIdx.x; block_id < num_blocks; block_id += gridDim.x)
+ {
+ unsigned int row_start = row_blocks[block_id];
+ unsigned int row_stop = row_blocks[block_id + 1];
+ unsigned int element_start = row_indices[row_start];
+ unsigned int element_stop = row_indices[row_stop];
+ unsigned int rows_to_process = row_stop - row_start;
+
+ if (rows_to_process > 1) // CSR stream with one thread per row
+ {
+ // load to shared buffer:
+ for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
+ shared_elements[i - element_start] = elements[i] * x[column_indices[i] * inc_x + start_x];
+
+ __syncthreads();
+
+ // use one thread per row to sum:
+ for (unsigned int row = row_start + threadIdx.x; row < row_stop; row += blockDim.x)
+ {
+ NumericT dot_prod = 0;
+ unsigned int thread_row_start = row_indices[row] - element_start;
+ unsigned int thread_row_stop = row_indices[row + 1] - element_start;
+ for (unsigned int i = thread_row_start; i < thread_row_stop; ++i)
+ dot_prod += shared_elements[i];
+ AlphaBetaHandlerT::apply(result[row * inc_result + start_result], alpha, dot_prod, beta);
+ }
+ }
+ // TODO here: Consider CSR vector for two to four rows (cf. OpenCL implementation. Experience on Fermi suggests that this may not be necessary)
+ else // CSR vector for a single row
+ {
+ // load and sum to shared buffer:
+ shared_elements[threadIdx.x] = 0;
+ for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
+ shared_elements[threadIdx.x] += elements[i] * x[column_indices[i] * inc_x + start_x];
+
+ // reduction to obtain final result
+ for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ shared_elements[threadIdx.x] += shared_elements[threadIdx.x+stride];
+ }
+
+ if (threadIdx.x == 0)
+ AlphaBetaHandlerT::apply(result[row_start * inc_result + start_result], alpha, shared_elements[0], beta);
+ }
+
+ __syncthreads(); // avoid race conditions
+ }
+}
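The adaptive kernel above relies on an invariant of the row_blocks array (mat.handle3()): every block spanning more than one row fits its nonzeros into the 1024-entry shared buffer, and an oversized row gets a block of its own so the CSR-vector branch can accumulate it without staging. ViennaCL builds row_blocks inside compressed_matrix; the sketch below only illustrates that invariant and is not the library's actual construction code:

// Illustrative sketch (not ViennaCL code): group CSR rows into blocks of at
// most shared_size nonzeros, giving any longer row a block of its own.
#include <vector>
#include <cstddef>

std::vector<std::size_t> build_row_blocks(std::vector<std::size_t> const & row_ptr,
                                          std::size_t shared_size /* e.g. 1024 */)
{
  std::vector<std::size_t> row_blocks(1, 0);
  std::size_t nnz_in_block = 0;
  for (std::size_t row = 0; row + 1 < row_ptr.size(); ++row)
  {
    std::size_t nnz_row = row_ptr[row + 1] - row_ptr[row];
    if (nnz_in_block + nnz_row > shared_size && nnz_in_block > 0)
    {
      row_blocks.push_back(row);     // close the current block before this row
      nnz_in_block = 0;
    }
    nnz_in_block += nnz_row;
    if (nnz_row > shared_size)       // a single very long row gets its own block
    {
      row_blocks.push_back(row + 1);
      nnz_in_block = 0;
    }
  }
  if (row_blocks.back() != row_ptr.size() - 1)
    row_blocks.push_back(row_ptr.size() - 1);  // close the final block
  return row_blocks;
}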
+
+
+
+
+/** @brief Carries out matrix-vector multiplication with a compressed_matrix
+*
+* Implementation of the convenience expression result = prod(mat, vec);
+*
+* @param mat The matrix
+* @param vec The vector
+* @param result The result vector
+*/
+template<class NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & mat,
+ const viennacl::vector_base<NumericT> & vec,
+ NumericT alpha,
+ viennacl::vector_base<NumericT> & result,
+ NumericT beta)
+{
+ static bool first = true;
+ static bool is_maxwell = false;
+
+ // check whether the CUDA device is from the Maxwell family.
+ // Only run once, because the query to the backend takes about the same time as a kernel launch (~15us), thus being too expensive to query each time.
+ //
+ // Note: This might result in non-optimal kernels being selected if multiple Maxwell- and non-Maxwell GPUs are available in the system and devices are switched at runtime.
+  // However, this situation is certainly rare, hence the benefits of this singleton outweigh the disadvantages encountered in such a corner case.
+ if (first)
+ {
+ cudaDeviceProp prop;
+ int device_index = 0;
+
+ cudaError_t err_flag = cudaGetDevice(&device_index);
+ if (err_flag == cudaSuccess)
+ {
+ err_flag = cudaGetDeviceProperties(&prop, device_index);
+ if (err_flag == cudaSuccess && prop.major >= 5)
+ is_maxwell = true;
+ }
+ first = false;
+ }
+
+ if (is_maxwell && double(mat.nnz()) / double(mat.size1()) > 6.4) // less than 10% of threads expected to idle
+ {
+ if (alpha < NumericT(1) || alpha > NumericT(1) || beta < 0 || beta > 0)
+ compressed_matrix_vec_mul_kernel<8, detail::spmv_alpha_beta, NumericT><<<512, 256>>>( // experience on a GTX 750 Ti suggests that 8 is a substantially better choice here
+ viennacl::cuda_arg<unsigned int>(mat.handle1()),
+ viennacl::cuda_arg<unsigned int>(mat.handle2()),
+ viennacl::cuda_arg<NumericT>(mat.handle()),
+ viennacl::cuda_arg(vec),
+ static_cast<unsigned int>(vec.start()),
+ static_cast<unsigned int>(vec.stride()),
+ alpha,
+ viennacl::cuda_arg(result),
+ static_cast<unsigned int>(result.start()),
+ static_cast<unsigned int>(result.stride()),
+ static_cast<unsigned int>(result.size()),
+ beta
+ );
+ else
+ compressed_matrix_vec_mul_kernel<8, detail::spmv_pure, NumericT><<<512, 256>>>( // experience on a GTX 750 Ti suggests that 8 is a substantially better choice here
+ viennacl::cuda_arg<unsigned int>(mat.handle1()),
+ viennacl::cuda_arg<unsigned int>(mat.handle2()),
+ viennacl::cuda_arg<NumericT>(mat.handle()),
+ viennacl::cuda_arg(vec),
+ static_cast<unsigned int>(vec.start()),
+ static_cast<unsigned int>(vec.stride()),
+ alpha,
+ viennacl::cuda_arg(result),
+ static_cast<unsigned int>(result.start()),
+ static_cast<unsigned int>(result.stride()),
+ static_cast<unsigned int>(result.size()),
+ beta
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_vec_mul_kernel");
+ }
+ else if (!is_maxwell && double(mat.nnz()) / double(mat.size1()) > 12.0) // less than 25% of threads expected to idle
+ {
+ if (alpha < NumericT(1) || alpha > NumericT(1) || beta < 0 || beta > 0)
+ compressed_matrix_vec_mul_kernel<16, detail::spmv_alpha_beta, NumericT><<<512, 256>>>( // Fermi and Kepler prefer 16 threads per row (half-warp)
+ viennacl::cuda_arg<unsigned int>(mat.handle1()),
+ viennacl::cuda_arg<unsigned int>(mat.handle2()),
+ viennacl::cuda_arg<NumericT>(mat.handle()),
+ viennacl::cuda_arg(vec),
+ static_cast<unsigned int>(vec.start()),
+ static_cast<unsigned int>(vec.stride()),
+ alpha,
+ viennacl::cuda_arg(result),
+ static_cast<unsigned int>(result.start()),
+ static_cast<unsigned int>(result.stride()),
+ static_cast<unsigned int>(result.size()),
+ beta
+ );
+ else
+ compressed_matrix_vec_mul_kernel<16, detail::spmv_pure, NumericT><<<512, 256>>>( // Fermi and Kepler prefer 16 threads per row (half-warp)
+ viennacl::cuda_arg<unsigned int>(mat.handle1()),
+ viennacl::cuda_arg<unsigned int>(mat.handle2()),
+ viennacl::cuda_arg<NumericT>(mat.handle()),
+ viennacl::cuda_arg(vec),
+ static_cast<unsigned int>(vec.start()),
+ static_cast<unsigned int>(vec.stride()),
+ alpha,
+ viennacl::cuda_arg(result),
+ static_cast<unsigned int>(result.start()),
+ static_cast<unsigned int>(result.stride()),
+ static_cast<unsigned int>(result.size()),
+ beta
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_vec_mul_kernel");
+ }
+ else
+ {
+ if (alpha < NumericT(1) || alpha > NumericT(1) || beta < 0 || beta > 0)
+ compressed_matrix_vec_mul_adaptive_kernel<detail::spmv_alpha_beta><<<512, 256>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
+ viennacl::cuda_arg<unsigned int>(mat.handle2()),
+ viennacl::cuda_arg<unsigned int>(mat.handle3()),
+ viennacl::cuda_arg<NumericT>(mat.handle()),
+ static_cast<unsigned int>(mat.blocks1()),
+ viennacl::cuda_arg(vec),
+ static_cast<unsigned int>(vec.start()),
+ static_cast<unsigned int>(vec.stride()),
+ alpha,
+ viennacl::cuda_arg(result),
+ static_cast<unsigned int>(result.start()),
+ static_cast<unsigned int>(result.stride()),
+ static_cast<unsigned int>(result.size()),
+ beta
+ );
+ else
+ compressed_matrix_vec_mul_adaptive_kernel<detail::spmv_pure><<<512, 256>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
+ viennacl::cuda_arg<unsigned int>(mat.handle2()),
+ viennacl::cuda_arg<unsigned int>(mat.handle3()),
+ viennacl::cuda_arg<NumericT>(mat.handle()),
+ static_cast<unsigned int>(mat.blocks1()),
+ viennacl::cuda_arg(vec),
+ static_cast<unsigned int>(vec.start()),
+ static_cast<unsigned int>(vec.stride()),
+ alpha,
+ viennacl::cuda_arg(result),
+ static_cast<unsigned int>(result.start()),
+ static_cast<unsigned int>(result.stride()),
+ static_cast<unsigned int>(result.size()),
+ beta
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_vec_mul_adaptive_kernel");
+ }
+}
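prod_impl() above is the CUDA backend behind the user-level expression y = viennacl::linalg::prod(A, x) for compressed_matrix: it uses 8 threads per row on Maxwell-class devices when the matrix averages more than 6.4 nonzeros per row, 16 threads per row on older devices above 12 nonzeros per row, and the adaptive row-block kernel otherwise. A hedged usage sketch (CUDA backend assumed enabled; the matrix contents are invented for illustration):

#include <map>
#include <vector>
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/prod.hpp"

int main()
{
  typedef double ScalarType;

  // 3x3 sparse matrix assembled on the host as one std::map per row
  std::vector<std::map<unsigned int, ScalarType> > cpu_A(3);
  cpu_A[0][0] =  2.0; cpu_A[0][2] = -1.0;
  cpu_A[1][1] =  3.0;
  cpu_A[2][0] = -1.0; cpu_A[2][2] =  4.0;

  viennacl::compressed_matrix<ScalarType> A(3, 3);
  viennacl::copy(cpu_A, A);

  viennacl::vector<ScalarType> x = viennacl::scalar_vector<ScalarType>(3, 1.0);
  viennacl::vector<ScalarType> y(3);

  y = viennacl::linalg::prod(A, x);   // dispatches to one of the kernels above
  return 0;
}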
+
+/** @brief Helper struct for accessing an element of a row- or column-major matrix.
+ *
+ * @tparam LayoutT   The layout tag: either viennacl::row_major or viennacl::column_major
+ */
+template<typename LayoutT>
+struct mat_mult_matrix_index
+{
+ static __device__ unsigned int apply(unsigned int i, unsigned int j,
+ unsigned int row_start, unsigned int row_inc,
+ unsigned int col_start, unsigned int col_inc,
+ unsigned int internal_rows, unsigned int internal_cols)
+ {
+ return (row_start + i * row_inc) * internal_cols + col_start + j * col_inc;
+ }
+};
+
+/** \cond */
+template<>
+struct mat_mult_matrix_index<viennacl::column_major>
+{
+ static __device__ unsigned int apply(unsigned int i, unsigned int j,
+ unsigned int row_start, unsigned int row_inc,
+ unsigned int col_start, unsigned int col_inc,
+ unsigned int internal_rows, unsigned int internal_cols)
+ {
+ return (row_start + i * row_inc) + (col_start + j * col_inc) * internal_rows;
+ }
+};
+/** \endcond */
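mat_mult_matrix_index simply linearizes an (i, j) coordinate of a possibly strided sub-matrix: row-major storage uses (row_start + i*row_inc) * internal_cols + col_start + j*col_inc, and column-major storage swaps the roles of the two dimensions. A small host-side check of the two formulas, with values chosen purely for illustration:

#include <cassert>

// host-side copies of the two device formulas above, for illustration only
inline unsigned int row_major_index(unsigned int i, unsigned int j,
                                    unsigned int row_start, unsigned int row_inc,
                                    unsigned int col_start, unsigned int col_inc,
                                    unsigned int internal_cols)
{
  return (row_start + i * row_inc) * internal_cols + col_start + j * col_inc;
}

inline unsigned int col_major_index(unsigned int i, unsigned int j,
                                    unsigned int row_start, unsigned int row_inc,
                                    unsigned int col_start, unsigned int col_inc,
                                    unsigned int internal_rows)
{
  return (row_start + i * row_inc) + (col_start + j * col_inc) * internal_rows;
}

int main()
{
  // element (i=1, j=2) of an unstrided matrix padded to 8x16 internal storage:
  assert(row_major_index(1, 2, 0, 1, 0, 1, 16) == 1 * 16 + 2);  // 18
  assert(col_major_index(1, 2, 0, 1, 0, 1, 8)  == 1 + 2 * 8);   // 17
  return 0;
}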
+
+
+template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
+__global__ void compressed_matrix_d_mat_mul_kernel(
+ const unsigned int * sp_mat_row_indices,
+ const unsigned int * sp_mat_col_indices,
+ const NumericT * sp_mat_elements,
+ const NumericT * d_mat,
+ unsigned int d_mat_row_start,
+ unsigned int d_mat_col_start,
+ unsigned int d_mat_row_inc,
+ unsigned int d_mat_col_inc,
+ unsigned int d_mat_row_size,
+ unsigned int d_mat_col_size,
+ unsigned int d_mat_internal_rows,
+ unsigned int d_mat_internal_cols,
+ NumericT * result,
+ unsigned int result_row_start,
+ unsigned int result_col_start,
+ unsigned int result_row_inc,
+ unsigned int result_col_inc,
+ unsigned int result_row_size,
+ unsigned int result_col_size,
+ unsigned int result_internal_rows,
+ unsigned int result_internal_cols)
+{
+ for (unsigned int row = blockIdx.x; row < result_row_size; row += gridDim.x)
+ {
+ unsigned int row_start = sp_mat_row_indices[row];
+ unsigned int row_end = sp_mat_row_indices[row+1];
+
+ for ( unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
+ {
+ NumericT r = 0;
+
+ for (unsigned int k = row_start; k < row_end; k++)
+ {
+ unsigned int j = sp_mat_col_indices[k];
+ NumericT x = sp_mat_elements[k];
+ NumericT y = d_mat[ DMatIndexT::apply(j, col,
+ d_mat_row_start, d_mat_row_inc,
+ d_mat_col_start, d_mat_col_inc,
+ d_mat_internal_rows, d_mat_internal_cols) ];
+
+ r += x * y;
+ }
+
+ result[ResultIndexT::apply(row, col,
+ result_row_start, result_row_inc,
+ result_col_start, result_col_inc,
+ result_internal_rows, result_internal_cols)] = r;
+ }
+ }
+}
+
+
+/** @brief Carries out sparse_matrix-dense_matrix multiplication, where the first (sparse) matrix is a compressed_matrix
+*
+* Implementation of the convenience expression result = prod(sp_mat, d_mat);
+*
+* @param sp_mat The sparse matrix
+* @param d_mat The dense matrix
+* @param result The result matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_mat,
+ const viennacl::matrix_base<NumericT> & d_mat,
+ viennacl::matrix_base<NumericT> & result)
+{
+ if (d_mat.row_major() && result.row_major())
+ {
+ compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
+ (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
+ viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
+ viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+
+ viennacl::cuda_arg(d_mat),
+ static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
+ static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
+ static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
+
+ viennacl::cuda_arg(result),
+ static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
+ static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
+ static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
+ }
+ else if (d_mat.row_major() && !result.row_major())
+ {
+ compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
+ (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
+ viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
+ viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+
+ viennacl::cuda_arg(d_mat),
+ static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
+ static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
+ static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
+
+ viennacl::cuda_arg(result),
+ static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
+ static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
+ static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
+ }
+ else if (!d_mat.row_major() && result.row_major())
+ {
+ compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
+ (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
+ viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
+ viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+
+ viennacl::cuda_arg(d_mat),
+ static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
+ static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
+ static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
+
+ viennacl::cuda_arg(result),
+ static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
+ static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
+ static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
+ }
+ else
+ {
+ compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
+ (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
+ viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
+ viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+
+ viennacl::cuda_arg(d_mat),
+ static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
+ static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
+ static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
+
+ viennacl::cuda_arg(result),
+ static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
+ static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
+ static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
+ }
+}
+
+
+template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
+__global__ void compressed_matrix_d_tr_mat_mul_kernel(
+ const unsigned int * sp_mat_row_indices,
+ const unsigned int * sp_mat_col_indices,
+ const NumericT * sp_mat_elements,
+ const NumericT * d_mat,
+ unsigned int d_mat_row_start,
+ unsigned int d_mat_col_start,
+ unsigned int d_mat_row_inc,
+ unsigned int d_mat_col_inc,
+ unsigned int d_mat_row_size,
+ unsigned int d_mat_col_size,
+ unsigned int d_mat_internal_rows,
+ unsigned int d_mat_internal_cols,
+ NumericT * result,
+ unsigned int result_row_start,
+ unsigned int result_col_start,
+ unsigned int result_row_inc,
+ unsigned int result_col_inc,
+ unsigned int result_row_size,
+ unsigned int result_col_size,
+ unsigned int result_internal_rows,
+ unsigned int result_internal_cols)
+{
+ for (unsigned int row = blockIdx.x; row < result_row_size; row += gridDim.x)
+ {
+ unsigned int row_start = sp_mat_row_indices[row];
+ unsigned int row_end = sp_mat_row_indices[row+1];
+
+ for ( unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
+ {
+ NumericT r = 0;
+
+ for (unsigned int k = row_start; k < row_end; k++)
+ {
+ unsigned int j = sp_mat_col_indices[k];
+ NumericT x = sp_mat_elements[k];
+ NumericT y = d_mat[ DMatIndexT::apply(col, j,
+ d_mat_row_start, d_mat_row_inc,
+ d_mat_col_start, d_mat_col_inc,
+ d_mat_internal_rows, d_mat_internal_cols) ];
+
+ r += x * y;
+ }
+
+ result [ ResultIndexT::apply(row, col,
+ result_row_start, result_row_inc,
+ result_col_start, result_col_inc,
+ result_internal_rows, result_internal_cols) ] = r;
+ }
+ }
+
+}
+
+/** @brief Carries out sparse_matrix-dense_matrix multiplication, where the first matrix is a compressed_matrix
+* and the second dense matrix enters in transposed form
+*
+* Implementation of the convenience expression result = prod(sp_mat, trans(d_mat));
+*
+* @param sp_mat The sparse matrix
+* @param d_mat The transposed dense matrix proxy
+* @param result The result matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_mat,
+ const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
+ const viennacl::matrix_base<NumericT>,
+ viennacl::op_trans > & d_mat,
+ viennacl::matrix_base<NumericT> & result)
+{
+
+ if (d_mat.lhs().row_major() && result.row_major())
+ {
+ compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
+ (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
+ viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
+ viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+
+ viennacl::cuda_arg(d_mat.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
+ static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
+
+ viennacl::cuda_arg(result),
+ static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
+ static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
+ static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
+ }
+ else if (d_mat.lhs().row_major() && !result.row_major())
+ {
+ compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
+ (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
+ viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
+ viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+
+ viennacl::cuda_arg(d_mat.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
+ static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
+
+ viennacl::cuda_arg(result),
+ static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
+ static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
+ static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
+ }
+ else if (!d_mat.lhs().row_major() && result.row_major())
+ {
+ compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
+ (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
+ viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
+ viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+
+ viennacl::cuda_arg(d_mat.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
+ static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
+
+ viennacl::cuda_arg(result),
+ static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
+ static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
+ static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
+ }
+ else
+ {
+ compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
+ (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
+ viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
+ viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+
+ viennacl::cuda_arg(d_mat.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
+ static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
+
+ viennacl::cuda_arg(result),
+ static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
+ static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
+ static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
+ }
+}
+
+
+//
+// triangular solves for compressed_matrix
+//
+
+template<typename NumericT>
+__global__ void compressed_matrix_diagonal_kernel(
+ const unsigned int * row_indices,
+ const unsigned int * column_indices,
+ const NumericT * elements,
+ NumericT * result,
+ unsigned int size)
+{
+ for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
+ row < size;
+ row += gridDim.x * blockDim.x)
+ {
+ NumericT diag = NumericT(0);
+ unsigned int row_end = row_indices[row+1];
+ for (unsigned int i = row_indices[row]; i < row_end; ++i)
+ {
+ unsigned int col_index = column_indices[i];
+ if (col_index == row)
+ {
+ diag = elements[i];
+ break;
+ }
+ }
+ result[row] = diag;
+ }
+}
+
+
+/** @brief Carries out triangular inplace solves
+*
+* @param mat The matrix
+* @param vec The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename SparseMatrixT, typename NumericT>
+typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
+inplace_solve(const SparseMatrixT & mat,
+ viennacl::vector_base<NumericT> & vec,
+ viennacl::linalg::unit_lower_tag)
+{
+ csr_unit_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
+ viennacl::cuda_arg<unsigned int>(mat.handle2()),
+ viennacl::cuda_arg<NumericT>(mat.handle()),
+ viennacl::cuda_arg(vec),
+ static_cast<unsigned int>(mat.size1())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_forward_kernel");
+}
+
+
+/** @brief Carries out triangular inplace solves
+*
+* @param mat The matrix
+* @param vec The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename SparseMatrixT, typename NumericT>
+typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
+inplace_solve(const SparseMatrixT & mat,
+ viennacl::vector_base<NumericT> & vec,
+ viennacl::linalg::lower_tag)
+{
+ csr_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
+ viennacl::cuda_arg<unsigned int>(mat.handle2()),
+ viennacl::cuda_arg<NumericT>(mat.handle()),
+ viennacl::cuda_arg(vec),
+ static_cast<unsigned int>(mat.size1())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_forward_kernel");
+}
+
+
+
+/** @brief Carries out triangular inplace solves
+*
+* @param mat The matrix
+* @param vec The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename SparseMatrixT, typename NumericT>
+typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
+inplace_solve(const SparseMatrixT & mat,
+ viennacl::vector_base<NumericT> & vec,
+ viennacl::linalg::unit_upper_tag)
+{
+ csr_unit_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
+ viennacl::cuda_arg<unsigned int>(mat.handle2()),
+ viennacl::cuda_arg<NumericT>(mat.handle()),
+ viennacl::cuda_arg(vec),
+ static_cast<unsigned int>(mat.size1())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_backward_kernel");
+}
+
+
+/** @brief Carries out triangular inplace solves
+*
+* @param mat The matrix
+* @param vec The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename SparseMatrixT, typename NumericT>
+typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
+inplace_solve(const SparseMatrixT & mat,
+ viennacl::vector_base<NumericT> & vec,
+ viennacl::linalg::upper_tag)
+{
+ csr_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
+ viennacl::cuda_arg<unsigned int>(mat.handle2()),
+ viennacl::cuda_arg<NumericT>(mat.handle()),
+ viennacl::cuda_arg(vec),
+ static_cast<unsigned int>(mat.size1())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_backward_kernel");
+}
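The four inplace_solve() overloads above are reached through the generic viennacl::linalg::inplace_solve(mat, rhs, tag) front end, where the tag selects unit_lower/lower/unit_upper/upper substitution. A hedged usage sketch for the lower-triangular case; the header choice and the values are illustrative:

#include <map>
#include <vector>
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/sparse_matrix_operations.hpp"

int main()
{
  typedef double ScalarType;

  // lower triangular 3x3 system L * x = b, stored as CSR
  std::vector<std::map<unsigned int, ScalarType> > cpu_L(3);
  cpu_L[0][0] = 2.0;
  cpu_L[1][0] = 1.0; cpu_L[1][1] = 3.0;
  cpu_L[2][1] = 1.0; cpu_L[2][2] = 4.0;

  viennacl::compressed_matrix<ScalarType> L(3, 3);
  viennacl::copy(cpu_L, L);

  viennacl::vector<ScalarType> b = viennacl::scalar_vector<ScalarType>(3, 1.0);

  // forward substitution; b is overwritten with the solution x
  viennacl::linalg::inplace_solve(L, b, viennacl::linalg::lower_tag());
  return 0;
}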
+
+
+
+// transposed
+
+/** @brief Carries out triangular inplace solves
+*
+* @param mat The matrix
+* @param vec The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename SparseMatrixT, typename NumericT>
+typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
+inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
+ viennacl::vector_base<NumericT> & vec,
+ viennacl::linalg::unit_lower_tag)
+{
+ csr_trans_unit_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
+ viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
+ viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
+ viennacl::cuda_arg(vec),
+ static_cast<unsigned int>(mat.lhs().size1())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_forward_kernel");
+}
+
+
+/** @brief Carries out triangular inplace solves
+*
+* @param mat The matrix
+* @param vec The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename SparseMatrixT, typename NumericT>
+typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
+inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
+ viennacl::vector_base<NumericT> & vec,
+ viennacl::linalg::lower_tag)
+{
+ viennacl::vector<NumericT> diagonal(vec.size());
+
+ compressed_matrix_diagonal_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
+ viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
+ viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
+ viennacl::cuda_arg(diagonal),
+ static_cast<unsigned int>(mat.size1())
+ );
+
+ csr_trans_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
+ viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
+ viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
+ viennacl::cuda_arg(diagonal),
+ viennacl::cuda_arg(vec),
+ static_cast<unsigned int>(mat.lhs().size1())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_forward_kernel");
+}
+
+
+/** @brief Carries out triangular inplace solves
+*
+* @param mat The matrix
+* @param vec The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename SparseMatrixT, typename NumericT>
+typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
+inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
+ viennacl::vector_base<NumericT> & vec,
+ viennacl::linalg::unit_upper_tag)
+{
+ csr_trans_unit_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
+ viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
+ viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
+ viennacl::cuda_arg(vec),
+ static_cast<unsigned int>(mat.lhs().size1())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_backward_kernel");
+}
+
+
+/** @brief Carries out triangular inplace solves
+*
+* @param mat The matrix
+* @param vec The vector holding the right hand side. Is overwritten by the solution.
+*/
+template<typename SparseMatrixT, typename NumericT>
+typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
+inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
+ viennacl::vector_base<NumericT> & vec,
+ viennacl::linalg::upper_tag)
+{
+ viennacl::vector<NumericT> diagonal(vec.size());
+
+ compressed_matrix_diagonal_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
+ viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
+ viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
+ viennacl::cuda_arg(diagonal),
+ static_cast<unsigned int>(mat.size1())
+ );
+
+ csr_trans_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
+ viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
+ viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
+ viennacl::cuda_arg(diagonal),
+ viennacl::cuda_arg(vec),
+ static_cast<unsigned int>(mat.lhs().size1())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_backward_kernel");
+}
+
+namespace detail
+{
+ //
+ // block solves
+ //
+ template<typename NumericT, unsigned int AlignmentV>
+ void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
+ const compressed_matrix<NumericT, AlignmentV>,
+ op_trans> & L,
+ viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
+ vector_base<NumericT> const & /* L_diagonal */, //ignored
+ vector_base<NumericT> & vec,
+ viennacl::linalg::unit_lower_tag)
+ {
+ csr_block_trans_unit_lu_forward<<<num_blocks, 128>>>(viennacl::cuda_arg<unsigned int>(L.lhs().handle1()),
+ viennacl::cuda_arg<unsigned int>(L.lhs().handle2()),
+ viennacl::cuda_arg<NumericT>(L.lhs().handle()),
+ viennacl::cuda_arg<unsigned int>(block_indices),
+ viennacl::cuda_arg(vec),
+ static_cast<unsigned int>(L.lhs().size1())
+ );
+ }
+
+
+ template<typename NumericT, unsigned int AlignmentV>
+ void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
+ const compressed_matrix<NumericT, AlignmentV>,
+ op_trans> & U,
+ viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
+ vector_base<NumericT> const & U_diagonal,
+ vector_base<NumericT> & vec,
+ viennacl::linalg::upper_tag)
+ {
+ csr_block_trans_lu_backward<<<num_blocks, 128>>>(viennacl::cuda_arg<unsigned int>(U.lhs().handle1()),
+ viennacl::cuda_arg<unsigned int>(U.lhs().handle2()),
+ viennacl::cuda_arg<NumericT>(U.lhs().handle()),
+ viennacl::cuda_arg(U_diagonal),
+ viennacl::cuda_arg<unsigned int>(block_indices),
+ viennacl::cuda_arg(vec),
+ static_cast<unsigned int>(U.lhs().size1())
+ );
+ }
+
+
+}
+
+
+//
+// Compressed Compressed Matrix
+//
+
+template<typename NumericT>
+__global__ void compressed_compressed_matrix_vec_mul_kernel(
+ const unsigned int * row_jumper,
+ const unsigned int * row_indices,
+ const unsigned int * column_indices,
+ const NumericT * elements,
+ unsigned int nonzero_rows,
+ const NumericT * x,
+ unsigned int start_x,
+ unsigned int inc_x,
+ NumericT alpha,
+ NumericT * result,
+ unsigned int start_result,
+ unsigned int inc_result,
+ unsigned int size_result,
+ NumericT beta)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+ i < nonzero_rows;
+ i += gridDim.x * blockDim.x)
+ {
+ NumericT dot_prod = NumericT(0);
+ unsigned int row_end = row_jumper[i+1];
+ for (unsigned int j = row_jumper[i]; j < row_end; ++j)
+ dot_prod += elements[j] * x[column_indices[j] * inc_x + start_x];
+
+ unsigned int index = row_indices[i] * inc_result + start_result;
+ if (beta != 0) result[index] += alpha * dot_prod;
+ else result[index] = alpha * dot_prod;
+ }
+}
+
+
+/** @brief Carries out matrix-vector multiplication with a compressed_compressed_matrix
+*
+* Implementation of the convenience expression result = prod(mat, vec);
+*
+* @param mat    The matrix
+* @param vec    The vector
+* @param alpha  Scaling factor for mat * vec
+* @param result The result vector
+* @param beta   Scaling factor applied to the previous contents of result
+*/
+template<typename NumericT>
+void prod_impl(const viennacl::compressed_compressed_matrix<NumericT> & mat,
+ const viennacl::vector_base<NumericT> & vec,
+ NumericT alpha,
+ viennacl::vector_base<NumericT> & result,
+ NumericT beta)
+{
+ if (beta < 0 || beta > 0)
+ viennacl::linalg::cuda::av(result, result, beta, 1, false, false);
+ else
+ result.clear();
+
+ compressed_compressed_matrix_vec_mul_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
+ viennacl::cuda_arg<unsigned int>(mat.handle3()),
+ viennacl::cuda_arg<unsigned int>(mat.handle2()),
+ viennacl::cuda_arg<NumericT>(mat.handle()),
+ static_cast<unsigned int>(mat.nnz1()),
+ viennacl::cuda_arg(vec),
+ static_cast<unsigned int>(vec.start()),
+ static_cast<unsigned int>(vec.stride()),
+ alpha,
+ viennacl::cuda_arg(result),
+ static_cast<unsigned int>(result.start()),
+ static_cast<unsigned int>(result.stride()),
+ static_cast<unsigned int>(result.size()),
+ beta
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_compressed_matrix_vec_mul_kernel");
+}
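A minimal usage sketch of the convenience expression mentioned in the docstring above (illustrative only, not part of the patch; assumes the matrix and vector have been set up and transferred to the device elsewhere):

  #include "viennacl/compressed_compressed_matrix.hpp"
  #include "viennacl/vector.hpp"
  #include "viennacl/linalg/prod.hpp"

  // y = A * x for a compressed_compressed_matrix A (only rows with nonzeros are stored);
  // on CUDA builds this dispatches to compressed_compressed_matrix_vec_mul_kernel above.
  viennacl::vector<double> ccs_times_vector(viennacl::compressed_compressed_matrix<double> const & A,
                                            viennacl::vector<double> const & x)
  {
    viennacl::vector<double> y(A.size1());
    y = viennacl::linalg::prod(A, x);
    return y;
  }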
+
+//
+// Coordinate Matrix
+//
+
+
+namespace detail
+{
+
+ template<typename NumericT>
+ __global__ void coo_row_info_extractor( const unsigned int * coords, //(row_index, column_index)
+ const NumericT * elements,
+ const unsigned int * group_boundaries,
+ NumericT * result,
+ unsigned int option)
+ {
+ __shared__ unsigned int shared_rows[128];
+ __shared__ NumericT inter_results[128];
+
+ uint2 tmp;
+ NumericT val;
+ unsigned int last_index = blockDim.x - 1;
+ unsigned int group_start = group_boundaries[blockIdx.x];
+ unsigned int group_end = group_boundaries[blockIdx.x + 1];
+ unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0; // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
+
+ unsigned int local_index = 0;
+
+ for (unsigned int k = 0; k < k_end; ++k)
+ {
+ local_index = group_start + k * blockDim.x + threadIdx.x;
+
+ tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
+ val = (local_index < group_end && (option != 3 || tmp.x == tmp.y) ) ? elements[local_index] : 0;
+
+ //check for carry from previous loop run:
+ if (threadIdx.x == 0 && k > 0)
+ {
+ if (tmp.x == shared_rows[last_index])
+ {
+ switch (option)
+ {
+ case 0: //inf-norm
+ case 3: //diagonal entry
+ val = max(val, fabs(inter_results[last_index]));
+ break;
+
+ case 1: //1-norm
+ val = fabs(val) + inter_results[last_index];
+ break;
+
+ case 2: //2-norm
+ val = sqrt(val * val + inter_results[last_index]);
+ break;
+
+ default:
+ break;
+ }
+ }
+ else
+ {
+ switch (option)
+ {
+ case 0: //inf-norm
+ case 1: //1-norm
+ case 3: //diagonal entry
+ result[shared_rows[last_index]] = inter_results[last_index];
+ break;
+
+ case 2: //2-norm
+ result[shared_rows[last_index]] = sqrt(inter_results[last_index]);
+ default:
+ break;
+ }
+ }
+ }
+
+ //segmented parallel reduction begin
+ __syncthreads();
+ shared_rows[threadIdx.x] = tmp.x;
+ switch (option)
+ {
+ case 0:
+ case 3:
+ inter_results[threadIdx.x] = val;
+ break;
+ case 1:
+ inter_results[threadIdx.x] = fabs(val);
+ break;
+ case 2:
+ inter_results[threadIdx.x] = val * val;
+ default:
+ break;
+ }
+ __syncthreads();
+
+ for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
+ {
+ NumericT left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
+ __syncthreads();
+ switch (option)
+ {
+ case 0: //inf-norm
+ case 3: //diagonal entry
+ inter_results[threadIdx.x] = max(inter_results[threadIdx.x], left);
+ break;
+
+ case 1: //1-norm
+ inter_results[threadIdx.x] += left;
+ break;
+
+ case 2: //2-norm
+ inter_results[threadIdx.x] += left;
+ break;
+
+ default:
+ break;
+ }
+ __syncthreads();
+ }
+ //segmented parallel reduction end
+
+ if (threadIdx.x != last_index &&
+ shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1] &&
+ inter_results[threadIdx.x] != 0)
+ {
+ result[tmp.x] = (option == 2) ? sqrt(inter_results[threadIdx.x]) : inter_results[threadIdx.x];
+ }
+
+ __syncthreads();
+ } //for k
+
+ if (local_index + 1 == group_end && inter_results[threadIdx.x] != 0)
+ result[tmp.x] = (option == 2) ? sqrt(inter_results[threadIdx.x]) : inter_results[threadIdx.x];
+ }
+
+ template<typename NumericT, unsigned int AlignmentV>
+ void row_info(coordinate_matrix<NumericT, AlignmentV> const & mat,
+ vector_base<NumericT> & vec,
+ viennacl::linalg::detail::row_info_types info_selector)
+ {
+ coo_row_info_extractor<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle12()),
+ viennacl::cuda_arg<NumericT>(mat.handle()),
+ viennacl::cuda_arg<unsigned int>(mat.handle3()),
+ viennacl::cuda_arg(vec),
+ static_cast<unsigned int>(info_selector)
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("coo_row_info_extractor");
+ }
+
+} //namespace detail
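As a plain-C++ cross check of what option 0 (the row-wise infinity norm) of the extractor above is intended to produce, a small host-side sketch; the function name and the pre-sized, zero-initialized result vector are assumptions made for illustration:

  #include <algorithm>
  #include <cmath>
  #include <cstddef>
  #include <vector>

  // Row-wise infinity norm of a COO matrix: result[i] = max over j of |a_ij| (sketch).
  void coo_row_inf_norm(std::vector<unsigned int> const & rows,
                        std::vector<double> const & vals,
                        std::vector<double> & result)   // pre-sized to #rows, zero-initialized
  {
    for (std::size_t k = 0; k < vals.size(); ++k)
      result[rows[k]] = std::max(result[rows[k]], std::abs(vals[k]));
  }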
+
+
+template<typename NumericT>
+__global__ void coordinate_matrix_vec_mul_kernel(const unsigned int * coords, //(row_index, column_index)
+ const NumericT * elements,
+ const unsigned int * group_boundaries,
+ const NumericT * x,
+ unsigned int start_x,
+ unsigned int inc_x,
+ NumericT alpha,
+ NumericT * result,
+ unsigned int start_result,
+ unsigned int inc_result,
+ NumericT beta)
+{
+ __shared__ unsigned int shared_rows[128];
+ __shared__ NumericT inter_results[128];
+
+ uint2 tmp;
+ NumericT val;
+ unsigned int group_start = group_boundaries[blockIdx.x];
+ unsigned int group_end = group_boundaries[blockIdx.x + 1];
+ unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0; // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
+
+ unsigned int local_index = 0;
+
+ for (unsigned int k = 0; k < k_end; ++k)
+ {
+ local_index = group_start + k * blockDim.x + threadIdx.x;
+
+ tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
+ val = (local_index < group_end) ? elements[local_index] * x[tmp.y * inc_x + start_x] : 0;
+
+ //check for carry from previous loop run:
+ if (threadIdx.x == 0 && k > 0)
+ {
+ if (tmp.x == shared_rows[blockDim.x-1])
+ val += inter_results[blockDim.x-1];
+ else if (beta != 0)
+ result[shared_rows[blockDim.x-1] * inc_result + start_result] += alpha * inter_results[blockDim.x-1];
+ else
+ result[shared_rows[blockDim.x-1] * inc_result + start_result] = alpha * inter_results[blockDim.x-1];
+ }
+
+ //segmented parallel reduction begin
+ __syncthreads();
+ shared_rows[threadIdx.x] = tmp.x;
+ inter_results[threadIdx.x] = val;
+ NumericT left = 0;
+ __syncthreads();
+
+ for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
+ {
+ left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
+ __syncthreads();
+ inter_results[threadIdx.x] += left;
+ __syncthreads();
+ }
+ //segmented parallel reduction end
+
+ if (local_index < group_end - 1 && threadIdx.x < blockDim.x-1 &&
+ shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
+ {
+ if (beta != 0) result[tmp.x * inc_result + start_result] += alpha * inter_results[threadIdx.x];
+ else result[tmp.x * inc_result + start_result] = alpha * inter_results[threadIdx.x];
+ }
+
+ __syncthreads();
+ } //for k
+
+ if (local_index + 1 == group_end) {
+ if (beta != 0) result[tmp.x * inc_result + start_result] += alpha * inter_results[threadIdx.x];
+ else result[tmp.x * inc_result + start_result] = alpha * inter_results[threadIdx.x];
+ }
+}
+
+
+/** @brief Carries out matrix-vector multiplication with a coordinate_matrix
+*
+* Implementation of the convenience expression result = prod(mat, vec);
+*
+* @param mat    The matrix
+* @param vec    The vector
+* @param alpha  Scaling factor for mat * vec
+* @param result The result vector
+* @param beta   Scaling factor applied to the previous contents of result
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & mat,
+ const viennacl::vector_base<NumericT> & vec,
+ NumericT alpha,
+ viennacl::vector_base<NumericT> & result,
+ NumericT beta)
+{
+ if (beta < 0 || beta > 0)
+ viennacl::linalg::cuda::av(result, result, beta, 1, false, false);
+ else
+ result.clear();
+
+ coordinate_matrix_vec_mul_kernel<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle12()),
+ viennacl::cuda_arg<NumericT>(mat.handle()),
+ viennacl::cuda_arg<unsigned int>(mat.handle3()),
+ viennacl::cuda_arg(vec),
+ static_cast<unsigned int>(vec.start()),
+ static_cast<unsigned int>(vec.stride()),
+ alpha,
+ viennacl::cuda_arg(result),
+ static_cast<unsigned int>(result.start()),
+ static_cast<unsigned int>(result.stride()),
+ beta
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_vec_mul_kernel");
+}
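The alpha/beta handling above follows the usual y = alpha * A * x + beta * y convention (the beta == 0 case is written as a plain assignment so that uninitialized entries of y are never read). A host-side reference loop over the COO entries, with illustrative names only:

  #include <cstddef>
  #include <vector>

  // Reference semantics of prod_impl above for a COO matrix given as (rows, cols, vals).
  void coo_spmv_reference(std::vector<unsigned int> const & rows,
                          std::vector<unsigned int> const & cols,
                          std::vector<double> const & vals,
                          std::vector<double> const & x,
                          double alpha, double beta,
                          std::vector<double> & y)
  {
    std::vector<double> tmp(y.size(), 0.0);
    for (std::size_t k = 0; k < vals.size(); ++k)   // accumulate A * x
      tmp[rows[k]] += vals[k] * x[cols[k]];
    for (std::size_t i = 0; i < y.size(); ++i)      // apply the alpha/beta scaling
      y[i] = alpha * tmp[i] + beta * y[i];
  }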
+
+
+
+
+template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
+__global__ void coordinate_matrix_d_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index)
+ const NumericT * elements,
+ const unsigned int * group_boundaries,
+ const NumericT * d_mat,
+ unsigned int d_mat_row_start,
+ unsigned int d_mat_col_start,
+ unsigned int d_mat_row_inc,
+ unsigned int d_mat_col_inc,
+ unsigned int d_mat_row_size,
+ unsigned int d_mat_col_size,
+ unsigned int d_mat_internal_rows,
+ unsigned int d_mat_internal_cols,
+ NumericT * result,
+ unsigned int result_row_start,
+ unsigned int result_col_start,
+ unsigned int result_row_inc,
+ unsigned int result_col_inc,
+ unsigned int result_row_size,
+ unsigned int result_col_size,
+ unsigned int result_internal_rows,
+ unsigned int result_internal_cols)
+{
+ __shared__ unsigned int shared_rows[128];
+ __shared__ NumericT inter_results[128];
+
+ uint2 tmp;
+ NumericT val;
+ unsigned int group_start = group_boundaries[blockIdx.x];
+ unsigned int group_end = group_boundaries[blockIdx.x + 1];
+ unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0; // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
+
+ unsigned int local_index = 0;
+
+ for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
+ {
+ for (unsigned int k = 0; k < k_end; ++k)
+ {
+ local_index = group_start + k * blockDim.x + threadIdx.x;
+
+ tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
+ val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(tmp.y, result_col,
+ d_mat_row_start, d_mat_row_inc,
+ d_mat_col_start, d_mat_col_inc,
+ d_mat_internal_rows, d_mat_internal_cols) ] : 0;
+
+ //check for carry from previous loop run:
+ if (threadIdx.x == 0 && k > 0)
+ {
+ if (tmp.x == shared_rows[blockDim.x-1])
+ val += inter_results[blockDim.x-1];
+ else
+ result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
+ result_row_start, result_row_inc,
+ result_col_start, result_col_inc,
+ result_internal_rows, result_internal_cols)] = inter_results[blockDim.x-1];
+ }
+
+ //segmented parallel reduction begin
+ __syncthreads();
+ shared_rows[threadIdx.x] = tmp.x;
+ inter_results[threadIdx.x] = val;
+ NumericT left = 0;
+ __syncthreads();
+
+ for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
+ {
+ left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
+ __syncthreads();
+ inter_results[threadIdx.x] += left;
+ __syncthreads();
+ }
+ //segmented parallel reduction end
+
+ if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
+ shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
+ {
+ result[ResultIndexT::apply(tmp.x, result_col,
+ result_row_start, result_row_inc,
+ result_col_start, result_col_inc,
+ result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
+ }
+
+ __syncthreads();
+ } //for k
+
+ if (local_index + 1 == group_end)
+ result[ResultIndexT::apply(tmp.x, result_col,
+ result_row_start, result_row_inc,
+ result_col_start, result_col_inc,
+ result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
+ }
+}
+
+
+/** @brief Carries out sparse matrix (COO format) - dense matrix multiplication
+*
+* Implementation of the convenience expression result = prod(sp_mat, d_mat);
+*
+* @param sp_mat The sparse matrix (coordinate format)
+* @param d_mat  The dense matrix
+* @param result The result matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & sp_mat,
+ const viennacl::matrix_base<NumericT> & d_mat,
+ viennacl::matrix_base<NumericT> & result)
+{
+ if (d_mat.row_major() && result.row_major())
+ {
+ coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<64, 128>>>
+ (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
+ viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+ viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),
+
+ viennacl::cuda_arg(d_mat),
+ static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
+ static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
+ static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
+
+ viennacl::cuda_arg(result),
+ static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
+ static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
+ static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
+ }
+ else if (d_mat.row_major() && !result.row_major())
+ {
+ coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<64, 128>>>
+ (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
+ viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+ viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),
+
+ viennacl::cuda_arg(d_mat),
+ static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
+ static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
+ static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
+
+ viennacl::cuda_arg(result),
+ static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
+ static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
+ static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
+ }
+ else if (!d_mat.row_major() && result.row_major())
+ {
+ coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<64, 128>>>
+ (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
+ viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+ viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),
+
+ viennacl::cuda_arg(d_mat),
+ static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
+ static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
+ static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
+
+ viennacl::cuda_arg(result),
+ static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
+ static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
+ static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
+ }
+ else
+ {
+ coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<64, 128>>>
+ (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
+ viennacl::cuda_arg<NumericT>(sp_mat.handle()),
+ viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),
+
+ viennacl::cuda_arg(d_mat),
+ static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
+ static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
+ static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
+
+ viennacl::cuda_arg(result),
+ static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
+ static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
+ static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
+ }
+
+}
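A minimal sketch of the convenience expression from the docstring; the row-major/column-major dispatch above is picked automatically from the layouts of d_mat and result (illustrative only, not part of the patch; matrices assumed to be filled elsewhere):

  #include "viennacl/coordinate_matrix.hpp"
  #include "viennacl/matrix.hpp"
  #include "viennacl/linalg/prod.hpp"

  // C = A * B with a sparse A in COO format and dense B, C (sizes assumed consistent).
  void coo_times_dense(viennacl::coordinate_matrix<double> const & A,
                       viennacl::matrix<double> const & B,
                       viennacl::matrix<double> & C)
  {
    C = viennacl::linalg::prod(A, B);   // selects one of the four kernel instantiations above
  }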
+
+template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
+__global__ void coordinate_matrix_d_tr_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index)
+ const NumericT * elements,
+ const unsigned int * group_boundaries,
+ const NumericT * d_mat,
+ unsigned int d_mat_row_start,
+ unsigned int d_mat_col_start,
+ unsigned int d_mat_row_inc,
+ unsigned int d_mat_col_inc,
+ unsigned int d_mat_row_size,
+ unsigned int d_mat_col_size,
+ unsigned int d_mat_internal_rows,
+ unsigned int d_mat_internal_cols,
+ NumericT * result,
+ unsigned int result_row_start,
+ unsigned int result_col_start,
+ unsigned int result_row_inc,
+ unsigned int result_col_inc,
+ unsigned int result_row_size,
+ unsigned int result_col_size,
+ unsigned int result_internal_rows,
+ unsigned int result_internal_cols)
+{
+ __shared__ unsigned int shared_rows[128];
+ __shared__ NumericT inter_results[128];
+
+ uint2 tmp;
+ NumericT val;
+ unsigned int group_start = group_boundaries[blockIdx.x];
+ unsigned int group_end = group_boundaries[blockIdx.x + 1];
+ unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0; // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
+
+ uns
<TRUNCATED>
[21/51] [partial] mahout git commit: (nojira) add native-viennaCL
module to codebase. closes apache/mahout#241
Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/iterative_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/iterative_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/iterative_operations.hpp
new file mode 100644
index 0000000..ee6626c
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/iterative_operations.hpp
@@ -0,0 +1,880 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_ITERATIVE_OPERATIONS_HPP_
+#define VIENNACL_LINALG_HOST_BASED_ITERATIVE_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/iterative_operations.hpp
+ @brief Implementations of specialized kernels for fast iterative solvers using OpenMP on the CPU
+*/
+
+#include <cmath>
+#include <algorithm> //for std::max and std::min
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/linalg/host_based/common.hpp"
+#include "viennacl/linalg/detail/op_applier.hpp"
+#include "viennacl/traits/stride.hpp"
+
+
+// Minimum vector size for using OpenMP on vector operations:
+#ifndef VIENNACL_OPENMP_VECTOR_MIN_SIZE
+ #define VIENNACL_OPENMP_VECTOR_MIN_SIZE 5000
+#endif
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+
+namespace detail
+{
+ /** @brief Implementation of a fused matrix-vector product with a compressed_matrix for an efficient pipelined CG algorithm.
+ *
+ * This routine computes, for a matrix A and vectors 'p', 'Ap', and 'r0':
+ * Ap = prod(A, p);
+ * and computes the reduction stages for inner_prod(p,Ap), inner_prod(Ap,Ap), and inner_prod(Ap, r0)
+ */
+ template<typename NumericT>
+ void pipelined_prod_impl(compressed_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ NumericT const * r0star,
+ vector_base<NumericT> & inner_prod_buffer,
+ vcl_size_t buffer_chunk_size,
+ vcl_size_t buffer_chunk_offset)
+ {
+ typedef NumericT value_type;
+
+ value_type * Ap_buf = detail::extract_raw_pointer<value_type>(Ap.handle()) + viennacl::traits::start(Ap);
+ value_type const * p_buf = detail::extract_raw_pointer<value_type>(p.handle()) + viennacl::traits::start(p);
+ value_type const * elements = detail::extract_raw_pointer<value_type>(A.handle());
+ unsigned int const * row_buffer = detail::extract_raw_pointer<unsigned int>(A.handle1());
+ unsigned int const * col_buffer = detail::extract_raw_pointer<unsigned int>(A.handle2());
+ value_type * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
+
+ value_type inner_prod_ApAp = 0;
+ value_type inner_prod_pAp = 0;
+ value_type inner_prod_Ap_r0star = 0;
+ for (long row = 0; row < static_cast<long>(A.size1()); ++row)
+ {
+ value_type dot_prod = 0;
+ value_type val_p_diag = p_buf[static_cast<vcl_size_t>(row)]; //likely to be loaded from cache if required again in this row
+
+ vcl_size_t row_end = row_buffer[row+1];
+ for (vcl_size_t i = row_buffer[row]; i < row_end; ++i)
+ dot_prod += elements[i] * p_buf[col_buffer[i]];
+
+ // update contributions for the inner products (Ap, Ap) and (p, Ap)
+ Ap_buf[static_cast<vcl_size_t>(row)] = dot_prod;
+ inner_prod_ApAp += dot_prod * dot_prod;
+ inner_prod_pAp += val_p_diag * dot_prod;
+ inner_prod_Ap_r0star += r0star ? dot_prod * r0star[static_cast<vcl_size_t>(row)] : value_type(0);
+ }
+
+ data_buffer[ buffer_chunk_size] = inner_prod_ApAp;
+ data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
+ if (r0star)
+ data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
+ }
+
+
+
+ /** @brief Implementation of a fused matrix-vector product with a coordinate_matrix for an efficient pipelined CG algorithm.
+ *
+ * This routine computes, for a matrix A and vectors 'p', 'Ap', and 'r0':
+ * Ap = prod(A, p);
+ * and computes the reduction stages for inner_prod(p,Ap), inner_prod(Ap,Ap), and inner_prod(Ap, r0)
+ */
+ template<typename NumericT>
+ void pipelined_prod_impl(coordinate_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ NumericT const * r0star,
+ vector_base<NumericT> & inner_prod_buffer,
+ vcl_size_t buffer_chunk_size,
+ vcl_size_t buffer_chunk_offset)
+ {
+ typedef NumericT value_type;
+
+ value_type * Ap_buf = detail::extract_raw_pointer<value_type>(Ap.handle()) + viennacl::traits::start(Ap);
+ value_type const * p_buf = detail::extract_raw_pointer<value_type>(p.handle()) + viennacl::traits::start(p);
+ value_type const * elements = detail::extract_raw_pointer<value_type>(A.handle());
+ unsigned int const * coord_buffer = detail::extract_raw_pointer<unsigned int>(A.handle12());
+ value_type * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
+
+ // flush result buffer (cannot be expected to be zero)
+ for (vcl_size_t i = 0; i< Ap.size(); ++i)
+ Ap_buf[i] = 0;
+
+ // matrix-vector product with a general COO format
+ for (vcl_size_t i = 0; i < A.nnz(); ++i)
+ Ap_buf[coord_buffer[2*i]] += elements[i] * p_buf[coord_buffer[2*i+1]];
+
+ // computing the inner products (Ap, Ap) and (p, Ap):
+ // Note: The COO format does not allow injecting the subsequent operations into the matrix-vector product, because the row and column ordering assumptions are too weak
+ value_type inner_prod_ApAp = 0;
+ value_type inner_prod_pAp = 0;
+ value_type inner_prod_Ap_r0star = 0;
+ for (vcl_size_t i = 0; i<Ap.size(); ++i)
+ {
+ NumericT value_Ap = Ap_buf[i];
+ NumericT value_p = p_buf[i];
+
+ inner_prod_ApAp += value_Ap * value_Ap;
+ inner_prod_pAp += value_Ap * value_p;
+ inner_prod_Ap_r0star += r0star ? value_Ap * r0star[i] : value_type(0);
+ }
+
+ data_buffer[ buffer_chunk_size] = inner_prod_ApAp;
+ data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
+ if (r0star)
+ data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
+ }
+
+
+ /** @brief Implementation of a fused matrix-vector product with an ell_matrix for an efficient pipelined CG algorithm.
+ *
+ * This routine computes, for a matrix A and vectors 'p', 'Ap', and 'r0':
+ * Ap = prod(A, p);
+ * and computes the reduction stages for inner_prod(p,Ap), inner_prod(Ap,Ap), and inner_prod(Ap, r0)
+ */
+ template<typename NumericT>
+ void pipelined_prod_impl(ell_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ NumericT const * r0star,
+ vector_base<NumericT> & inner_prod_buffer,
+ vcl_size_t buffer_chunk_size,
+ vcl_size_t buffer_chunk_offset)
+ {
+ typedef NumericT value_type;
+
+ value_type * Ap_buf = detail::extract_raw_pointer<value_type>(Ap.handle()) + viennacl::traits::start(Ap);
+ value_type const * p_buf = detail::extract_raw_pointer<value_type>(p.handle()) + viennacl::traits::start(p);
+ value_type const * elements = detail::extract_raw_pointer<value_type>(A.handle());
+ unsigned int const * coords = detail::extract_raw_pointer<unsigned int>(A.handle2());
+ value_type * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
+
+ value_type inner_prod_ApAp = 0;
+ value_type inner_prod_pAp = 0;
+ value_type inner_prod_Ap_r0star = 0;
+ for (vcl_size_t row = 0; row < A.size1(); ++row)
+ {
+ value_type sum = 0;
+ value_type val_p_diag = p_buf[static_cast<vcl_size_t>(row)]; //likely to be loaded from cache if required again in this row
+
+ for (unsigned int item_id = 0; item_id < A.internal_maxnnz(); ++item_id)
+ {
+ vcl_size_t offset = row + item_id * A.internal_size1();
+ value_type val = elements[offset];
+
+ if (val)
+ sum += (p_buf[coords[offset]] * val);
+ }
+
+ Ap_buf[row] = sum;
+ inner_prod_ApAp += sum * sum;
+ inner_prod_pAp += val_p_diag * sum;
+ inner_prod_Ap_r0star += r0star ? sum * r0star[row] : value_type(0);
+ }
+
+ data_buffer[ buffer_chunk_size] = inner_prod_ApAp;
+ data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
+ if (r0star)
+ data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
+ }
+
+
+ /** @brief Implementation of a fused matrix-vector product with a sliced_ell_matrix for an efficient pipelined CG algorithm.
+ *
+ * This routine computes, for a matrix A and vectors 'p', 'Ap', and 'r0':
+ * Ap = prod(A, p);
+ * and computes the reduction stages for inner_prod(p,Ap), inner_prod(Ap,Ap), and inner_prod(Ap, r0)
+ */
+ template<typename NumericT, typename IndexT>
+ void pipelined_prod_impl(sliced_ell_matrix<NumericT, IndexT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ NumericT const * r0star,
+ vector_base<NumericT> & inner_prod_buffer,
+ vcl_size_t buffer_chunk_size,
+ vcl_size_t buffer_chunk_offset)
+ {
+ typedef NumericT value_type;
+
+ value_type * Ap_buf = detail::extract_raw_pointer<value_type>(Ap.handle()) + viennacl::traits::start(Ap);
+ value_type const * p_buf = detail::extract_raw_pointer<value_type>(p.handle()) + viennacl::traits::start(p);
+ value_type const * elements = detail::extract_raw_pointer<value_type>(A.handle());
+ IndexT const * columns_per_block = detail::extract_raw_pointer<IndexT>(A.handle1());
+ IndexT const * column_indices = detail::extract_raw_pointer<IndexT>(A.handle2());
+ IndexT const * block_start = detail::extract_raw_pointer<IndexT>(A.handle3());
+ value_type * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
+
+ vcl_size_t num_blocks = A.size1() / A.rows_per_block() + 1;
+ std::vector<value_type> result_values(A.rows_per_block());
+
+ value_type inner_prod_ApAp = 0;
+ value_type inner_prod_pAp = 0;
+ value_type inner_prod_Ap_r0star = 0;
+ for (vcl_size_t block_idx = 0; block_idx < num_blocks; ++block_idx)
+ {
+ vcl_size_t current_columns_per_block = columns_per_block[block_idx];
+
+ for (vcl_size_t i=0; i<result_values.size(); ++i)
+ result_values[i] = 0;
+
+ for (IndexT column_entry_index = 0;
+ column_entry_index < current_columns_per_block;
+ ++column_entry_index)
+ {
+ vcl_size_t stride_start = block_start[block_idx] + column_entry_index * A.rows_per_block();
+ // Note: This for-loop may be unrolled by hand for exploiting vectorization
+ // Careful benchmarking recommended first, memory channels may be saturated already!
+ for (IndexT row_in_block = 0; row_in_block < A.rows_per_block(); ++row_in_block)
+ {
+ value_type val = elements[stride_start + row_in_block];
+
+ result_values[row_in_block] += val ? p_buf[column_indices[stride_start + row_in_block]] * val : 0;
+ }
+ }
+
+ vcl_size_t first_row_in_matrix = block_idx * A.rows_per_block();
+ for (IndexT row_in_block = 0; row_in_block < A.rows_per_block(); ++row_in_block)
+ {
+ vcl_size_t row = first_row_in_matrix + row_in_block;
+ if (row < Ap.size())
+ {
+ value_type row_result = result_values[row_in_block];
+
+ Ap_buf[row] = row_result;
+ inner_prod_ApAp += row_result * row_result;
+ inner_prod_pAp += p_buf[row] * row_result;
+ inner_prod_Ap_r0star += r0star ? row_result * r0star[row] : value_type(0);
+ }
+ }
+ }
+
+ data_buffer[ buffer_chunk_size] = inner_prod_ApAp;
+ data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
+ if (r0star)
+ data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
+ }
+
+
+ /** @brief Implementation of a fused matrix-vector product with a hyb_matrix for an efficient pipelined CG algorithm.
+ *
+ * This routine computes, for a matrix A and vectors 'p', 'Ap', and 'r0':
+ * Ap = prod(A, p);
+ * and computes the reduction stages for inner_prod(p,Ap), inner_prod(Ap,Ap), and inner_prod(Ap, r0)
+ */
+ template<typename NumericT>
+ void pipelined_prod_impl(hyb_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ NumericT const * r0star,
+ vector_base<NumericT> & inner_prod_buffer,
+ vcl_size_t buffer_chunk_size,
+ vcl_size_t buffer_chunk_offset)
+ {
+ typedef NumericT value_type;
+ typedef unsigned int index_type;
+
+ value_type * Ap_buf = detail::extract_raw_pointer<value_type>(Ap.handle()) + viennacl::traits::start(Ap);
+ value_type const * p_buf = detail::extract_raw_pointer<value_type>(p.handle()) + viennacl::traits::start(p);
+ value_type const * elements = detail::extract_raw_pointer<value_type>(A.handle());
+ index_type const * coords = detail::extract_raw_pointer<index_type>(A.handle2());
+ value_type const * csr_elements = detail::extract_raw_pointer<value_type>(A.handle5());
+ index_type const * csr_row_buffer = detail::extract_raw_pointer<index_type>(A.handle3());
+ index_type const * csr_col_buffer = detail::extract_raw_pointer<index_type>(A.handle4());
+ value_type * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
+
+ value_type inner_prod_ApAp = 0;
+ value_type inner_prod_pAp = 0;
+ value_type inner_prod_Ap_r0star = 0;
+ for (vcl_size_t row = 0; row < A.size1(); ++row)
+ {
+ value_type val_p_diag = p_buf[static_cast<vcl_size_t>(row)]; //likely to be loaded from cache if required again in this row
+ value_type sum = 0;
+
+ //
+ // Part 1: Process ELL part
+ //
+ for (index_type item_id = 0; item_id < A.internal_ellnnz(); ++item_id)
+ {
+ vcl_size_t offset = row + item_id * A.internal_size1();
+ value_type val = elements[offset];
+
+ if (val)
+ sum += p_buf[coords[offset]] * val;
+ }
+
+ //
+ // Part 2: Process CSR part (entries that did not fit into the ELL part)
+ //
+ vcl_size_t col_begin = csr_row_buffer[row];
+ vcl_size_t col_end = csr_row_buffer[row + 1];
+
+ for (vcl_size_t item_id = col_begin; item_id < col_end; item_id++)
+ sum += p_buf[csr_col_buffer[item_id]] * csr_elements[item_id];
+
+ Ap_buf[row] = sum;
+ inner_prod_ApAp += sum * sum;
+ inner_prod_pAp += val_p_diag * sum;
+ inner_prod_Ap_r0star += r0star ? sum * r0star[row] : value_type(0);
+ }
+
+ data_buffer[ buffer_chunk_size] = inner_prod_ApAp;
+ data_buffer[2 * buffer_chunk_size] = inner_prod_pAp;
+ if (r0star)
+ data_buffer[buffer_chunk_offset] = inner_prod_Ap_r0star;
+ }
+
+} // namespace detail
+
+
+/** @brief Performs a joint vector update operation needed for an efficient pipelined CG algorithm.
+ *
+ * This routine computes, for vectors 'result', 'p', 'r', 'Ap':
+ * result += alpha * p;
+ * r -= alpha * Ap;
+ * p = r + beta * p;
+ * and runs the parallel reduction stage for computing inner_prod(r,r)
+ */
+template<typename NumericT>
+void pipelined_cg_vector_update(vector_base<NumericT> & result,
+ NumericT alpha,
+ vector_base<NumericT> & p,
+ vector_base<NumericT> & r,
+ vector_base<NumericT> const & Ap,
+ NumericT beta,
+ vector_base<NumericT> & inner_prod_buffer)
+{
+ typedef NumericT value_type;
+
+ value_type * data_result = detail::extract_raw_pointer<value_type>(result);
+ value_type * data_p = detail::extract_raw_pointer<value_type>(p);
+ value_type * data_r = detail::extract_raw_pointer<value_type>(r);
+ value_type const * data_Ap = detail::extract_raw_pointer<value_type>(Ap);
+ value_type * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
+
+ // Note: Due to the special setting in CG, there is no need to check for sizes and strides
+ vcl_size_t size = viennacl::traits::size(result);
+
+ value_type inner_prod_r = 0;
+ for (long i = 0; i < static_cast<long>(size); ++i)
+ {
+ value_type value_p = data_p[static_cast<vcl_size_t>(i)];
+ value_type value_r = data_r[static_cast<vcl_size_t>(i)];
+
+
+ data_result[static_cast<vcl_size_t>(i)] += alpha * value_p;
+ value_r -= alpha * data_Ap[static_cast<vcl_size_t>(i)];
+ value_p = value_r + beta * value_p;
+ inner_prod_r += value_r * value_r;
+
+ data_p[static_cast<vcl_size_t>(i)] = value_p;
+ data_r[static_cast<vcl_size_t>(i)] = value_r;
+ }
+
+ data_buffer[0] = inner_prod_r;
+}
+
+
+/** @brief Performs a fused matrix-vector product with a compressed_matrix for an efficient pipelined CG algorithm.
+ *
+ * This routine computes, for a matrix A and vectors 'p' and 'Ap':
+ * Ap = prod(A, p);
+ * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap)
+ */
+template<typename NumericT>
+void pipelined_cg_prod(compressed_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> & inner_prod_buffer)
+{
+ typedef NumericT const * PtrType;
+ viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, PtrType(NULL), inner_prod_buffer, inner_prod_buffer.size() / 3, 0);
+}
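The inner_prod_buffer is split into three equally sized chunks here: judging from the indices used above and in pipelined_cg_vector_update, chunk 0 carries <r,r>, chunk 1 carries <Ap,Ap>, and chunk 2 carries <p,Ap>. A sketch of how a caller could read these scalars back on this host backend (the helper name is illustrative; GPU backends store per-workgroup partial sums in each chunk that still have to be summed):

  #include <cstddef>
  #include <vector>
  #include "viennacl/vector.hpp"

  // Reads the three CG scalars from the reduction buffer (host backend: one value per chunk).
  template<typename T>
  void read_cg_scalars(viennacl::vector<T> const & inner_prod_buffer,
                       T & rr, T & ApAp, T & pAp)
  {
    std::vector<T> host(inner_prod_buffer.size());
    viennacl::copy(inner_prod_buffer, host);
    std::size_t chunk = host.size() / 3;
    rr   = host[0];
    ApAp = host[chunk];
    pAp  = host[2 * chunk];
  }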
+
+
+
+/** @brief Performs a fused matrix-vector product with a coordinate_matrix for an efficient pipelined CG algorithm.
+ *
+ * This routine computes, for a matrix A and vectors 'p' and 'Ap':
+ * Ap = prod(A, p);
+ * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap)
+ */
+template<typename NumericT>
+void pipelined_cg_prod(coordinate_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> & inner_prod_buffer)
+{
+ typedef NumericT const * PtrType;
+ viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, PtrType(NULL), inner_prod_buffer, inner_prod_buffer.size() / 3, 0);
+}
+
+
+/** @brief Performs a fused matrix-vector product with an ell_matrix for an efficient pipelined CG algorithm.
+ *
+ * This routine computes, for a matrix A and vectors 'p' and 'Ap':
+ * Ap = prod(A, p);
+ * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap)
+ */
+template<typename NumericT>
+void pipelined_cg_prod(ell_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> & inner_prod_buffer)
+{
+ typedef NumericT const * PtrType;
+ viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, PtrType(NULL), inner_prod_buffer, inner_prod_buffer.size() / 3, 0);
+}
+
+
+/** @brief Performs a fused matrix-vector product with a sliced_ell_matrix for an efficient pipelined CG algorithm.
+ *
+ * This routine computes, for a matrix A and vectors 'p' and 'Ap':
+ * Ap = prod(A, p);
+ * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap)
+ */
+template<typename NumericT, typename IndexT>
+void pipelined_cg_prod(sliced_ell_matrix<NumericT, IndexT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> & inner_prod_buffer)
+{
+ typedef NumericT const * PtrType;
+ viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, PtrType(NULL), inner_prod_buffer, inner_prod_buffer.size() / 3, 0);
+}
+
+
+
+
+/** @brief Performs a fused matrix-vector product with a hyb_matrix for an efficient pipelined CG algorithm.
+ *
+ * This routine computes, for a matrix A and vectors 'p' and 'Ap':
+ * Ap = prod(A, p);
+ * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap)
+ */
+template<typename NumericT>
+void pipelined_cg_prod(hyb_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> & inner_prod_buffer)
+{
+ typedef NumericT const * PtrType;
+ viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, PtrType(NULL), inner_prod_buffer, inner_prod_buffer.size() / 3, 0);
+}
+
+//////////////////////////
+
+
+/** @brief Performs a joint vector update operation needed for an efficient pipelined BiCGStab algorithm.
+ *
+ * This routine computes, for vectors 's', 'r', 'Ap':
+ * s = r - alpha * Ap
+ * with alpha obtained from a reduction step on the 0th and the 3rd out of 6 chunks in inner_prod_buffer
+ * and runs the parallel reduction stage for computing inner_prod(s,s)
+ */
+template<typename NumericT>
+void pipelined_bicgstab_update_s(vector_base<NumericT> & s,
+ vector_base<NumericT> & r,
+ vector_base<NumericT> const & Ap,
+ vector_base<NumericT> & inner_prod_buffer,
+ vcl_size_t buffer_chunk_size,
+ vcl_size_t buffer_chunk_offset)
+{
+ typedef NumericT value_type;
+
+ value_type * data_s = detail::extract_raw_pointer<value_type>(s);
+ value_type * data_r = detail::extract_raw_pointer<value_type>(r);
+ value_type const * data_Ap = detail::extract_raw_pointer<value_type>(Ap);
+ value_type * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
+
+ // Note: Due to the special setting in BiCGStab, there is no need to check for sizes and strides
+ vcl_size_t size = viennacl::traits::size(s);
+
+ // part 1: compute alpha:
+ value_type r_in_r0 = 0;
+ value_type Ap_in_r0 = 0;
+ for (vcl_size_t i=0; i<buffer_chunk_size; ++i)
+ {
+ r_in_r0 += data_buffer[i];
+ Ap_in_r0 += data_buffer[i + 3 * buffer_chunk_size];
+ }
+ value_type alpha = r_in_r0 / Ap_in_r0;
+
+ // part 2: s = r - alpha * Ap and first step in reduction for s:
+ value_type inner_prod_s = 0;
+ for (long i = 0; i < static_cast<long>(size); ++i)
+ {
+ value_type value_s = data_s[static_cast<vcl_size_t>(i)];
+
+ value_s = data_r[static_cast<vcl_size_t>(i)] - alpha * data_Ap[static_cast<vcl_size_t>(i)];
+ inner_prod_s += value_s * value_s;
+
+ data_s[static_cast<vcl_size_t>(i)] = value_s;
+ }
+
+ data_buffer[buffer_chunk_offset] = inner_prod_s;
+}
+
+/** @brief Performs a joint vector update operation needed for an efficient pipelined BiCGStab algorithm.
+ *
+ * x_{j+1} = x_j + alpha * p_j + omega * s_j
+ * r_{j+1} = s_j - omega * t_j
+ * p_{j+1} = r_{j+1} + beta * (p_j - omega * q_j)
+ * and computes the first stage of r_dot_r0 = <r_{j+1}, r_0^*> for use in the next iteration
+ */
+ template<typename NumericT>
+ void pipelined_bicgstab_vector_update(vector_base<NumericT> & result, NumericT alpha, vector_base<NumericT> & p, NumericT omega, vector_base<NumericT> const & s,
+ vector_base<NumericT> & residual, vector_base<NumericT> const & As,
+ NumericT beta, vector_base<NumericT> const & Ap,
+ vector_base<NumericT> const & r0star,
+ vector_base<NumericT> & inner_prod_buffer,
+ vcl_size_t buffer_chunk_size)
+ {
+ typedef NumericT value_type;
+
+ value_type * data_result = detail::extract_raw_pointer<value_type>(result);
+ value_type * data_p = detail::extract_raw_pointer<value_type>(p);
+ value_type const * data_s = detail::extract_raw_pointer<value_type>(s);
+ value_type * data_residual = detail::extract_raw_pointer<value_type>(residual);
+ value_type const * data_As = detail::extract_raw_pointer<value_type>(As);
+ value_type const * data_Ap = detail::extract_raw_pointer<value_type>(Ap);
+ value_type const * data_r0star = detail::extract_raw_pointer<value_type>(r0star);
+ value_type * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
+
+ vcl_size_t size = viennacl::traits::size(result);
+
+ value_type inner_prod_r_r0star = 0;
+ for (long i = 0; i < static_cast<long>(size); ++i)
+ {
+ vcl_size_t index = static_cast<vcl_size_t>(i);
+ value_type value_result = data_result[index];
+ value_type value_p = data_p[index];
+ value_type value_s = data_s[index];
+ value_type value_residual = data_residual[index];
+ value_type value_As = data_As[index];
+ value_type value_Ap = data_Ap[index];
+ value_type value_r0star = data_r0star[index];
+
+ value_result += alpha * value_p + omega * value_s;
+ value_residual = value_s - omega * value_As;
+ value_p = value_residual + beta * (value_p - omega * value_Ap);
+ inner_prod_r_r0star += value_residual * value_r0star;
+
+ data_result[index] = value_result;
+ data_residual[index] = value_residual;
+ data_p[index] = value_p;
+ }
+
+ (void)buffer_chunk_size; // not needed here, just silence compiler warning (unused variable)
+ data_buffer[0] = inner_prod_r_r0star;
+ }
+
+ /** @brief Performs a fused matrix-vector product with a compressed_matrix for an efficient pipelined BiCGStab algorithm.
+ *
+ * This routine computes, for a matrix A and vectors 'p', 'Ap', and 'r0':
+ * Ap = prod(A, p);
+ * and computes the reduction stages for inner_prod(p,Ap), inner_prod(Ap,Ap), and inner_prod(Ap, r0)
+ */
+ template<typename NumericT>
+ void pipelined_bicgstab_prod(compressed_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> const & r0star,
+ vector_base<NumericT> & inner_prod_buffer,
+ vcl_size_t buffer_chunk_size,
+ vcl_size_t buffer_chunk_offset)
+ {
+ NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);
+
+ viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, data_r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
+ }
+
+ /** @brief Performs a fused matrix-vector product with a coordinate_matrix for an efficient pipelined BiCGStab algorithm.
+ *
+ * This routine computes, for a matrix A and vectors 'p', 'Ap', and 'r0':
+ * Ap = prod(A, p);
+ * and computes the reduction stages for inner_prod(p,Ap), inner_prod(Ap,Ap), and inner_prod(Ap, r0)
+ */
+ template<typename NumericT>
+ void pipelined_bicgstab_prod(coordinate_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> const & r0star,
+ vector_base<NumericT> & inner_prod_buffer,
+ vcl_size_t buffer_chunk_size,
+ vcl_size_t buffer_chunk_offset)
+ {
+ NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);
+
+ viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, data_r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
+ }
+
+ /** @brief Performs a fused matrix-vector product with an ell_matrix for an efficient pipelined BiCGStab algorithm.
+ *
+ * This routine computes, for a matrix A and vectors 'p', 'Ap', and 'r0':
+ * Ap = prod(A, p);
+ * and computes the reduction stages for inner_prod(p,Ap), inner_prod(Ap,Ap), and inner_prod(Ap, r0)
+ */
+ template<typename NumericT>
+ void pipelined_bicgstab_prod(ell_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> const & r0star,
+ vector_base<NumericT> & inner_prod_buffer,
+ vcl_size_t buffer_chunk_size,
+ vcl_size_t buffer_chunk_offset)
+ {
+ NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);
+
+ viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, data_r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
+ }
+
+ /** @brief Performs a fused matrix-vector product with a sliced_ell_matrix for an efficient pipelined BiCGStab algorithm.
+ *
+ * This routine computes, for a matrix A and vectors 'p', 'Ap', and 'r0':
+ * Ap = prod(A, p);
+ * and computes the reduction stages for inner_prod(p,Ap), inner_prod(Ap,Ap), and inner_prod(Ap, r0)
+ */
+ template<typename NumericT, typename IndexT>
+ void pipelined_bicgstab_prod(sliced_ell_matrix<NumericT, IndexT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> const & r0star,
+ vector_base<NumericT> & inner_prod_buffer,
+ vcl_size_t buffer_chunk_size,
+ vcl_size_t buffer_chunk_offset)
+ {
+ NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);
+
+ viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, data_r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
+ }
+
+ /** @brief Performs a fused matrix-vector product with a hyb_matrix for an efficient pipelined BiCGStab algorithm.
+ *
+ * This routine computes, for a matrix A and vectors 'p', 'Ap', and 'r0':
+ * Ap = prod(A, p);
+ * and computes the reduction stages for inner_prod(p,Ap), inner_prod(Ap,Ap), and inner_prod(Ap, r0)
+ */
+ template<typename NumericT>
+ void pipelined_bicgstab_prod(hyb_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> const & r0star,
+ vector_base<NumericT> & inner_prod_buffer,
+ vcl_size_t buffer_chunk_size,
+ vcl_size_t buffer_chunk_offset)
+ {
+ NumericT const * data_r0star = detail::extract_raw_pointer<NumericT>(r0star);
+
+ viennacl::linalg::host_based::detail::pipelined_prod_impl(A, p, Ap, data_r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
+ }
+
+
+/////////////////////////////////////////////////////////////
+
+/** @brief Performs a vector normalization needed for an efficient pipelined GMRES algorithm.
+ *
+ * This routine computes, for vectors 'r', 'v_k':
+ * Second reduction step for ||v_k||
+ * v_k /= ||v_k||
+ * First reduction step for <r, v_k>
+ */
+template <typename T>
+void pipelined_gmres_normalize_vk(vector_base<T> & v_k,
+ vector_base<T> const & residual,
+ vector_base<T> & R_buffer,
+ vcl_size_t offset_in_R,
+ vector_base<T> const & inner_prod_buffer,
+ vector_base<T> & r_dot_vk_buffer,
+ vcl_size_t buffer_chunk_size,
+ vcl_size_t buffer_chunk_offset)
+{
+ typedef T value_type;
+
+ value_type * data_v_k = detail::extract_raw_pointer<value_type>(v_k);
+ value_type const * data_residual = detail::extract_raw_pointer<value_type>(residual);
+ value_type * data_R = detail::extract_raw_pointer<value_type>(R_buffer);
+ value_type const * data_buffer = detail::extract_raw_pointer<value_type>(inner_prod_buffer);
+ value_type * data_r_dot_vk = detail::extract_raw_pointer<value_type>(r_dot_vk_buffer);
+
+ // Note: Due to the special setting in GMRES, there is no need to check for sizes and strides
+ vcl_size_t size = viennacl::traits::size(v_k);
+ vcl_size_t vk_start = viennacl::traits::start(v_k);
+
+ // part 1: compute alpha:
+ value_type norm_vk = 0;
+ for (vcl_size_t i=0; i<buffer_chunk_size; ++i)
+ norm_vk += data_buffer[i + buffer_chunk_size];
+ norm_vk = std::sqrt(norm_vk);
+ data_R[offset_in_R] = norm_vk;
+
+ // Compute <r, v_k> after normalization of v_k:
+ value_type inner_prod_r_dot_vk = 0;
+ for (long i = 0; i < static_cast<long>(size); ++i)
+ {
+ value_type value_vk = data_v_k[static_cast<vcl_size_t>(i) + vk_start] / norm_vk;
+
+ inner_prod_r_dot_vk += data_residual[static_cast<vcl_size_t>(i)] * value_vk;
+
+ data_v_k[static_cast<vcl_size_t>(i) + vk_start] = value_vk;
+ }
+
+ data_r_dot_vk[buffer_chunk_offset] = inner_prod_r_dot_vk;
+}
+
+
+
+/** @brief Computes first reduction stage for multiple inner products <v_i, v_k>, i=0..k-1
+ *
+ * All vectors v_i are stored column-major in the array 'device_krylov_basis', where each vector has an actual length 'v_k_size', but might be padded to have 'v_k_internal_size'
+ */
+template <typename T>
+void pipelined_gmres_gram_schmidt_stage1(vector_base<T> const & device_krylov_basis,
+ vcl_size_t v_k_size,
+ vcl_size_t v_k_internal_size,
+ vcl_size_t k,
+ vector_base<T> & vi_in_vk_buffer,
+ vcl_size_t buffer_chunk_size)
+{
+ typedef T value_type;
+
+ value_type const * data_krylov_basis = detail::extract_raw_pointer<value_type>(device_krylov_basis);
+ value_type * data_inner_prod = detail::extract_raw_pointer<value_type>(vi_in_vk_buffer);
+
+ // reset buffer:
+ for (vcl_size_t j = 0; j < k; ++j)
+ data_inner_prod[j*buffer_chunk_size] = value_type(0);
+
+ // compute inner products:
+ for (vcl_size_t i = 0; i < v_k_size; ++i)
+ {
+ value_type value_vk = data_krylov_basis[static_cast<vcl_size_t>(i) + k * v_k_internal_size];
+
+ for (vcl_size_t j = 0; j < k; ++j)
+ data_inner_prod[j*buffer_chunk_size] += data_krylov_basis[static_cast<vcl_size_t>(i) + j * v_k_internal_size] * value_vk;
+ }
+}
+
+
+/** @brief Computes the second reduction stage for multiple inner products <v_i, v_k>, i=0..k-1, then updates v_k -= <v_i, v_k> v_i and computes the first reduction stage for ||v_k||
+ *
+ * All vectors v_i are stored column-major in the array 'device_krylov_basis', where each vector has an actual length 'v_k_size', but might be padded to have 'v_k_internal_size'
+ */
+template <typename T>
+void pipelined_gmres_gram_schmidt_stage2(vector_base<T> & device_krylov_basis,
+ vcl_size_t v_k_size,
+ vcl_size_t v_k_internal_size,
+ vcl_size_t k,
+ vector_base<T> const & vi_in_vk_buffer,
+ vector_base<T> & R_buffer,
+ vcl_size_t krylov_dim,
+ vector_base<T> & inner_prod_buffer,
+ vcl_size_t buffer_chunk_size)
+{
+ typedef T value_type;
+
+ value_type * data_krylov_basis = detail::extract_raw_pointer<value_type>(device_krylov_basis);
+
+ std::vector<T> values_vi_in_vk(k);
+
+ // Step 1: Finish reduction of <v_i, v_k> to obtain scalars:
+ for (std::size_t i=0; i<k; ++i)
+ for (vcl_size_t j=0; j<buffer_chunk_size; ++j)
+ values_vi_in_vk[i] += vi_in_vk_buffer[i*buffer_chunk_size + j];
+
+
+ // Step 2: Compute v_k -= <v_i, v_k> v_i and reduction on ||v_k||:
+ value_type norm_vk = 0;
+ for (vcl_size_t i = 0; i < v_k_size; ++i)
+ {
+ value_type value_vk = data_krylov_basis[static_cast<vcl_size_t>(i) + k * v_k_internal_size];
+
+ for (vcl_size_t j = 0; j < k; ++j)
+ value_vk -= values_vi_in_vk[j] * data_krylov_basis[static_cast<vcl_size_t>(i) + j * v_k_internal_size];
+
+ norm_vk += value_vk * value_vk;
+ data_krylov_basis[static_cast<vcl_size_t>(i) + k * v_k_internal_size] = value_vk;
+ }
+
+ // Step 3: Write values to R_buffer:
+ for (std::size_t i=0; i<k; ++i)
+ R_buffer[i + k * krylov_dim] = values_vi_in_vk[i];
+
+ inner_prod_buffer[buffer_chunk_size] = norm_vk;
+}
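Stages 1 and 2 together amount to one classical (not modified) Gram-Schmidt sweep of v_k against v_0..v_{k-1}, followed by the first reduction step for ||v_k||. A compact host reference, with illustrative names only:

  #include <cstddef>
  #include <vector>

  // Classical Gram-Schmidt of column k of a column-major basis (leading dimension ld)
  // against columns 0..k-1; the projections <v_i, v_k> are returned in R_col (sketch).
  void gram_schmidt_reference(std::vector<double> & basis,
                              std::size_t n, std::size_t ld, std::size_t k,
                              std::vector<double> & R_col)
  {
    for (std::size_t i = 0; i < k; ++i)            // stage 1: inner products <v_i, v_k>
    {
      double vi_in_vk = 0;
      for (std::size_t row = 0; row < n; ++row)
        vi_in_vk += basis[row + i*ld] * basis[row + k*ld];
      R_col[i] = vi_in_vk;
    }
    for (std::size_t row = 0; row < n; ++row)      // stage 2: v_k -= <v_i, v_k> * v_i
      for (std::size_t i = 0; i < k; ++i)
        basis[row + k*ld] -= R_col[i] * basis[row + i*ld];
  }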
+
+/** @brief Computes x += eta_0 r + sum_{i=1}^{k-1} eta_i v_{i-1} */
+template <typename T>
+void pipelined_gmres_update_result(vector_base<T> & result,
+ vector_base<T> const & residual,
+ vector_base<T> const & krylov_basis,
+ vcl_size_t v_k_size,
+ vcl_size_t v_k_internal_size,
+ vector_base<T> const & coefficients,
+ vcl_size_t k)
+{
+ typedef T value_type;
+
+ value_type * data_result = detail::extract_raw_pointer<value_type>(result);
+ value_type const * data_residual = detail::extract_raw_pointer<value_type>(residual);
+ value_type const * data_krylov_basis = detail::extract_raw_pointer<value_type>(krylov_basis);
+ value_type const * data_coefficients = detail::extract_raw_pointer<value_type>(coefficients);
+
+ for (vcl_size_t i = 0; i < v_k_size; ++i)
+ {
+ value_type value_result = data_result[i];
+
+ value_result += data_coefficients[0] * data_residual[i];
+ for (vcl_size_t j = 1; j<k; ++j)
+ value_result += data_coefficients[j] * data_krylov_basis[i + (j-1) * v_k_internal_size];
+
+ data_result[i] = value_result;
+ }
+
+}
+
+// Reuse implementation from CG:
+template <typename MatrixType, typename T>
+void pipelined_gmres_prod(MatrixType const & A,
+ vector_base<T> const & p,
+ vector_base<T> & Ap,
+ vector_base<T> & inner_prod_buffer)
+{
+ pipelined_cg_prod(A, p, Ap, inner_prod_buffer);
+}
+
+
+} //namespace host_based
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
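For context, the host kernels in this file are reached through the regular iterative-solver front end; a minimal sketch using the standard viennacl::linalg::solve interface (tolerance and iteration cap are illustrative values, and the claim that the unpreconditioned path uses the pipelined routines above is an assumption based on the function names):

  #include "viennacl/compressed_matrix.hpp"
  #include "viennacl/vector.hpp"
  #include "viennacl/linalg/cg.hpp"

  // Solve A x = b with conjugate gradients on whichever backend is active.
  viennacl::vector<double> solve_with_cg(viennacl::compressed_matrix<double> const & A,
                                         viennacl::vector<double> const & b)
  {
    viennacl::linalg::cg_tag tag(1e-8, 300);
    return viennacl::linalg::solve(A, b, tag);
  }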
[26/51] [partial] mahout git commit: (nojira) add native-viennaCL
module to codebase. closes apache/mahout#241
Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/common.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/common.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/common.hpp
new file mode 100644
index 0000000..93b0cba
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/common.hpp
@@ -0,0 +1,263 @@
+#ifndef VIENNACL_LINALG_DETAIL_ILU_COMMON_HPP_
+#define VIENNACL_LINALG_DETAIL_ILU_COMMON_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/ilu/common.hpp
+ @brief Common routines used within ILU-type preconditioners
+*/
+
+#include <vector>
+#include <cmath>
+#include <iostream>
+#include <map>
+#include <list>
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/backend/memory.hpp"
+
+#include "viennacl/linalg/host_based/common.hpp"
+#include "viennacl/linalg/misc_operations.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+
+
+//
+// Level Scheduling Setup for ILU:
+//
+
+template<typename NumericT, unsigned int AlignmentV>
+void level_scheduling_setup_impl(viennacl::compressed_matrix<NumericT, AlignmentV> const & LU,
+ viennacl::vector<NumericT> const & diagonal_LU,
+ std::list<viennacl::backend::mem_handle> & row_index_arrays,
+ std::list<viennacl::backend::mem_handle> & row_buffers,
+ std::list<viennacl::backend::mem_handle> & col_buffers,
+ std::list<viennacl::backend::mem_handle> & element_buffers,
+ std::list<vcl_size_t> & row_elimination_num_list,
+ bool setup_U)
+{
+ NumericT const * diagonal_buf = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(diagonal_LU.handle());
+ NumericT const * elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(LU.handle());
+ unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LU.handle1());
+ unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LU.handle2());
+
+ //
+ // Step 1: Determine row elimination order for each row and build up meta information about the number of entries taking part in each elimination step:
+ //
+ std::vector<vcl_size_t> row_elimination(LU.size1());
+ std::map<vcl_size_t, std::map<vcl_size_t, vcl_size_t> > row_entries_per_elimination_step;
+
+ vcl_size_t max_elimination_runs = 0;
+ for (vcl_size_t row2 = 0; row2 < LU.size1(); ++row2)
+ {
+ vcl_size_t row = setup_U ? (LU.size1() - row2) - 1 : row2;
+
+ vcl_size_t row_begin = row_buffer[row];
+ vcl_size_t row_end = row_buffer[row+1];
+ vcl_size_t elimination_index = 0; //Note: first run corresponds to elimination_index = 1 (otherwise, type issues with int <-> unsigned int would arise)
+ for (vcl_size_t i = row_begin; i < row_end; ++i)
+ {
+ unsigned int col = col_buffer[i];
+ if ( (!setup_U && col < row) || (setup_U && col > row) )
+ {
+ elimination_index = std::max<vcl_size_t>(elimination_index, row_elimination[col]);
+ row_entries_per_elimination_step[row_elimination[col]][row] += 1;
+ }
+ }
+ row_elimination[row] = elimination_index + 1;
+ max_elimination_runs = std::max<vcl_size_t>(max_elimination_runs, elimination_index + 1);
+ }
+
+ //std::cout << "Number of elimination runs: " << max_elimination_runs << std::endl;
+
+ //
+ // Step 2: Build row-major elimination matrix for each elimination step
+ //
+
+ //std::cout << "Elimination order: " << std::endl;
+ //for (vcl_size_t i=0; i<row_elimination.size(); ++i)
+ // std::cout << row_elimination[i] << ", ";
+ //std::cout << std::endl;
+
+ //vcl_size_t summed_rows = 0;
+ for (vcl_size_t elimination_run = 1; elimination_run <= max_elimination_runs; ++elimination_run)
+ {
+ std::map<vcl_size_t, vcl_size_t> const & current_elimination_info = row_entries_per_elimination_step[elimination_run];
+
+ // count cols and entries handled in this elimination step
+ vcl_size_t num_tainted_cols = current_elimination_info.size();
+ vcl_size_t num_entries = 0;
+
+ for (std::map<vcl_size_t, vcl_size_t>::const_iterator it = current_elimination_info.begin();
+ it != current_elimination_info.end();
+ ++it)
+ num_entries += it->second;
+
+ //std::cout << "num_entries: " << num_entries << std::endl;
+ //std::cout << "num_tainted_cols: " << num_tainted_cols << std::endl;
+
+ if (num_tainted_cols > 0)
+ {
+ row_index_arrays.push_back(viennacl::backend::mem_handle());
+ viennacl::backend::switch_memory_context<unsigned int>(row_index_arrays.back(), viennacl::traits::context(LU));
+ viennacl::backend::typesafe_host_array<unsigned int> elim_row_index_array(row_index_arrays.back(), num_tainted_cols);
+
+ row_buffers.push_back(viennacl::backend::mem_handle());
+ viennacl::backend::switch_memory_context<unsigned int>(row_buffers.back(), viennacl::traits::context(LU));
+ viennacl::backend::typesafe_host_array<unsigned int> elim_row_buffer(row_buffers.back(), num_tainted_cols + 1);
+
+ col_buffers.push_back(viennacl::backend::mem_handle());
+ viennacl::backend::switch_memory_context<unsigned int>(col_buffers.back(), viennacl::traits::context(LU));
+ viennacl::backend::typesafe_host_array<unsigned int> elim_col_buffer(col_buffers.back(), num_entries);
+
+ element_buffers.push_back(viennacl::backend::mem_handle());
+ viennacl::backend::switch_memory_context<NumericT>(element_buffers.back(), viennacl::traits::context(LU));
+ std::vector<NumericT> elim_elements_buffer(num_entries);
+
+ row_elimination_num_list.push_back(num_tainted_cols);
+
+ vcl_size_t k=0;
+ vcl_size_t nnz_index = 0;
+ elim_row_buffer.set(0, 0);
+
+ for (std::map<vcl_size_t, vcl_size_t>::const_iterator it = current_elimination_info.begin();
+ it != current_elimination_info.end();
+ ++it)
+ {
+ //vcl_size_t col = setup_U ? (elimination_matrix.size() - it->first) - 1 : col2;
+ vcl_size_t row = it->first;
+ elim_row_index_array.set(k, row);
+
+ vcl_size_t row_begin = row_buffer[row];
+ vcl_size_t row_end = row_buffer[row+1];
+ for (vcl_size_t i = row_begin; i < row_end; ++i)
+ {
+ unsigned int col = col_buffer[i];
+ if ( (!setup_U && col < row) || (setup_U && col > row) ) //entry of L/U
+ {
+ if (row_elimination[col] == elimination_run) // this entry is substituted in this run
+ {
+ elim_col_buffer.set(nnz_index, col);
+ elim_elements_buffer[nnz_index] = setup_U ? elements[i] / diagonal_buf[it->first] : elements[i];
+ ++nnz_index;
+ }
+ }
+ }
+
+ elim_row_buffer.set(++k, nnz_index);
+ }
+
+ //
+ // Wrap in memory_handles:
+ //
+ viennacl::backend::memory_create(row_index_arrays.back(), elim_row_index_array.raw_size(), viennacl::traits::context(row_index_arrays.back()), elim_row_index_array.get());
+ viennacl::backend::memory_create(row_buffers.back(), elim_row_buffer.raw_size(), viennacl::traits::context(row_buffers.back()), elim_row_buffer.get());
+ viennacl::backend::memory_create(col_buffers.back(), elim_col_buffer.raw_size(), viennacl::traits::context(col_buffers.back()), elim_col_buffer.get());
+ viennacl::backend::memory_create(element_buffers.back(), sizeof(NumericT) * elim_elements_buffer.size(), viennacl::traits::context(element_buffers.back()), &(elim_elements_buffer[0]));
+ }
+
+ // Print some info:
+ //std::cout << "Eliminated columns in run " << elimination_run << ": " << num_tainted_cols << " (tainted columns: " << num_tainted_cols << ")" << std::endl;
+ //summed_rows += eliminated_rows_in_run;
+ //if (eliminated_rows_in_run == 0)
+ // break;
+ }
+ //std::cout << "Eliminated rows: " << summed_rows << " out of " << row_elimination.size() << std::endl;
+}
+
+
+template<typename NumericT, unsigned int AlignmentV>
+void level_scheduling_setup_L(viennacl::compressed_matrix<NumericT, AlignmentV> const & LU,
+ viennacl::vector<NumericT> const & diagonal_LU,
+ std::list<viennacl::backend::mem_handle> & row_index_arrays,
+ std::list<viennacl::backend::mem_handle> & row_buffers,
+ std::list<viennacl::backend::mem_handle> & col_buffers,
+ std::list<viennacl::backend::mem_handle> & element_buffers,
+ std::list<vcl_size_t> & row_elimination_num_list)
+{
+ level_scheduling_setup_impl(LU, diagonal_LU, row_index_arrays, row_buffers, col_buffers, element_buffers, row_elimination_num_list, false);
+}
+
+
+//
+// Multifrontal setup of U:
+//
+
+template<typename NumericT, unsigned int AlignmentV>
+void level_scheduling_setup_U(viennacl::compressed_matrix<NumericT, AlignmentV> const & LU,
+ viennacl::vector<NumericT> const & diagonal_LU,
+ std::list<viennacl::backend::mem_handle> & row_index_arrays,
+ std::list<viennacl::backend::mem_handle> & row_buffers,
+ std::list<viennacl::backend::mem_handle> & col_buffers,
+ std::list<viennacl::backend::mem_handle> & element_buffers,
+ std::list<vcl_size_t> & row_elimination_num_list)
+{
+ level_scheduling_setup_impl(LU, diagonal_LU, row_index_arrays, row_buffers, col_buffers, element_buffers, row_elimination_num_list, true);
+}
+
+
+//
+// Multifrontal substitution (both L and U). Will partly be moved to single_threaded/opencl/cuda implementations
+//
+template<typename NumericT>
+void level_scheduling_substitute(viennacl::vector<NumericT> & vec,
+ std::list<viennacl::backend::mem_handle> const & row_index_arrays,
+ std::list<viennacl::backend::mem_handle> const & row_buffers,
+ std::list<viennacl::backend::mem_handle> const & col_buffers,
+ std::list<viennacl::backend::mem_handle> const & element_buffers,
+ std::list<vcl_size_t> const & row_elimination_num_list)
+{
+ typedef typename std::list< viennacl::backend::mem_handle >::const_iterator ListIterator;
+ ListIterator row_index_array_it = row_index_arrays.begin();
+ ListIterator row_buffers_it = row_buffers.begin();
+ ListIterator col_buffers_it = col_buffers.begin();
+ ListIterator element_buffers_it = element_buffers.begin();
+ typename std::list< vcl_size_t>::const_iterator row_elimination_num_it = row_elimination_num_list.begin();
+ for (vcl_size_t i=0; i<row_index_arrays.size(); ++i)
+ {
+ viennacl::linalg::detail::level_scheduling_substitute(vec, *row_index_array_it, *row_buffers_it, *col_buffers_it, *element_buffers_it, *row_elimination_num_it);
+
+ ++row_index_array_it;
+ ++row_buffers_it;
+ ++col_buffers_it;
+ ++element_buffers_it;
+ ++row_elimination_num_it;
+ }
+}
+
+
+
+
+
+} // namespace detail
+} // namespace linalg
+} // namespace viennacl
+
+
+
+
+#endif
+
+
+
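To make Step 1 of the setup above concrete, here is a small stand-alone sketch of the level computation for the lower-triangular case: each row receives level 1 + max(level of the rows it depends on), and all rows sharing a level can be substituted in parallel. The function below operates on plain CSR arrays and is illustrative only:

    #include <vector>
    #include <algorithm>
    #include <cstddef>

    std::vector<std::size_t> compute_levels(std::vector<unsigned int> const & row_buffer,
                                            std::vector<unsigned int> const & col_buffer,
                                            std::size_t n)
    {
      std::vector<std::size_t> level(n, 0);
      for (std::size_t row = 0; row < n; ++row)
      {
        std::size_t max_dep = 0;
        for (unsigned int i = row_buffer[row]; i < row_buffer[row + 1]; ++i)
          if (col_buffer[i] < row)                            // strictly lower part only
            max_dep = std::max<std::size_t>(max_dep, level[col_buffer[i]]);
        level[row] = max_dep + 1;                             // rows with equal level are independent
      }
      return level;
    }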
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/ilu0.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/ilu0.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/ilu0.hpp
new file mode 100644
index 0000000..1c3191a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/ilu0.hpp
@@ -0,0 +1,379 @@
+#ifndef VIENNACL_LINALG_DETAIL_ILU0_HPP_
+#define VIENNACL_LINALG_DETAIL_ILU0_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/ilu/ilu0.hpp
+ @brief Implementations of incomplete factorization preconditioners with static nonzero pattern.
+
+ Contributed by Evan Bollig.
+
+ ILU0 (Incomplete LU with zero fill-in)
+ - All preconditioner nonzeros exist at locations that were nonzero in the input matrix.
+ - The number of nonzeros in the output preconditioner is exactly the same as in the input matrix.
+
+ Evan Bollig 3/30/12
+
+ Adapted from viennacl/linalg/detail/ilut.hpp
+
+ Low-level reimplementation by Karl Rupp in Nov 2012, increasing performance substantially. Also added level-scheduling.
+
+*/
+
+#include <vector>
+#include <cmath>
+#include <iostream>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/detail/ilu/common.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/backend/memory.hpp"
+
+#include "viennacl/linalg/host_based/common.hpp"
+
+#include <map>
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief A tag for incomplete LU factorization with static pattern (ILU0)
+*/
+class ilu0_tag
+{
+public:
+ ilu0_tag(bool with_level_scheduling = false) : use_level_scheduling_(with_level_scheduling) {}
+
+ bool use_level_scheduling() const { return use_level_scheduling_; }
+ void use_level_scheduling(bool b) { use_level_scheduling_ = b; }
+
+private:
+ bool use_level_scheduling_;
+};
+
+
+/** @brief Implementation of a ILU-preconditioner with static pattern. Optimized version for CSR matrices.
+ *
+ * Refer to the algorithm in Saad's book (1996 edition).
+ *
+ * @param A The sparse system matrix. The result is written directly to A.
+ */
+template<typename NumericT>
+void precondition(viennacl::compressed_matrix<NumericT> & A, ilu0_tag const & /* tag */)
+{
+ assert( (A.handle1().get_active_handle_id() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILU0") );
+ assert( (A.handle2().get_active_handle_id() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILU0") );
+ assert( (A.handle().get_active_handle_id() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILU0") );
+
+ NumericT * elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
+ unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+ unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+
+ // Note: Line numbers in the following refer to the algorithm in Saad's book
+
+ for (vcl_size_t i=1; i<A.size1(); ++i) // Line 1
+ {
+ unsigned int row_i_begin = row_buffer[i];
+ unsigned int row_i_end = row_buffer[i+1];
+ for (unsigned int buf_index_k = row_i_begin; buf_index_k < row_i_end; ++buf_index_k) //Note: We do not assume that the column indices within a row are sorted
+ {
+ unsigned int k = col_buffer[buf_index_k];
+ if (k >= i)
+ continue; //Note: We do not assume that the column indices within a row are sorted
+
+ unsigned int row_k_begin = row_buffer[k];
+ unsigned int row_k_end = row_buffer[k+1];
+
+ // get a_kk:
+ NumericT a_kk = 0;
+ for (unsigned int buf_index_akk = row_k_begin; buf_index_akk < row_k_end; ++buf_index_akk)
+ {
+ if (col_buffer[buf_index_akk] == k)
+ {
+ a_kk = elements[buf_index_akk];
+ break;
+ }
+ }
+
+ NumericT & a_ik = elements[buf_index_k];
+ a_ik /= a_kk; //Line 3
+
+ for (unsigned int buf_index_j = row_i_begin; buf_index_j < row_i_end; ++buf_index_j) //Note: We do not assume that the column indices within a row are sorted
+ {
+ unsigned int j = col_buffer[buf_index_j];
+ if (j <= k)
+ continue;
+
+ // determine a_kj:
+ NumericT a_kj = 0;
+ for (unsigned int buf_index_akj = row_k_begin; buf_index_akj < row_k_end; ++buf_index_akj)
+ {
+ if (col_buffer[buf_index_akj] == j)
+ {
+ a_kj = elements[buf_index_akj];
+ break;
+ }
+ }
+
+ //a_ij -= a_ik * a_kj
+ elements[buf_index_j] -= a_ik * a_kj; //Line 5
+ }
+ }
+ }
+
+}
+
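As a rough usage sketch of this in-place factorization (the 3x3 values and names are made up for the example; as asserted above, the matrix must reside in main memory):

    #include <map>
    #include <vector>
    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/linalg/detail/ilu/ilu0.hpp"

    void ilu0_factorize_sketch()
    {
      // Assemble a small sparse matrix on the host ...
      std::vector<std::map<unsigned int, double> > cpu_A(3);
      cpu_A[0][0] =  4.0; cpu_A[0][1] = -1.0;
      cpu_A[1][0] = -1.0; cpu_A[1][1] =  4.0; cpu_A[1][2] = -1.0;
      cpu_A[2][1] = -1.0; cpu_A[2][2] =  4.0;

      // ... copy it into a compressed_matrix residing in main memory ...
      viennacl::context host_ctx(viennacl::MAIN_MEMORY);
      viennacl::compressed_matrix<double> A(3, 3, host_ctx);
      viennacl::copy(cpu_A, A);

      // ... and factor in place: afterwards A holds the unit-lower factor L below the
      // diagonal and U on/above the diagonal, on the original sparsity pattern.
      viennacl::linalg::precondition(A, viennacl::linalg::ilu0_tag());
    }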
+
+/** @brief ILU0 preconditioner class, can be supplied to solve()-routines
+*/
+template<typename MatrixT>
+class ilu0_precond
+{
+ typedef typename MatrixT::value_type NumericType;
+
+public:
+ ilu0_precond(MatrixT const & mat, ilu0_tag const & tag) : tag_(tag), LU_()
+ {
+ //initialize preconditioner:
+ //std::cout << "Start CPU precond" << std::endl;
+ init(mat);
+ //std::cout << "End CPU precond" << std::endl;
+ }
+
+ template<typename VectorT>
+ void apply(VectorT & vec) const
+ {
+ unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LU_.handle1());
+ unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LU_.handle2());
+ NumericType const * elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericType>(LU_.handle());
+
+ viennacl::linalg::host_based::detail::csr_inplace_solve<NumericType>(row_buffer, col_buffer, elements, vec, LU_.size2(), unit_lower_tag());
+ viennacl::linalg::host_based::detail::csr_inplace_solve<NumericType>(row_buffer, col_buffer, elements, vec, LU_.size2(), upper_tag());
+ }
+
+private:
+ void init(MatrixT const & mat)
+ {
+ viennacl::context host_context(viennacl::MAIN_MEMORY);
+ viennacl::switch_memory_context(LU_, host_context);
+
+ viennacl::copy(mat, LU_);
+ viennacl::linalg::precondition(LU_, tag_);
+ }
+
+ ilu0_tag tag_;
+ viennacl::compressed_matrix<NumericType> LU_;
+};
+
+
+/** @brief ILU0 preconditioner class, can be supplied to solve()-routines.
+*
+* Specialization for compressed_matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+class ilu0_precond< viennacl::compressed_matrix<NumericT, AlignmentV> >
+{
+ typedef viennacl::compressed_matrix<NumericT, AlignmentV> MatrixType;
+
+public:
+ ilu0_precond(MatrixType const & mat, ilu0_tag const & tag)
+ : tag_(tag),
+ LU_(mat.size1(), mat.size2(), viennacl::traits::context(mat))
+ {
+ //initialize preconditioner:
+ //std::cout << "Start GPU precond" << std::endl;
+ init(mat);
+ //std::cout << "End GPU precond" << std::endl;
+ }
+
+ void apply(viennacl::vector<NumericT> & vec) const
+ {
+ viennacl::context host_context(viennacl::MAIN_MEMORY);
+ if (vec.handle().get_active_handle_id() != viennacl::MAIN_MEMORY)
+ {
+ if (tag_.use_level_scheduling())
+ {
+ //std::cout << "Using multifrontal on GPU..." << std::endl;
+ detail::level_scheduling_substitute(vec,
+ multifrontal_L_row_index_arrays_,
+ multifrontal_L_row_buffers_,
+ multifrontal_L_col_buffers_,
+ multifrontal_L_element_buffers_,
+ multifrontal_L_row_elimination_num_list_);
+
+ vec = viennacl::linalg::element_div(vec, multifrontal_U_diagonal_);
+
+ detail::level_scheduling_substitute(vec,
+ multifrontal_U_row_index_arrays_,
+ multifrontal_U_row_buffers_,
+ multifrontal_U_col_buffers_,
+ multifrontal_U_element_buffers_,
+ multifrontal_U_row_elimination_num_list_);
+ }
+ else
+ {
+ viennacl::context old_context = viennacl::traits::context(vec);
+ viennacl::switch_memory_context(vec, host_context);
+ viennacl::linalg::inplace_solve(LU_, vec, unit_lower_tag());
+ viennacl::linalg::inplace_solve(LU_, vec, upper_tag());
+ viennacl::switch_memory_context(vec, old_context);
+ }
+ }
+ else //apply ILU0 directly on CPU
+ {
+ if (tag_.use_level_scheduling())
+ {
+ //std::cout << "Using multifrontal..." << std::endl;
+ detail::level_scheduling_substitute(vec,
+ multifrontal_L_row_index_arrays_,
+ multifrontal_L_row_buffers_,
+ multifrontal_L_col_buffers_,
+ multifrontal_L_element_buffers_,
+ multifrontal_L_row_elimination_num_list_);
+
+ vec = viennacl::linalg::element_div(vec, multifrontal_U_diagonal_);
+
+ detail::level_scheduling_substitute(vec,
+ multifrontal_U_row_index_arrays_,
+ multifrontal_U_row_buffers_,
+ multifrontal_U_col_buffers_,
+ multifrontal_U_element_buffers_,
+ multifrontal_U_row_elimination_num_list_);
+ }
+ else
+ {
+ viennacl::linalg::inplace_solve(LU_, vec, unit_lower_tag());
+ viennacl::linalg::inplace_solve(LU_, vec, upper_tag());
+ }
+ }
+ }
+
+ vcl_size_t levels() const { return multifrontal_L_row_index_arrays_.size(); }
+
+private:
+ void init(MatrixType const & mat)
+ {
+ viennacl::context host_context(viennacl::MAIN_MEMORY);
+ viennacl::switch_memory_context(LU_, host_context);
+ LU_ = mat;
+ viennacl::linalg::precondition(LU_, tag_);
+
+ if (!tag_.use_level_scheduling())
+ return;
+
+ // multifrontal part:
+ viennacl::switch_memory_context(multifrontal_U_diagonal_, host_context);
+ multifrontal_U_diagonal_.resize(LU_.size1(), false);
+ host_based::detail::row_info(LU_, multifrontal_U_diagonal_, viennacl::linalg::detail::SPARSE_ROW_DIAGONAL);
+
+ detail::level_scheduling_setup_L(LU_,
+ multifrontal_U_diagonal_, //dummy
+ multifrontal_L_row_index_arrays_,
+ multifrontal_L_row_buffers_,
+ multifrontal_L_col_buffers_,
+ multifrontal_L_element_buffers_,
+ multifrontal_L_row_elimination_num_list_);
+
+
+ detail::level_scheduling_setup_U(LU_,
+ multifrontal_U_diagonal_,
+ multifrontal_U_row_index_arrays_,
+ multifrontal_U_row_buffers_,
+ multifrontal_U_col_buffers_,
+ multifrontal_U_element_buffers_,
+ multifrontal_U_row_elimination_num_list_);
+
+ //
+ // Bring to device if necessary:
+ //
+
+ // L:
+ for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_L_row_index_arrays_.begin();
+ it != multifrontal_L_row_index_arrays_.end();
+ ++it)
+ viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+ for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_L_row_buffers_.begin();
+ it != multifrontal_L_row_buffers_.end();
+ ++it)
+ viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+ for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_L_col_buffers_.begin();
+ it != multifrontal_L_col_buffers_.end();
+ ++it)
+ viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+ for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_L_element_buffers_.begin();
+ it != multifrontal_L_element_buffers_.end();
+ ++it)
+ viennacl::backend::switch_memory_context<NumericT>(*it, viennacl::traits::context(mat));
+
+
+ // U:
+
+ viennacl::switch_memory_context(multifrontal_U_diagonal_, viennacl::traits::context(mat));
+
+ for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_U_row_index_arrays_.begin();
+ it != multifrontal_U_row_index_arrays_.end();
+ ++it)
+ viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+ for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_U_row_buffers_.begin();
+ it != multifrontal_U_row_buffers_.end();
+ ++it)
+ viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+ for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_U_col_buffers_.begin();
+ it != multifrontal_U_col_buffers_.end();
+ ++it)
+ viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+ for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_U_element_buffers_.begin();
+ it != multifrontal_U_element_buffers_.end();
+ ++it)
+ viennacl::backend::switch_memory_context<NumericT>(*it, viennacl::traits::context(mat));
+
+ }
+
+ ilu0_tag tag_;
+ viennacl::compressed_matrix<NumericT> LU_;
+
+ std::list<viennacl::backend::mem_handle> multifrontal_L_row_index_arrays_;
+ std::list<viennacl::backend::mem_handle> multifrontal_L_row_buffers_;
+ std::list<viennacl::backend::mem_handle> multifrontal_L_col_buffers_;
+ std::list<viennacl::backend::mem_handle> multifrontal_L_element_buffers_;
+ std::list<vcl_size_t> multifrontal_L_row_elimination_num_list_;
+
+ viennacl::vector<NumericT> multifrontal_U_diagonal_;
+ std::list<viennacl::backend::mem_handle> multifrontal_U_row_index_arrays_;
+ std::list<viennacl::backend::mem_handle> multifrontal_U_row_buffers_;
+ std::list<viennacl::backend::mem_handle> multifrontal_U_col_buffers_;
+ std::list<viennacl::backend::mem_handle> multifrontal_U_element_buffers_;
+ std::list<vcl_size_t> multifrontal_U_row_elimination_num_list_;
+
+};
+
+} // namespace linalg
+} // namespace viennacl
+
+
+#endif
+
+
+
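A minimal end-to-end sketch of using this preconditioner with one of the iterative solvers (assuming the usual viennacl::linalg::solve()/gmres_tag() interface from the rest of the library):

    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/ilu.hpp"      // convenience header pulling in ilu0_precond / ilu0_tag
    #include "viennacl/linalg/gmres.hpp"

    viennacl::vector<double> solve_with_ilu0(viennacl::compressed_matrix<double> const & A,
                                             viennacl::vector<double>           const & rhs)
    {
      // Enable level scheduling so the triangular substitutions can run on the device:
      viennacl::linalg::ilu0_tag tag(true);
      viennacl::linalg::ilu0_precond< viennacl::compressed_matrix<double> > precond(A, tag);

      return viennacl::linalg::solve(A, rhs, viennacl::linalg::gmres_tag(), precond);
    }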
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/ilut.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/ilut.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/ilut.hpp
new file mode 100644
index 0000000..11ab842
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/ilut.hpp
@@ -0,0 +1,597 @@
+#ifndef VIENNACL_LINALG_DETAIL_ILUT_HPP_
+#define VIENNACL_LINALG_DETAIL_ILUT_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/ilu/ilut.hpp
+ @brief Implementations of an incomplete factorization preconditioner with threshold (ILUT)
+*/
+
+#include <vector>
+#include <cmath>
+#include <iostream>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+
+#include "viennacl/linalg/detail/ilu/common.hpp"
+#include "viennacl/compressed_matrix.hpp"
+
+#include "viennacl/linalg/host_based/common.hpp"
+
+#include <map>
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief A tag for incomplete LU factorization with threshold (ILUT)
+*/
+class ilut_tag
+{
+ public:
+ /** @brief The constructor.
+ *
+ * @param entries_per_row Number of nonzero entries kept per row in each of L and U (the diagonal of U is kept in addition).
+ * @param drop_tolerance The drop tolerance for ILUT
+ * @param with_level_scheduling Flag for enabling level scheduling on GPUs.
+ */
+ ilut_tag(unsigned int entries_per_row = 20,
+ double drop_tolerance = 1e-4,
+ bool with_level_scheduling = false)
+ : entries_per_row_(entries_per_row),
+ drop_tolerance_(drop_tolerance),
+ use_level_scheduling_(with_level_scheduling) {}
+
+ void set_drop_tolerance(double tol)
+ {
+ if (tol > 0)
+ drop_tolerance_ = tol;
+ }
+ double get_drop_tolerance() const { return drop_tolerance_; }
+
+ void set_entries_per_row(unsigned int e)
+ {
+ if (e > 0)
+ entries_per_row_ = e;
+ }
+
+ unsigned int get_entries_per_row() const { return entries_per_row_; }
+
+ bool use_level_scheduling() const { return use_level_scheduling_; }
+ void use_level_scheduling(bool b) { use_level_scheduling_ = b; }
+
+ private:
+ unsigned int entries_per_row_;
+ double drop_tolerance_;
+ bool use_level_scheduling_;
+};
+
+
+namespace detail
+{
+ /** @brief Helper struct for holding a sparse vector in linear memory. For internal use only.
+ *
+ * Unfortunately, the 'naive' implementation using a std::map<> is almost always too slow.
+ *
+ */
+ template<typename NumericT>
+ struct ilut_sparse_vector
+ {
+ ilut_sparse_vector(vcl_size_t alloc_size = 0) : size_(0), col_indices_(alloc_size), elements_(alloc_size) {}
+
+ void resize_if_bigger(vcl_size_t s)
+ {
+ if (s > elements_.size())
+ {
+ col_indices_.resize(s);
+ elements_.resize(s);
+ }
+ size_ = s;
+ }
+
+ vcl_size_t size_;
+ std::vector<unsigned int> col_indices_;
+ std::vector<NumericT> elements_;
+ };
+
+ /** @brief Subtracts a scaled sparse vector u from a sparse vector w and writes the output to z: z = w - alpha * u
+ *
+ * The sparsity patterns of u and w are usually different.
+ *
+ * @return Length of the new vector z
+ */
+ template<typename IndexT, typename NumericT>
+ IndexT merge_subtract_sparse_rows(IndexT const * w_coords, NumericT const * w_elements, IndexT w_size,
+ IndexT const * u_coords, NumericT const * u_elements, IndexT u_size, NumericT alpha,
+ IndexT * z_coords, NumericT * z_elements)
+ {
+ IndexT index_w = 0;
+ IndexT index_u = 0;
+ IndexT index_z = 0;
+
+ while (1)
+ {
+ if (index_w < w_size && index_u < u_size)
+ {
+ if (w_coords[index_w] < u_coords[index_u])
+ {
+ z_coords[index_z] = w_coords[index_w];
+ z_elements[index_z++] = w_elements[index_w++];
+ }
+ else if (w_coords[index_w] == u_coords[index_u])
+ {
+ z_coords[index_z] = w_coords[index_w];
+ z_elements[index_z++] = w_elements[index_w++] - alpha * u_elements[index_u++];
+ }
+ else
+ {
+ z_coords[index_z] = u_coords[index_u];
+ z_elements[index_z++] = - alpha * u_elements[index_u++];
+ }
+ }
+ else if (index_w == w_size && index_u < u_size)
+ {
+ z_coords[index_z] = u_coords[index_u];
+ z_elements[index_z++] = - alpha * u_elements[index_u++];
+ }
+ else if (index_w < w_size && index_u == u_size)
+ {
+ z_coords[index_z] = w_coords[index_w];
+ z_elements[index_z++] = w_elements[index_w++];
+ }
+ else
+ return index_z;
+ }
+ }
+
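A small worked example of the merge above (an illustrative fragment, not part of the file): w = {0: 1.0, 2: 3.0}, u = {1: 2.0, 2: 1.0} and alpha = 0.5 yield z = w - 0.5*u = {0: 1.0, 1: -1.0, 2: 2.5} with a returned length of 3:

    void merge_subtract_example()
    {
      unsigned int w_coords[] = {0, 2};   double w_vals[] = {1.0, 3.0};
      unsigned int u_coords[] = {1, 2};   double u_vals[] = {2.0, 1.0};
      unsigned int z_coords[3];           double z_vals[3];

      unsigned int len = viennacl::linalg::detail::merge_subtract_sparse_rows(
                             w_coords, w_vals, 2u,
                             u_coords, u_vals, 2u, 0.5,
                             z_coords, z_vals);
      // len == 3, z_coords == {0, 1, 2}, z_vals == {1.0, -1.0, 2.5}
    }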
+ template<typename SizeT, typename NumericT>
+ void insert_with_value_sort(std::vector<std::pair<SizeT, NumericT> > & map,
+ SizeT index, NumericT value)
+ {
+ NumericT abs_value = std::fabs(value);
+ if (abs_value > 0)
+ {
+ // find first element with smaller absolute value:
+ std::size_t first_smaller_index = 0;
+ while (first_smaller_index < map.size() && std::fabs(map[first_smaller_index].second) > abs_value)
+ ++first_smaller_index;
+
+ std::pair<SizeT, NumericT> tmp(index, value);
+ for (std::size_t j=first_smaller_index; j<map.size(); ++j)
+ std::swap(map[j], tmp);
+ }
+ }
+
+}
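Similarly, insert_with_value_sort treats the vector as a fixed-size "largest p entries by magnitude" buffer, kept sorted by descending |value|. An illustrative fragment:

    void value_sort_example()
    {
      std::vector<std::pair<unsigned int, double> > top2(2, std::pair<unsigned int, double>(0u, 0.0));

      viennacl::linalg::detail::insert_with_value_sort(top2, 5u, -0.3);
      viennacl::linalg::detail::insert_with_value_sort(top2, 7u,  1.5);
      viennacl::linalg::detail::insert_with_value_sort(top2, 2u,  0.8);

      // top2 now holds {7: 1.5, 2: 0.8}; the smaller-magnitude entry (5: -0.3) was pushed out.
    }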
+
+/** @brief Implementation of a ILU-preconditioner with threshold. Optimized implementation for compressed_matrix.
+*
+* Refer to Algorithm 10.6 in Saad's book (1996 edition).
+*
+* @param A The input matrix (a compressed_matrix residing in host memory)
+* @param L The output matrix for L.
+* @param U The output matrix for U.
+* @param tag An ilut_tag in order to dispatch among several other preconditioners.
+*/
+template<typename NumericT>
+void precondition(viennacl::compressed_matrix<NumericT> const & A,
+ viennacl::compressed_matrix<NumericT> & L,
+ viennacl::compressed_matrix<NumericT> & U,
+ ilut_tag const & tag)
+{
+ assert(A.size1() == L.size1() && bool("Output matrix size mismatch") );
+ assert(A.size1() == U.size1() && bool("Output matrix size mismatch") );
+
+ L.reserve( tag.get_entries_per_row() * A.size1());
+ U.reserve((tag.get_entries_per_row() + 1) * A.size1());
+
+ vcl_size_t avg_nnz_per_row = static_cast<vcl_size_t>(A.nnz() / A.size1());
+ detail::ilut_sparse_vector<NumericT> w1(tag.get_entries_per_row() * (avg_nnz_per_row + 10));
+ detail::ilut_sparse_vector<NumericT> w2(tag.get_entries_per_row() * (avg_nnz_per_row + 10));
+ detail::ilut_sparse_vector<NumericT> * w_in = &w1;
+ detail::ilut_sparse_vector<NumericT> * w_out = &w2;
+ std::vector<NumericT> diagonal_U(A.size1());
+
+ NumericT const * elements_A = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
+ unsigned int const * row_buffer_A = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+ unsigned int const * col_buffer_A = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+
+ NumericT * elements_L = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(L.handle());
+ unsigned int * row_buffer_L = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L.handle1()); row_buffer_L[0] = 0;
+ unsigned int * col_buffer_L = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L.handle2());
+
+ NumericT * elements_U = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(U.handle());
+ unsigned int * row_buffer_U = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U.handle1()); row_buffer_U[0] = 0;
+ unsigned int * col_buffer_U = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U.handle2());
+
+ std::vector<std::pair<unsigned int, NumericT> > sorted_entries_L(tag.get_entries_per_row());
+ std::vector<std::pair<unsigned int, NumericT> > sorted_entries_U(tag.get_entries_per_row());
+
+ for (vcl_size_t i=0; i<viennacl::traits::size1(A); ++i) // Line 1
+ {
+ std::fill(sorted_entries_L.begin(), sorted_entries_L.end(), std::pair<unsigned int, NumericT>(0, NumericT(0)));
+ std::fill(sorted_entries_U.begin(), sorted_entries_U.end(), std::pair<unsigned int, NumericT>(0, NumericT(0)));
+
+ //line 2: set up w
+ w_in->resize_if_bigger(row_buffer_A[i+1] - row_buffer_A[i]);
+ NumericT row_norm = 0;
+ unsigned int k = 0;
+ for (unsigned int j = row_buffer_A[i]; j < row_buffer_A[i+1]; ++j, ++k)
+ {
+ w_in->col_indices_[k] = col_buffer_A[j];
+ NumericT entry = elements_A[j];
+ w_in->elements_[k] = entry;
+ row_norm += entry * entry;
+ }
+ row_norm = std::sqrt(row_norm);
+ NumericT tau_i = static_cast<NumericT>(tag.get_drop_tolerance()) * row_norm;
+
+ //line 3: Iterate over lower diagonal parts of A:
+ k = 0;
+ unsigned int current_col = (row_buffer_A[i+1] > row_buffer_A[i]) ? w_in->col_indices_[k] : static_cast<unsigned int>(i); // mind empty rows here!
+ while (current_col < i)
+ {
+ //line 4:
+ NumericT a_kk = diagonal_U[current_col];
+
+ NumericT w_k_entry = w_in->elements_[k] / a_kk;
+ w_in->elements_[k] = w_k_entry;
+
+ //lines 5,6: (dropping rule to w_k)
+ if ( std::fabs(w_k_entry) > tau_i)
+ {
+ //line 7:
+ unsigned int row_U_begin = row_buffer_U[current_col];
+ unsigned int row_U_end = row_buffer_U[current_col + 1];
+
+ if (row_U_end > row_U_begin)
+ {
+ w_out->resize_if_bigger(w_in->size_ + (row_U_end - row_U_begin) - 1);
+ w_out->size_ = detail::merge_subtract_sparse_rows(&(w_in->col_indices_[0]), &(w_in->elements_[0]), static_cast<unsigned int>(w_in->size_),
+ col_buffer_U + row_U_begin + 1, elements_U + row_U_begin + 1, (row_U_end - row_U_begin) - 1, w_k_entry,
+ &(w_out->col_indices_[0]), &(w_out->elements_[0])
+ );
+ ++k;
+ }
+ }
+ else // drop element
+ {
+ w_out->resize_if_bigger(w_in->size_ - 1);
+ for (unsigned int r = 0; r < k; ++r)
+ {
+ w_out->col_indices_[r] = w_in->col_indices_[r];
+ w_out->elements_[r] = w_in->elements_[r];
+ }
+ for (unsigned int r = k+1; r < w_in->size_; ++r)
+ {
+ w_out->col_indices_[r-1] = w_in->col_indices_[r];
+ w_out->elements_[r-1] = w_in->elements_[r];
+ }
+
+ // Note: No increment to k here, element was dropped!
+ }
+
+ // swap pointers to w1 and w2
+ std::swap(w_in, w_out);
+
+ // process next entry:
+ current_col = (k < w_in->size_) ? w_in->col_indices_[k] : static_cast<unsigned int>(i);
+ } // while()
+
+ // Line 10: Apply a dropping rule to w
+ // To do so, we write values to a temporary array
+ for (unsigned int r = 0; r < w_in->size_; ++r)
+ {
+ unsigned int col = w_in->col_indices_[r];
+ NumericT value = w_in->elements_[r];
+
+ if (col < i) // entry for L:
+ detail::insert_with_value_sort(sorted_entries_L, col, value);
+ else if (col == i) // do not drop diagonal element
+ {
+ diagonal_U[i] = value;
+ if (value <= 0 && value >= 0) // exact zero test, written this way to avoid floating-point equality warnings
+ {
+ std::cerr << "ViennaCL: FATAL ERROR in ILUT(): Diagonal entry computed to zero (" << value << ") in row " << i << "!" << std::endl;
+ throw zero_on_diagonal_exception("ILUT zero diagonal!");
+ }
+ }
+ else // entry for U:
+ detail::insert_with_value_sort(sorted_entries_U, col, value);
+ }
+
+ //Lines 10-12: Apply a dropping rule to w, write the largest p values to L and U
+ unsigned int offset_L = row_buffer_L[i];
+ std::sort(sorted_entries_L.begin(), sorted_entries_L.end());
+ for (unsigned int j=0; j<tag.get_entries_per_row(); ++j)
+ if (std::fabs(sorted_entries_L[j].second) > 0)
+ {
+ col_buffer_L[offset_L] = sorted_entries_L[j].first;
+ elements_L[offset_L] = sorted_entries_L[j].second;
+ ++offset_L;
+ }
+ row_buffer_L[i+1] = offset_L;
+
+ unsigned int offset_U = row_buffer_U[i];
+ col_buffer_U[offset_U] = static_cast<unsigned int>(i);
+ elements_U[offset_U] = diagonal_U[i];
+ ++offset_U;
+ std::sort(sorted_entries_U.begin(), sorted_entries_U.end());
+ for (unsigned int j=0; j<tag.get_entries_per_row(); ++j)
+ if (std::fabs(sorted_entries_U[j].second) > 0)
+ {
+ col_buffer_U[offset_U] = sorted_entries_U[j].first;
+ elements_U[offset_U] = sorted_entries_U[j].second;
+ ++offset_U;
+ }
+ row_buffer_U[i+1] = offset_U;
+
+ } //for i
+}
+
+
+/** @brief ILUT preconditioner class, can be supplied to solve()-routines
+*/
+template<typename MatrixT>
+class ilut_precond
+{
+ typedef typename MatrixT::value_type NumericType;
+
+public:
+ ilut_precond(MatrixT const & mat, ilut_tag const & tag) : tag_(tag), L_(mat.size1(), mat.size2()), U_(mat.size1(), mat.size2())
+ {
+ //initialize preconditioner:
+ //std::cout << "Start CPU precond" << std::endl;
+ init(mat);
+ //std::cout << "End CPU precond" << std::endl;
+ }
+
+ template<typename VectorT>
+ void apply(VectorT & vec) const
+ {
+ //Note: Since vec can be a rather arbitrary vector type, we call the more generic version in the backend manually:
+ {
+ unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L_.handle1());
+ unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L_.handle2());
+ NumericType const * elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericType>(L_.handle());
+
+ viennacl::linalg::host_based::detail::csr_inplace_solve<NumericType>(row_buffer, col_buffer, elements, vec, L_.size2(), unit_lower_tag());
+ }
+ {
+ unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U_.handle1());
+ unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U_.handle2());
+ NumericType const * elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericType>(U_.handle());
+
+ viennacl::linalg::host_based::detail::csr_inplace_solve<NumericType>(row_buffer, col_buffer, elements, vec, U_.size2(), upper_tag());
+ }
+ }
+
+private:
+ void init(MatrixT const & mat)
+ {
+ viennacl::context host_context(viennacl::MAIN_MEMORY);
+ viennacl::compressed_matrix<NumericType> temp;
+ viennacl::switch_memory_context(temp, host_context);
+ viennacl::switch_memory_context(L_, host_context);
+ viennacl::switch_memory_context(U_, host_context);
+
+ viennacl::copy(mat, temp);
+
+ viennacl::linalg::precondition(temp, L_, U_, tag_);
+ }
+
+ ilut_tag tag_;
+ viennacl::compressed_matrix<NumericType> L_;
+ viennacl::compressed_matrix<NumericType> U_;
+};
+
+
+/** @brief ILUT preconditioner class, can be supplied to solve()-routines.
+*
+* Specialization for compressed_matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+class ilut_precond< viennacl::compressed_matrix<NumericT, AlignmentV> >
+{
+typedef viennacl::compressed_matrix<NumericT, AlignmentV> MatrixType;
+
+public:
+ ilut_precond(MatrixType const & mat, ilut_tag const & tag)
+ : tag_(tag),
+ L_(mat.size1(), mat.size2(), viennacl::traits::context(mat)),
+ U_(mat.size1(), mat.size2(), viennacl::traits::context(mat))
+ {
+ //initialize preconditioner:
+ //std::cout << "Start GPU precond" << std::endl;
+ init(mat);
+ //std::cout << "End GPU precond" << std::endl;
+ }
+
+ void apply(viennacl::vector<NumericT> & vec) const
+ {
+ if (vec.handle().get_active_handle_id() != viennacl::MAIN_MEMORY)
+ {
+ if (tag_.use_level_scheduling())
+ {
+ //std::cout << "Using multifrontal on GPU..." << std::endl;
+ detail::level_scheduling_substitute(vec,
+ multifrontal_L_row_index_arrays_,
+ multifrontal_L_row_buffers_,
+ multifrontal_L_col_buffers_,
+ multifrontal_L_element_buffers_,
+ multifrontal_L_row_elimination_num_list_);
+
+ vec = viennacl::linalg::element_div(vec, multifrontal_U_diagonal_);
+
+ detail::level_scheduling_substitute(vec,
+ multifrontal_U_row_index_arrays_,
+ multifrontal_U_row_buffers_,
+ multifrontal_U_col_buffers_,
+ multifrontal_U_element_buffers_,
+ multifrontal_U_row_elimination_num_list_);
+
+ }
+ else
+ {
+ viennacl::context host_context(viennacl::MAIN_MEMORY);
+ viennacl::context old_context = viennacl::traits::context(vec);
+ viennacl::switch_memory_context(vec, host_context);
+ viennacl::linalg::inplace_solve(L_, vec, unit_lower_tag());
+ viennacl::linalg::inplace_solve(U_, vec, upper_tag());
+ viennacl::switch_memory_context(vec, old_context);
+ }
+ }
+ else //apply ILUT directly:
+ {
+ viennacl::linalg::inplace_solve(L_, vec, unit_lower_tag());
+ viennacl::linalg::inplace_solve(U_, vec, upper_tag());
+ }
+ }
+
+private:
+ void init(MatrixType const & mat)
+ {
+ viennacl::context host_context(viennacl::MAIN_MEMORY);
+ viennacl::switch_memory_context(L_, host_context);
+ viennacl::switch_memory_context(U_, host_context);
+
+ if (viennacl::traits::context(mat).memory_type() == viennacl::MAIN_MEMORY)
+ {
+ viennacl::linalg::precondition(mat, L_, U_, tag_);
+ }
+ else //we need to copy to CPU
+ {
+ viennacl::compressed_matrix<NumericT> cpu_mat(mat.size1(), mat.size2(), viennacl::traits::context(mat));
+ viennacl::switch_memory_context(cpu_mat, host_context);
+
+ cpu_mat = mat;
+
+ viennacl::linalg::precondition(cpu_mat, L_, U_, tag_);
+ }
+
+ if (!tag_.use_level_scheduling())
+ return;
+
+ //
+ // multifrontal part:
+ //
+
+ viennacl::switch_memory_context(multifrontal_U_diagonal_, host_context);
+ multifrontal_U_diagonal_.resize(U_.size1(), false);
+ host_based::detail::row_info(U_, multifrontal_U_diagonal_, viennacl::linalg::detail::SPARSE_ROW_DIAGONAL);
+
+ detail::level_scheduling_setup_L(L_,
+ multifrontal_U_diagonal_, //dummy
+ multifrontal_L_row_index_arrays_,
+ multifrontal_L_row_buffers_,
+ multifrontal_L_col_buffers_,
+ multifrontal_L_element_buffers_,
+ multifrontal_L_row_elimination_num_list_);
+
+
+ detail::level_scheduling_setup_U(U_,
+ multifrontal_U_diagonal_,
+ multifrontal_U_row_index_arrays_,
+ multifrontal_U_row_buffers_,
+ multifrontal_U_col_buffers_,
+ multifrontal_U_element_buffers_,
+ multifrontal_U_row_elimination_num_list_);
+
+ //
+ // Bring to device if necessary:
+ //
+
+ // L:
+
+ for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_L_row_index_arrays_.begin();
+ it != multifrontal_L_row_index_arrays_.end();
+ ++it)
+ viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+ for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_L_row_buffers_.begin();
+ it != multifrontal_L_row_buffers_.end();
+ ++it)
+ viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+ for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_L_col_buffers_.begin();
+ it != multifrontal_L_col_buffers_.end();
+ ++it)
+ viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+ for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_L_element_buffers_.begin();
+ it != multifrontal_L_element_buffers_.end();
+ ++it)
+ viennacl::backend::switch_memory_context<NumericT>(*it, viennacl::traits::context(mat));
+
+
+ // U:
+
+ viennacl::switch_memory_context(multifrontal_U_diagonal_, viennacl::traits::context(mat));
+
+ for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_U_row_index_arrays_.begin();
+ it != multifrontal_U_row_index_arrays_.end();
+ ++it)
+ viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+ for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_U_row_buffers_.begin();
+ it != multifrontal_U_row_buffers_.end();
+ ++it)
+ viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+ for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_U_col_buffers_.begin();
+ it != multifrontal_U_col_buffers_.end();
+ ++it)
+ viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+ for (typename std::list< viennacl::backend::mem_handle >::iterator it = multifrontal_U_element_buffers_.begin();
+ it != multifrontal_U_element_buffers_.end();
+ ++it)
+ viennacl::backend::switch_memory_context<NumericT>(*it, viennacl::traits::context(mat));
+
+
+ }
+
+ ilut_tag tag_;
+ viennacl::compressed_matrix<NumericT> L_;
+ viennacl::compressed_matrix<NumericT> U_;
+
+ std::list<viennacl::backend::mem_handle> multifrontal_L_row_index_arrays_;
+ std::list<viennacl::backend::mem_handle> multifrontal_L_row_buffers_;
+ std::list<viennacl::backend::mem_handle> multifrontal_L_col_buffers_;
+ std::list<viennacl::backend::mem_handle> multifrontal_L_element_buffers_;
+ std::list<vcl_size_t > multifrontal_L_row_elimination_num_list_;
+
+ viennacl::vector<NumericT> multifrontal_U_diagonal_;
+ std::list<viennacl::backend::mem_handle> multifrontal_U_row_index_arrays_;
+ std::list<viennacl::backend::mem_handle> multifrontal_U_row_buffers_;
+ std::list<viennacl::backend::mem_handle> multifrontal_U_col_buffers_;
+ std::list<viennacl::backend::mem_handle> multifrontal_U_element_buffers_;
+ std::list<vcl_size_t > multifrontal_U_row_elimination_num_list_;
+};
+
+} // namespace linalg
+} // namespace viennacl
+
+
+
+
+#endif
+
+
+
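A minimal usage sketch analogous to the ILU0 case (assuming the usual viennacl::linalg::solve()/bicgstab_tag() interface from the rest of the library):

    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/ilu.hpp"          // convenience header pulling in ilut_precond / ilut_tag
    #include "viennacl/linalg/bicgstab.hpp"

    viennacl::vector<double> solve_with_ilut(viennacl::compressed_matrix<double> const & A,
                                             viennacl::vector<double>           const & rhs)
    {
      // Keep at most 30 entries per row in each factor, drop entries below a relative tolerance of 1e-5:
      viennacl::linalg::ilut_tag tag(30, 1e-5);
      viennacl::linalg::ilut_precond< viennacl::compressed_matrix<double> > precond(A, tag);

      return viennacl::linalg::solve(A, rhs, viennacl::linalg::bicgstab_tag(), precond);
    }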
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/op_applier.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/op_applier.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/op_applier.hpp
new file mode 100644
index 0000000..0e2abb0
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/op_applier.hpp
@@ -0,0 +1,103 @@
+#ifndef VIENNACL_LINALG_DETAIL_OP_APPLIER_HPP
+#define VIENNACL_LINALG_DETAIL_OP_APPLIER_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/op_applier.hpp
+ *
+ * @brief Defines the action of certain unary and binary operators and their arguments (for host execution).
+*/
+
+#include "viennacl/forwards.h"
+#include <cmath>
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+
+/** @brief Worker class for applying a unary or binary element-wise operation on the host.
+ *
+ * @tparam OpT The operation tag, e.g. op_element_binary<op_prod> or op_element_unary<op_sqrt>
+*/
+template<typename OpT>
+struct op_applier
+{
+ typedef typename OpT::ERROR_UNKNOWN_OP_TAG_PROVIDED error_type;
+};
+
+/** \cond */
+template<>
+struct op_applier<op_element_binary<op_prod> >
+{
+ template<typename T>
+ static void apply(T & result, T const & x, T const & y) { result = x * y; }
+};
+
+template<>
+struct op_applier<op_element_binary<op_div> >
+{
+ template<typename T>
+ static void apply(T & result, T const & x, T const & y) { result = x / y; }
+};
+
+template<>
+struct op_applier<op_element_binary<op_pow> >
+{
+ template<typename T>
+ static void apply(T & result, T const & x, T const & y) { result = std::pow(x, y); }
+};
+
+#define VIENNACL_MAKE_UNARY_OP_APPLIER(funcname) \
+template<> \
+struct op_applier<op_element_unary<op_##funcname> > \
+{ \
+ template<typename T> \
+ static void apply(T & result, T const & x) { using namespace std; result = funcname(x); } \
+}
+
+VIENNACL_MAKE_UNARY_OP_APPLIER(abs);
+VIENNACL_MAKE_UNARY_OP_APPLIER(acos);
+VIENNACL_MAKE_UNARY_OP_APPLIER(asin);
+VIENNACL_MAKE_UNARY_OP_APPLIER(atan);
+VIENNACL_MAKE_UNARY_OP_APPLIER(ceil);
+VIENNACL_MAKE_UNARY_OP_APPLIER(cos);
+VIENNACL_MAKE_UNARY_OP_APPLIER(cosh);
+VIENNACL_MAKE_UNARY_OP_APPLIER(exp);
+VIENNACL_MAKE_UNARY_OP_APPLIER(fabs);
+VIENNACL_MAKE_UNARY_OP_APPLIER(floor);
+VIENNACL_MAKE_UNARY_OP_APPLIER(log);
+VIENNACL_MAKE_UNARY_OP_APPLIER(log10);
+VIENNACL_MAKE_UNARY_OP_APPLIER(sin);
+VIENNACL_MAKE_UNARY_OP_APPLIER(sinh);
+VIENNACL_MAKE_UNARY_OP_APPLIER(sqrt);
+VIENNACL_MAKE_UNARY_OP_APPLIER(tan);
+VIENNACL_MAKE_UNARY_OP_APPLIER(tanh);
+
+#undef VIENNACL_MAKE_UNARY_OP_APPLIER
+/** \endcond */
+
+}
+}
+}
+
+#endif // VIENNACL_LINALG_DETAIL_OP_APPLIER_HPP
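A short sketch of how the dispatcher is used on scalars (the host-based kernels apply it element-wise over vectors and matrices; the operation tags come from viennacl/forwards.h):

    #include "viennacl/linalg/detail/op_applier.hpp"

    double op_applier_sketch()
    {
      double r = 0.0;

      // The operation tag selects the scalar kernel at compile time, e.g. element-wise product:
      viennacl::linalg::detail::op_applier< viennacl::op_element_binary<viennacl::op_prod> >::apply(r, 3.0, 4.0);   // r == 12.0

      // Unary operations follow the same pattern:
      viennacl::linalg::detail::op_applier< viennacl::op_element_unary<viennacl::op_sqrt> >::apply(r, 16.0);        // r == 4.0

      return r;
    }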
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/op_executor.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/op_executor.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/op_executor.hpp
new file mode 100644
index 0000000..bd49b3b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/op_executor.hpp
@@ -0,0 +1,86 @@
+#ifndef VIENNACL_LINALG_DETAIL_OP_EXECUTOR_HPP
+#define VIENNACL_LINALG_DETAIL_OP_EXECUTOR_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/op_executor.hpp
+ *
+ * @brief Defines the worker class for decomposing an expression tree into small chunks, which can be processed by the predefined operations in ViennaCL.
+*/
+
+#include "viennacl/forwards.h"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+
+template<typename NumericT, typename B>
+bool op_aliasing(vector_base<NumericT> const & /*lhs*/, B const & /*b*/)
+{
+ return false;
+}
+
+template<typename NumericT>
+bool op_aliasing(vector_base<NumericT> const & lhs, vector_base<NumericT> const & b)
+{
+ return lhs.handle() == b.handle();
+}
+
+template<typename NumericT, typename LhsT, typename RhsT, typename OpT>
+bool op_aliasing(vector_base<NumericT> const & lhs, vector_expression<const LhsT, const RhsT, OpT> const & rhs)
+{
+ return op_aliasing(lhs, rhs.lhs()) || op_aliasing(lhs, rhs.rhs());
+}
+
+
+template<typename NumericT, typename B>
+bool op_aliasing(matrix_base<NumericT> const & /*lhs*/, B const & /*b*/)
+{
+ return false;
+}
+
+template<typename NumericT>
+bool op_aliasing(matrix_base<NumericT> const & lhs, matrix_base<NumericT> const & b)
+{
+ return lhs.handle() == b.handle();
+}
+
+template<typename NumericT, typename LhsT, typename RhsT, typename OpT>
+bool op_aliasing(matrix_base<NumericT> const & lhs, matrix_expression<const LhsT, const RhsT, OpT> const & rhs)
+{
+ return op_aliasing(lhs, rhs.lhs()) || op_aliasing(lhs, rhs.rhs());
+}
+
+
+/** @brief Worker class for decomposing expression templates.
+ *
+ * @tparam A The type being assigned to
+ * @tparam OP One out of {op_assign, op_inplace_add, op_inplace_sub}
+ * @tparam T Right-hand side of the assignment
+*/
+template<typename A, typename OP, typename T>
+struct op_executor {};
+
+}
+}
+}
+
+#endif // VIENNACL_LINALG_DETAIL_OP_EXECUTOR_HPP
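For illustration, a small sketch of the aliasing check on vectors (binding through vector_base so that the handle-comparing overload above is selected):

    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/detail/op_executor.hpp"

    bool aliasing_sketch()
    {
      viennacl::vector<double> x(10), y(10);
      viennacl::vector_base<double> const & xb = x;
      viennacl::vector_base<double> const & yb = y;

      bool same      = viennacl::linalg::detail::op_aliasing(xb, xb);  // true: identical memory handle
      bool different = viennacl::linalg::detail::op_aliasing(xb, yb);  // false: distinct buffers

      return same && !different;
    }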
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/block_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/block_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/block_matrix.hpp
new file mode 100644
index 0000000..12ff77b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/block_matrix.hpp
@@ -0,0 +1,86 @@
+#ifndef VIENNACL_LINALG_DETAIL_SPAI_BLOCK_MATRIX_HPP
+#define VIENNACL_LINALG_DETAIL_SPAI_BLOCK_MATRIX_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include <utility>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <algorithm>
+#include <vector>
+#include "viennacl/ocl/backend.hpp"
+#include "viennacl/tools/tools.hpp"
+
+/** @file viennacl/linalg/detail/spai/block_matrix.hpp
+ @brief Implementation of a bunch of (small) matrices on GPU. Experimental.
+
+ SPAI code contributed by Nikolay Lukash
+*/
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace spai
+{
+
+/**
+* @brief Represents contiguous matrices on the GPU
+*/
+
+class block_matrix
+{
+public:
+
+ ////////// non-const
+
+ /** @brief Returns a handle to the elements */
+ viennacl::ocl::handle<cl_mem>& handle(){ return elements_; }
+
+ /** @brief Returns a handle to the matrix dimensions */
+ viennacl::ocl::handle<cl_mem>& handle1() { return matrix_dimensions_; }
+
+ /** @brief Returns a handle to the start indices of the matrix blocks */
+ viennacl::ocl::handle<cl_mem>& handle2() { return start_block_inds_; }
+
+ ////////// const
+
+ /** @brief Returns a handle to the const elements */
+ const viennacl::ocl::handle<cl_mem>& handle() const { return elements_; }
+
+ /** @brief Returns a handle to the const matrix dimensions */
+ const viennacl::ocl::handle<cl_mem>& handle1() const { return matrix_dimensions_; }
+
+ /** @brief Returns a handle to the const start indices of the matrix blocks */
+ const viennacl::ocl::handle<cl_mem>& handle2() const { return start_block_inds_; }
+
+private:
+ viennacl::ocl::handle<cl_mem> elements_;
+ viennacl::ocl::handle<cl_mem> matrix_dimensions_;
+ viennacl::ocl::handle<cl_mem> start_block_inds_;
+};
+
+
+}
+}
+}
+}
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/block_vector.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/block_vector.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/block_vector.hpp
new file mode 100644
index 0000000..eee6aef
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/block_vector.hpp
@@ -0,0 +1,77 @@
+#ifndef VIENNACL_LINALG_DETAIL_SPAI_BLOCK_VECTOR_HPP
+#define VIENNACL_LINALG_DETAIL_SPAI_BLOCK_VECTOR_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include <utility>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <algorithm>
+#include <vector>
+#include "viennacl/ocl/backend.hpp"
+#include "viennacl/tools/tools.hpp"
+
+/** @file viennacl/linalg/detail/spai/block_vector.hpp
+ @brief Implementation of a bunch of vectors on GPU. Experimental.
+
+ SPAI code contributed by Nikolay Lukash
+*/
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace spai
+{
+
+/**
+* @brief Represents a contiguous vector on the GPU holding a concatenation of small vectors
+*/
+class block_vector
+{
+public:
+
+ ///////////// non-const
+
+ /** @brief Return handle to the elements */
+ viennacl::ocl::handle<cl_mem> & handle(){ return elements_; }
+
+ /** @brief Return handle to start indices */
+ viennacl::ocl::handle<cl_mem> & handle1() { return start_block_inds_; }
+
+ ///////////// const
+
+ /** @brief Return handle to the const elements */
+ const viennacl::ocl::handle<cl_mem> & handle() const { return elements_; }
+
+ /** @brief Return handle to const start indices */
+ const viennacl::ocl::handle<cl_mem> & handle1() const { return start_block_inds_; }
+
+private:
+ viennacl::ocl::handle<cl_mem> elements_;
+ viennacl::ocl::handle<cl_mem> start_block_inds_;
+};
+
+}
+}
+}
+}
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/fspai.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/fspai.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/fspai.hpp
new file mode 100644
index 0000000..fab81d7
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/fspai.hpp
@@ -0,0 +1,402 @@
+#ifndef VIENNACL_LINALG_DETAIL_SPAI_FSPAI_HPP
+#define VIENNACL_LINALG_DETAIL_SPAI_FSPAI_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include <utility>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <algorithm>
+#include <vector>
+#include <math.h>
+#include <map>
+
+//boost includes
+#include "boost/numeric/ublas/vector.hpp"
+#include "boost/numeric/ublas/matrix.hpp"
+#include "boost/numeric/ublas/matrix_proxy.hpp"
+#include "boost/numeric/ublas/vector_proxy.hpp"
+#include "boost/numeric/ublas/storage.hpp"
+#include "boost/numeric/ublas/io.hpp"
+#include "boost/numeric/ublas/lu.hpp"
+#include "boost/numeric/ublas/triangular.hpp"
+#include "boost/numeric/ublas/matrix_expression.hpp"
+
+// ViennaCL includes
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+#include "viennacl/linalg/matrix_operations.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/linalg/cg.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/ilu.hpp"
+//#include <omp.h>
+
+/** @file viennacl/linalg/detail/spai/fspai.hpp
+ @brief Implementation of FSPAI. Experimental.
+*/
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace spai
+{
+
+/** @brief A tag for FSPAI. Experimental.
+*
+* Holds the parameter values for the algorithm.
+* Must be passed to the spai_precond constructor.
+*/
+class fspai_tag
+{
+public:
+ /** @brief Constructor
+ *
+ * @param residual_norm_threshold Calculate until the norm of the residual falls below this threshold
+ * @param iteration_limit Maximum number of iterations
+ * @param is_static Determines whether the static version of SPAI should be used
+ * @param is_right Determines whether a right (rather than a left) preconditioner should be used
+ */
+ fspai_tag(
+ double residual_norm_threshold = 1e-3,
+ unsigned int iteration_limit = 5,
+ bool is_static = false,
+ bool is_right = false)
+ : residual_norm_threshold_(residual_norm_threshold),
+ iteration_limit_(iteration_limit),
+ is_static_(is_static),
+ is_right_(is_right) {}
+
+ inline double getResidualNormThreshold() const { return residual_norm_threshold_; }
+ inline unsigned long getIterationLimit () const { return iteration_limit_; }
+ inline bool getIsStatic() const { return is_static_; }
+ inline bool getIsRight() const { return is_right_; }
+ inline void setResidualNormThreshold(double residual_norm_threshold)
+ {
+ if (residual_norm_threshold > 0)
+ residual_norm_threshold_ = residual_norm_threshold;
+ }
+ inline void setIterationLimit(unsigned long iteration_limit)
+ {
+ if (iteration_limit > 0)
+ iteration_limit_ = iteration_limit;
+ }
+ inline void setIsRight(bool is_right) { is_right_ = is_right; }
+ inline void setIsStatic(bool is_static) { is_static_ = is_static; }
+
+private:
+ double residual_norm_threshold_;
+ unsigned long iteration_limit_;
+ bool is_static_;
+ bool is_right_;
+};
+
+
+//
+// Helper: Store A in an STL container of type std::vector<std::map<unsigned int, NumericT> >, exploiting symmetry
+// Reason: the ublas interface does not allow iterating over the nonzeros of a particular row without starting an iterator1 from the very beginning of the matrix...
+//
+template<typename MatrixT, typename NumericT>
+void sym_sparse_matrix_to_stl(MatrixT const & A, std::vector<std::map<unsigned int, NumericT> > & STL_A)
+{
+ STL_A.resize(A.size1());
+ for (typename MatrixT::const_iterator1 row_it = A.begin1();
+ row_it != A.end1();
+ ++row_it)
+ {
+ for (typename MatrixT::const_iterator2 col_it = row_it.begin();
+ col_it != row_it.end();
+ ++col_it)
+ {
+ if (col_it.index1() >= col_it.index2())
+ STL_A[col_it.index1()][static_cast<unsigned int>(col_it.index2())] = *col_it;
+ else
+ break; //go to next row
+ }
+ }
+}
+
+
+//
+// Generate index sets J_k, k=0,...,N-1
+//
+template<typename MatrixT>
+void generateJ(MatrixT const & A, std::vector<std::vector<vcl_size_t> > & J)
+{
+ for (typename MatrixT::const_iterator1 row_it = A.begin1();
+ row_it != A.end1();
+ ++row_it)
+ {
+ for (typename MatrixT::const_iterator2 col_it = row_it.begin();
+ col_it != row_it.end();
+ ++col_it)
+ {
+ if (col_it.index1() > col_it.index2()) //Matrix is symmetric, thus only work on lower triangular part
+ {
+ J[col_it.index2()].push_back(col_it.index1());
+ J[col_it.index1()].push_back(col_it.index2());
+ }
+ else
+ break; //go to next row
+ }
+ }
+}
+
+
+//
+// Extracts the blocks A(\tilde{J}_k, \tilde{J}_k) from A
+// Sets up y_k = A(\tilde{J}_k, k) for the in-place solution after the Cholesky factorization
+//
+template<typename NumericT, typename MatrixT, typename VectorT>
+void fill_blocks(std::vector< std::map<unsigned int, NumericT> > & A,
+ std::vector<MatrixT> & blocks,
+ std::vector<std::vector<vcl_size_t> > const & J,
+ std::vector<VectorT> & Y)
+{
+ for (vcl_size_t k=0; k<A.size(); ++k)
+ {
+ std::vector<vcl_size_t> const & Jk = J[k];
+ VectorT & yk = Y[k];
+ MatrixT & block_k = blocks[k];
+
+ yk.resize(Jk.size());
+ block_k.resize(Jk.size(), Jk.size());
+ block_k.clear();
+
+ for (vcl_size_t i=0; i<Jk.size(); ++i)
+ {
+ vcl_size_t row_index = Jk[i];
+ std::map<unsigned int, NumericT> & A_row = A[row_index];
+
+ //fill y_k:
+ yk[i] = A_row[static_cast<unsigned int>(k)];
+
+ for (vcl_size_t j=0; j<Jk.size(); ++j)
+ {
+ vcl_size_t col_index = Jk[j];
+ if (col_index <= row_index && A_row.find(static_cast<unsigned int>(col_index)) != A_row.end()) //block is symmetric, thus store only lower triangular part
+ block_k(i, j) = A_row[static_cast<unsigned int>(col_index)];
+ }
+ }
+ }
+}
+
+
+//
+// Perform Cholesky factorization of A in place. Cf. Schwarz: Numerische Mathematik, vol. 5, p. 58
+//
+template<typename MatrixT>
+void cholesky_decompose(MatrixT & A)
+{
+ for (vcl_size_t k=0; k<A.size2(); ++k)
+ {
+ if (A(k,k) <= 0)
+ {
+ std::cout << "k: " << k << std::endl;
+ std::cout << "A(k,k): " << A(k,k) << std::endl;
+ }
+
+ assert(A(k,k) > 0 && bool("Matrix not positive definite in Cholesky factorization."));
+
+ A(k,k) = std::sqrt(A(k,k));
+
+ for (vcl_size_t i=k+1; i<A.size1(); ++i)
+ {
+ A(i,k) /= A(k,k);
+ for (vcl_size_t j=k+1; j<=i; ++j)
+ A(i,j) -= A(i,k) * A(j,k);
+ }
+ }
+}
+
+
+//
+// Compute x in Ax = b, where A is already Cholesky factored (A = L L^T)
+//
+template<typename MatrixT, typename VectorT>
+void cholesky_solve(MatrixT const & L, VectorT & b)
+{
+ // inplace forward solve L x = b
+ for (vcl_size_t i=0; i<L.size1(); ++i)
+ {
+ for (vcl_size_t j=0; j<i; ++j)
+ b[i] -= L(i,j) * b[j];
+ b[i] /= L(i,i);
+ }
+
+ // inplace backward solve L^T x = b:
+ for (vcl_size_t i=L.size1()-1;; --i)
+ {
+ for (vcl_size_t k=i+1; k<L.size1(); ++k)
+ b[i] -= L(k,i) * b[k];
+ b[i] /= L(i,i);
+
+ if (i==0) //vcl_size_t might be unsigned, therefore manual check for equality with zero here
+ break;
+ }
+}
+
+
+
+//
+// Compute the Cholesky factor L from the sparse vectors y_k
+//
+template<typename MatrixT, typename VectorT>
+void computeL(MatrixT const & A,
+ MatrixT & L,
+ MatrixT & L_trans,
+ std::vector<VectorT> & Y,
+ std::vector<std::vector<vcl_size_t> > & J)
+{
+ typedef typename VectorT::value_type NumericType;
+ typedef std::vector<std::map<unsigned int, NumericType> > STLSparseMatrixType;
+
+ STLSparseMatrixType L_temp(A.size1());
+
+ for (vcl_size_t k=0; k<A.size1(); ++k)
+ {
+ std::vector<vcl_size_t> const & Jk = J[k];
+ VectorT const & yk = Y[k];
+
+ //compute L(k,k):
+ NumericType Lkk = A(k,k);
+ for (vcl_size_t i=0; i<Jk.size(); ++i)
+ Lkk -= A(Jk[i],k) * yk[i];
+
+ Lkk = NumericType(1) / std::sqrt(Lkk);
+ L_temp[k][static_cast<unsigned int>(k)] = Lkk;
+ L_trans(k,k) = Lkk;
+
+ //write lower diagonal entries:
+ for (vcl_size_t i=0; i<Jk.size(); ++i)
+ {
+ L_temp[Jk[i]][static_cast<unsigned int>(k)] = -Lkk * yk[i];
+ L_trans(k, Jk[i]) = -Lkk * yk[i];
+ }
+ } //for k
+
+
+ //build L from L_temp
+ for (vcl_size_t i=0; i<L_temp.size(); ++i)
+ for (typename std::map<unsigned int, NumericType>::const_iterator it = L_temp[i].begin();
+ it != L_temp[i].end();
+ ++it)
+ L(i, it->first) = it->second;
+}
+
+
+//
+// Top level FSPAI function
+//
+template<typename MatrixT>
+void computeFSPAI(MatrixT const & A,
+ MatrixT const & PatternA,
+ MatrixT & L,
+ MatrixT & L_trans,
+ fspai_tag)
+{
+ typedef typename MatrixT::value_type NumericT;
+ typedef boost::numeric::ublas::matrix<NumericT> DenseMatrixType;
+ typedef std::vector<std::map<unsigned int, NumericT> > SparseMatrixType;
+
+ //
+ // preprocessing: Store A in an STL container:
+ //
+ //std::cout << "Transferring to STL container:" << std::endl;
+ std::vector<std::vector<NumericT> > y_k(A.size1());
+ SparseMatrixType STL_A(A.size1());
+ sym_sparse_matrix_to_stl(A, STL_A);
+
+
+ //
+ // Step 1: Generate pattern indices
+ //
+ //std::cout << "computeFSPAI(): Generating pattern..." << std::endl;
+ std::vector<std::vector<vcl_size_t> > J(A.size1());
+ generateJ(PatternA, J);
+
+ //
+ // Step 2: Set up matrix blocks
+ //
+ //std::cout << "computeFSPAI(): Setting up matrix blocks..." << std::endl;
+ std::vector<DenseMatrixType> subblocks_A(A.size1());
+ fill_blocks(STL_A, subblocks_A, J, y_k);
+ STL_A.clear(); //not needed anymore
+
+ //
+ // Step 3: Cholesky-factor blocks
+ //
+ //std::cout << "computeFSPAI(): Cholesky-factorization..." << std::endl;
+ for (vcl_size_t i=0; i<subblocks_A.size(); ++i)
+ {
+ //std::cout << "Block before: " << subblocks_A[i] << std::endl;
+ cholesky_decompose(subblocks_A[i]);
+ //std::cout << "Block after: " << subblocks_A[i] << std::endl;
+ }
+
+
+ /*vcl_size_t num_bytes = 0;
+ for (vcl_size_t i=0; i<subblocks_A.size(); ++i)
+ num_bytes += 8*subblocks_A[i].size1()*subblocks_A[i].size2();*/
+ //std::cout << "Memory for FSPAI matrix: " << num_bytes / (1024.0 * 1024.0) << " MB" << std::endl;
+
+ //
+ // Step 4: Solve for y_k
+ //
+ //std::cout << "computeFSPAI(): Cholesky-solve..." << std::endl;
+ for (vcl_size_t i=0; i<y_k.size(); ++i)
+ {
+ if (subblocks_A[i].size1() > 0) //block might be empty...
+ {
+ //y_k[i].resize(subblocks_A[i].size1());
+ //std::cout << "y_k[" << i << "]: ";
+ //for (vcl_size_t j=0; j<y_k[i].size(); ++j)
+ // std::cout << y_k[i][j] << " ";
+ //std::cout << std::endl;
+ cholesky_solve(subblocks_A[i], y_k[i]);
+ }
+ }
+
+
+ //
+ // Step 5: Set up Cholesky factors L and L_trans
+ //
+ //std::cout << "computeFSPAI(): Computing L..." << std::endl;
+ L.resize(A.size1(), A.size2(), false);
+ L.reserve(A.nnz(), false);
+ L_trans.resize(A.size1(), A.size2(), false);
+ L_trans.reserve(A.nnz(), false);
+ computeL(A, L, L_trans, y_k, J);
+
+ //std::cout << "L: " << L << std::endl;
+}
+
+
+
+}
+}
+}
+}
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/matrix_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/matrix_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/matrix_operations.hpp
new file mode 100644
index 0000000..f23223f
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/matrix_operations.hpp
@@ -0,0 +1,2052 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_HOST_BASED_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/matrix_operations.hpp
+ @brief Implementations of dense matrix related operations, including matrix-vector products, using a plain single-threaded or OpenMP-enabled execution on CPU.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+#include "viennacl/linalg/detail/op_applier.hpp"
+#include "viennacl/linalg/host_based/common.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+// Minimum matrix size (size1*size2) for using OpenMP on matrix operations:
+#ifndef VIENNACL_OPENMP_MATRIX_MIN_SIZE
+ #define VIENNACL_OPENMP_MATRIX_MIN_SIZE 5000
+#endif
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+
+//
+// Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check them here!
+//
+
+template<typename DestNumericT, typename SrcNumericT>
+void convert(matrix_base<DestNumericT> & mat1, matrix_base<SrcNumericT> const & mat2)
+{
+ assert(mat1.row_major() == mat2.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
+
+ DestNumericT * data_A = detail::extract_raw_pointer<DestNumericT>(mat1);
+ SrcNumericT const * data_B = detail::extract_raw_pointer<SrcNumericT>(mat2);
+
+ vcl_size_t A_start1 = viennacl::traits::start1(mat1);
+ vcl_size_t A_start2 = viennacl::traits::start2(mat1);
+ vcl_size_t A_inc1 = viennacl::traits::stride1(mat1);
+ vcl_size_t A_inc2 = viennacl::traits::stride2(mat1);
+ vcl_size_t A_size1 = viennacl::traits::size1(mat1);
+ vcl_size_t A_size2 = viennacl::traits::size2(mat1);
+ vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat1);
+ vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat1);
+
+ vcl_size_t B_start1 = viennacl::traits::start1(mat2);
+ vcl_size_t B_start2 = viennacl::traits::start2(mat2);
+ vcl_size_t B_inc1 = viennacl::traits::stride1(mat2);
+ vcl_size_t B_inc2 = viennacl::traits::stride2(mat2);
+ vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(mat2);
+ vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(mat2);
+
+ if (mat1.row_major())
+ {
+ detail::matrix_array_wrapper<DestNumericT, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<SrcNumericT const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A_size1); ++row)
+ for (vcl_size_t col = 0; col < A_size2; ++col)
+ wrapper_A(row, col) = static_cast<DestNumericT>(wrapper_B(row, col));
+ }
+ else
+ {
+ detail::matrix_array_wrapper<DestNumericT, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<SrcNumericT const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long col = 0; col < static_cast<long>(A_size2); ++col)
+ for (vcl_size_t row = 0; row < A_size1; ++row)
+ wrapper_A(row, col) = static_cast<DestNumericT>(wrapper_B(row, col));
+ }
+}
+
+
+
+template<typename NumericT,
+ typename SizeT, typename DistanceT>
+void trans(const matrix_expression<const matrix_base<NumericT, SizeT, DistanceT>,
+ const matrix_base<NumericT, SizeT, DistanceT>, op_trans> & proxy, matrix_base<NumericT> & temp_trans)
+{
+ typedef NumericT value_type;
+ const value_type * data_A = detail::extract_raw_pointer<value_type>(proxy.lhs());
+ value_type * data_B = detail::extract_raw_pointer<value_type>(temp_trans);
+
+ vcl_size_t A_start1 = viennacl::traits::start1(proxy.lhs());
+ vcl_size_t A_start2 = viennacl::traits::start2(proxy.lhs());
+ vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(proxy.lhs());
+ vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(proxy.lhs());
+ vcl_size_t A_inc1 = viennacl::traits::stride1(proxy.lhs());
+ vcl_size_t A_inc2 = viennacl::traits::stride2(proxy.lhs());
+ vcl_size_t A_size1 = viennacl::traits::size1(proxy.lhs());
+ vcl_size_t A_size2 = viennacl::traits::size2(proxy.lhs());
+
+ vcl_size_t B_start1 = viennacl::traits::start1(temp_trans);
+ vcl_size_t B_start2 = viennacl::traits::start2(temp_trans);
+ vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(temp_trans);
+ vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(temp_trans);
+ vcl_size_t B_inc1 = viennacl::traits::stride1(temp_trans);
+ vcl_size_t B_inc2 = viennacl::traits::stride2(temp_trans);
+
+ const vcl_size_t sub_mat_size = 64; //The matrix will be divided into sub-matrices for better storage access.
+
+ vcl_size_t row_count = A_size1 / sub_mat_size;
+ vcl_size_t col_count = A_size2 / sub_mat_size;
+
+ vcl_size_t row_count_remainder = A_size1 % sub_mat_size;
+ vcl_size_t col_count_remainder = A_size2 % sub_mat_size;
+
+ if (proxy.lhs().row_major())
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for(long i = 0; i < static_cast<long>(row_count*col_count); ++i)//This is the main part of the transposition
+ {
+ vcl_size_t row = vcl_size_t(i) / col_count;
+ vcl_size_t col = vcl_size_t(i) % col_count;
+
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1 + A_inc1 * (row * sub_mat_size)
+ , A_start2 + A_inc2 * (col * sub_mat_size), A_inc1
+ , A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type , row_major, false> wrapper_B(data_B, B_start1 + B_inc1 * (col * sub_mat_size)
+ , B_start2 + B_inc2 * (row * sub_mat_size), B_inc1
+ , B_inc2, B_internal_size1, B_internal_size2);
+ for(vcl_size_t j = 0; j < (sub_mat_size); ++j)
+ for(vcl_size_t k = 0; k < (sub_mat_size); ++k)
+ wrapper_B(j, k) = wrapper_A(k, j);
+ }
+ { //This is the transposition of the remainder on the right side of the matrix
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1
+ , A_start2 + A_inc2 * (col_count * sub_mat_size), A_inc1
+ , A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type , row_major, false> wrapper_B(data_B, B_start1 + B_inc1 * (col_count * sub_mat_size)
+ , B_start2, B_inc1
+ , B_inc2, B_internal_size1, B_internal_size2);
+ for(vcl_size_t j = 0; j < col_count_remainder; ++j)
+ for(vcl_size_t k = 0 ; k < A_size1; ++k)
+ wrapper_B(j, k) = wrapper_A(k, j);
+ }
+ { //This is the transposition of the remainder on the bottom side of the matrix
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1 + A_inc1 * (row_count * sub_mat_size)
+ , A_start2, A_inc1
+ , A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type , row_major, false> wrapper_B(data_B,B_start1
+ , B_start2 + B_inc2 * (row_count * sub_mat_size), B_inc1
+ , B_inc2, B_internal_size1, B_internal_size2);
+ for(vcl_size_t j = 0; j < row_count_remainder; ++j)
+ for(vcl_size_t k = 0; k < (A_size2 - col_count_remainder); ++k)
+ wrapper_B(k, j) = wrapper_A(j, k);
+ }
+ }
+ else
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for(long i = 0; i < static_cast<long>(row_count*col_count); ++i)//This is the main part of the transposition
+ {
+ vcl_size_t row = vcl_size_t(i) / col_count;
+ vcl_size_t col = vcl_size_t(i) % col_count;
+
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1 + A_inc1 * (row * sub_mat_size)
+ , A_start2 + A_inc2 * (col * sub_mat_size), A_inc1
+ , A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type , column_major, false> wrapper_B(data_B, B_start1 + B_inc1 * (col * sub_mat_size)
+ , B_start2 + B_inc2 * (row * sub_mat_size), B_inc1
+ , B_inc2, B_internal_size1, B_internal_size2);
+ for(vcl_size_t j = 0; j < (sub_mat_size); ++j)
+ for(vcl_size_t k = 0; k < (sub_mat_size); ++k)
+ wrapper_B(k, j)=wrapper_A(j, k);
+ }
+ { //This is the transposition of the remainder on the right side of the matrix
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1
+ , A_start2 + A_inc2 * (col_count * sub_mat_size), A_inc1
+ , A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type , column_major, false> wrapper_B(data_B,B_start1 + B_inc1 * (col_count * sub_mat_size)
+ , B_start2, B_inc1
+ , B_inc2, B_internal_size1, B_internal_size2);
+ for(vcl_size_t j = 0; j < col_count_remainder; ++j)
+ for(vcl_size_t k = 0; k < A_size1; ++k)
+ wrapper_B(j, k)=wrapper_A(k, j);
+ }
+ { //This is the transposition of the remainder on the bottom side of the matrix
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1 + A_inc1 * (row_count * sub_mat_size)
+ , A_start2, A_inc1
+ , A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type , column_major, false> wrapper_B(data_B, B_start1
+ , B_start2 + B_inc2 * (row_count * sub_mat_size), B_inc1
+ , B_inc2, B_internal_size1, B_internal_size2);
+ for(vcl_size_t j = 0; j < row_count_remainder; ++j)
+ for(vcl_size_t k = 0; k < (A_size2 - col_count_remainder); ++k)
+ wrapper_B(k, j)=wrapper_A(j, k);
+ }
+ }
+}
+
+template<typename NumericT, typename ScalarT1>
+void am(matrix_base<NumericT> & mat1,
+ matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha)
+{
+ assert(mat1.row_major() == mat2.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type;
+
+ value_type * data_A = detail::extract_raw_pointer<value_type>(mat1);
+ value_type const * data_B = detail::extract_raw_pointer<value_type>(mat2);
+
+ value_type data_alpha = alpha;
+ if (flip_sign_alpha)
+ data_alpha = -data_alpha;
+
+ vcl_size_t A_start1 = viennacl::traits::start1(mat1);
+ vcl_size_t A_start2 = viennacl::traits::start2(mat1);
+ vcl_size_t A_inc1 = viennacl::traits::stride1(mat1);
+ vcl_size_t A_inc2 = viennacl::traits::stride2(mat1);
+ vcl_size_t A_size1 = viennacl::traits::size1(mat1);
+ vcl_size_t A_size2 = viennacl::traits::size2(mat1);
+ vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat1);
+ vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat1);
+
+ vcl_size_t B_start1 = viennacl::traits::start1(mat2);
+ vcl_size_t B_start2 = viennacl::traits::start2(mat2);
+ vcl_size_t B_inc1 = viennacl::traits::stride1(mat2);
+ vcl_size_t B_inc2 = viennacl::traits::stride2(mat2);
+ vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(mat2);
+ vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(mat2);
+
+ if (mat1.row_major())
+ {
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+ if (reciprocal_alpha)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A_size1); ++row)
+ for (vcl_size_t col = 0; col < A_size2; ++col)
+ wrapper_A(row, col) = wrapper_B(row, col) / data_alpha;
+ }
+ else
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A_size1); ++row)
+ for (vcl_size_t col = 0; col < A_size2; ++col)
+ wrapper_A(row, col) = wrapper_B(row, col) * data_alpha;
+ }
+ }
+ else
+ {
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+ if (reciprocal_alpha)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long col = 0; col < static_cast<long>(A_size2); ++col)
+ for (vcl_size_t row = 0; row < A_size1; ++row)
+ wrapper_A(row, col) = wrapper_B(row, col) / data_alpha;
+ }
+ else
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long col = 0; col < static_cast<long>(A_size2); ++col)
+ for (vcl_size_t row = 0; row < A_size1; ++row)
+ wrapper_A(row, col) = wrapper_B(row, col) * data_alpha;
+ }
+ }
+}
+
+
+template<typename NumericT,
+ typename ScalarT1, typename ScalarT2>
+void ambm(matrix_base<NumericT> & mat1,
+ matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha,
+ matrix_base<NumericT> const & mat3, ScalarT2 const & beta, vcl_size_t /*len_beta*/, bool reciprocal_beta, bool flip_sign_beta)
+{
+ assert(mat1.row_major() == mat2.row_major() && mat1.row_major() == mat3.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type;
+
+ value_type * data_A = detail::extract_raw_pointer<value_type>(mat1);
+ value_type const * data_B = detail::extract_raw_pointer<value_type>(mat2);
+ value_type const * data_C = detail::extract_raw_pointer<value_type>(mat3);
+
+ value_type data_alpha = alpha;
+ if (flip_sign_alpha)
+ data_alpha = -data_alpha;
+
+ value_type data_beta = beta;
+ if (flip_sign_beta)
+ data_beta = -data_beta;
+
+ vcl_size_t A_start1 = viennacl::traits::start1(mat1);
+ vcl_size_t A_start2 = viennacl::traits::start2(mat1);
+ vcl_size_t A_inc1 = viennacl::traits::stride1(mat1);
+ vcl_size_t A_inc2 = viennacl::traits::stride2(mat1);
+ vcl_size_t A_size1 = viennacl::traits::size1(mat1);
+ vcl_size_t A_size2 = viennacl::traits::size2(mat1);
+ vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat1);
+ vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat1);
+
+ vcl_size_t B_start1 = viennacl::traits::start1(mat2);
+ vcl_size_t B_start2 = viennacl::traits::start2(mat2);
+ vcl_size_t B_inc1 = viennacl::traits::stride1(mat2);
+ vcl_size_t B_inc2 = viennacl::traits::stride2(mat2);
+ vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(mat2);
+ vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(mat2);
+
+ vcl_size_t C_start1 = viennacl::traits::start1(mat3);
+ vcl_size_t C_start2 = viennacl::traits::start2(mat3);
+ vcl_size_t C_inc1 = viennacl::traits::stride1(mat3);
+ vcl_size_t C_inc2 = viennacl::traits::stride2(mat3);
+ vcl_size_t C_internal_size1 = viennacl::traits::internal_size1(mat3);
+ vcl_size_t C_internal_size2 = viennacl::traits::internal_size2(mat3);
+
+ if (mat1.row_major())
+ {
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ if (reciprocal_alpha && reciprocal_beta)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A_size1); ++row)
+ for (vcl_size_t col = 0; col < A_size2; ++col)
+ wrapper_A(row, col) = wrapper_B(row, col) / data_alpha + wrapper_C(row, col) / data_beta;
+ }
+ else if (reciprocal_alpha && !reciprocal_beta)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A_size1); ++row)
+ for (vcl_size_t col = 0; col < A_size2; ++col)
+ wrapper_A(row, col) = wrapper_B(row, col) / data_alpha + wrapper_C(row, col) * data_beta;
+ }
+ else if (!reciprocal_alpha && reciprocal_beta)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A_size1); ++row)
+ for (vcl_size_t col = 0; col < A_size2; ++col)
+ wrapper_A(row, col) = wrapper_B(row, col) * data_alpha + wrapper_C(row, col) / data_beta;
+ }
+ else if (!reciprocal_alpha && !reciprocal_beta)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A_size1); ++row)
+ for (vcl_size_t col = 0; col < A_size2; ++col)
+ wrapper_A(row, col) = wrapper_B(row, col) * data_alpha + wrapper_C(row, col) * data_beta;
+ }
+ }
+ else
+ {
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ if (reciprocal_alpha && reciprocal_beta)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long col = 0; col < static_cast<long>(A_size2); ++col)
+ for (vcl_size_t row = 0; row < A_size1; ++row)
+ wrapper_A(row, col) = wrapper_B(row, col) / data_alpha + wrapper_C(row, col) / data_beta;
+ }
+ else if (reciprocal_alpha && !reciprocal_beta)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long col = 0; col < static_cast<long>(A_size2); ++col)
+ for (vcl_size_t row = 0; row < A_size1; ++row)
+ wrapper_A(row, col) = wrapper_B(row, col) / data_alpha + wrapper_C(row, col) * data_beta;
+ }
+ else if (!reciprocal_alpha && reciprocal_beta)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long col = 0; col < static_cast<long>(A_size2); ++col)
+ for (vcl_size_t row = 0; row < A_size1; ++row)
+ wrapper_A(row, col) = wrapper_B(row, col) * data_alpha + wrapper_C(row, col) / data_beta;
+ }
+ else if (!reciprocal_alpha && !reciprocal_beta)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long col = 0; col < static_cast<long>(A_size2); ++col)
+ for (vcl_size_t row = 0; row < A_size1; ++row)
+ wrapper_A(row, col) = wrapper_B(row, col) * data_alpha + wrapper_C(row, col) * data_beta;
+ }
+ }
+
+}
+
+
+template<typename NumericT,
+ typename ScalarT1, typename ScalarT2>
+void ambm_m(matrix_base<NumericT> & mat1,
+ matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha,
+ matrix_base<NumericT> const & mat3, ScalarT2 const & beta, vcl_size_t /*len_beta*/, bool reciprocal_beta, bool flip_sign_beta)
+{
+ assert(mat1.row_major() == mat2.row_major() && mat1.row_major() == mat3.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type;
+
+ value_type * data_A = detail::extract_raw_pointer<value_type>(mat1);
+ value_type const * data_B = detail::extract_raw_pointer<value_type>(mat2);
+ value_type const * data_C = detail::extract_raw_pointer<value_type>(mat3);
+
+ value_type data_alpha = alpha;
+ if (flip_sign_alpha)
+ data_alpha = -data_alpha;
+
+ value_type data_beta = beta;
+ if (flip_sign_beta)
+ data_beta = -data_beta;
+
+ vcl_size_t A_start1 = viennacl::traits::start1(mat1);
+ vcl_size_t A_start2 = viennacl::traits::start2(mat1);
+ vcl_size_t A_inc1 = viennacl::traits::stride1(mat1);
+ vcl_size_t A_inc2 = viennacl::traits::stride2(mat1);
+ vcl_size_t A_size1 = viennacl::traits::size1(mat1);
+ vcl_size_t A_size2 = viennacl::traits::size2(mat1);
+ vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat1);
+ vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat1);
+
+ vcl_size_t B_start1 = viennacl::traits::start1(mat2);
+ vcl_size_t B_start2 = viennacl::traits::start2(mat2);
+ vcl_size_t B_inc1 = viennacl::traits::stride1(mat2);
+ vcl_size_t B_inc2 = viennacl::traits::stride2(mat2);
+ vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(mat2);
+ vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(mat2);
+
+ vcl_size_t C_start1 = viennacl::traits::start1(mat3);
+ vcl_size_t C_start2 = viennacl::traits::start2(mat3);
+ vcl_size_t C_inc1 = viennacl::traits::stride1(mat3);
+ vcl_size_t C_inc2 = viennacl::traits::stride2(mat3);
+ vcl_size_t C_internal_size1 = viennacl::traits::internal_size1(mat3);
+ vcl_size_t C_internal_size2 = viennacl::traits::internal_size2(mat3);
+
+ if (mat1.row_major())
+ {
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ if (reciprocal_alpha && reciprocal_beta)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A_size1); ++row)
+ for (vcl_size_t col = 0; col < A_size2; ++col)
+ wrapper_A(row, col) += wrapper_B(row, col) / data_alpha + wrapper_C(row, col) / data_beta;
+ }
+ else if (reciprocal_alpha && !reciprocal_beta)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A_size1); ++row)
+ for (vcl_size_t col = 0; col < A_size2; ++col)
+ wrapper_A(row, col) += wrapper_B(row, col) / data_alpha + wrapper_C(row, col) * data_beta;
+ }
+ else if (!reciprocal_alpha && reciprocal_beta)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A_size1); ++row)
+ for (vcl_size_t col = 0; col < A_size2; ++col)
+ wrapper_A(row, col) += wrapper_B(row, col) * data_alpha + wrapper_C(row, col) / data_beta;
+ }
+ else if (!reciprocal_alpha && !reciprocal_beta)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A_size1); ++row)
+ for (vcl_size_t col = 0; col < A_size2; ++col)
+ wrapper_A(row, col) += wrapper_B(row, col) * data_alpha + wrapper_C(row, col) * data_beta;
+ }
+ }
+ else
+ {
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ if (reciprocal_alpha && reciprocal_beta)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long col = 0; col < static_cast<long>(A_size2); ++col)
+ for (vcl_size_t row = 0; row < A_size1; ++row)
+ wrapper_A(row, col) += wrapper_B(row, col) / data_alpha + wrapper_C(row, col) / data_beta;
+ }
+ else if (reciprocal_alpha && !reciprocal_beta)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long col = 0; col < static_cast<long>(A_size2); ++col)
+ for (vcl_size_t row = 0; row < A_size1; ++row)
+ wrapper_A(row, col) += wrapper_B(row, col) / data_alpha + wrapper_C(row, col) * data_beta;
+ }
+ else if (!reciprocal_alpha && reciprocal_beta)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long col = 0; col < static_cast<long>(A_size2); ++col)
+ for (vcl_size_t row = 0; row < A_size1; ++row)
+ wrapper_A(row, col) += wrapper_B(row, col) * data_alpha + wrapper_C(row, col) / data_beta;
+ }
+ else if (!reciprocal_alpha && !reciprocal_beta)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long col = 0; col < static_cast<long>(A_size2); ++col)
+ for (vcl_size_t row = 0; row < A_size1; ++row)
+ wrapper_A(row, col) += wrapper_B(row, col) * data_alpha + wrapper_C(row, col) * data_beta;
+ }
+ }
+
+}
+
+
+
+
+template<typename NumericT>
+void matrix_assign(matrix_base<NumericT> & mat, NumericT s, bool clear = false)
+{
+ typedef NumericT value_type;
+
+ value_type * data_A = detail::extract_raw_pointer<value_type>(mat);
+ value_type alpha = static_cast<value_type>(s);
+
+ vcl_size_t A_start1 = viennacl::traits::start1(mat);
+ vcl_size_t A_start2 = viennacl::traits::start2(mat);
+ vcl_size_t A_inc1 = viennacl::traits::stride1(mat);
+ vcl_size_t A_inc2 = viennacl::traits::stride2(mat);
+ vcl_size_t A_size1 = clear ? viennacl::traits::internal_size1(mat) : viennacl::traits::size1(mat);
+ vcl_size_t A_size2 = clear ? viennacl::traits::internal_size2(mat) : viennacl::traits::size2(mat);
+ vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat);
+ vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat);
+
+ if (mat.row_major())
+ {
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A_size1); ++row)
+ for (vcl_size_t col = 0; col < A_size2; ++col)
+ wrapper_A(static_cast<vcl_size_t>(row), col) = alpha;
+ //data_A[index_generator_A::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)]
+ // = data_B[index_generator_B::mem_index(row * B_inc1 + B_start1, col * B_inc2 + B_start2, B_internal_size1, B_internal_size2)] * alpha;
+ }
+ else
+ {
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long col = 0; col < static_cast<long>(A_size2); ++col)
+ for (vcl_size_t row = 0; row < A_size1; ++row)
+ wrapper_A(row, static_cast<vcl_size_t>(col)) = alpha;
+ //data_A[index_generator_A::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)]
+ // = data_B[index_generator_B::mem_index(row * B_inc1 + B_start1, col * B_inc2 + B_start2, B_internal_size1, B_internal_size2)] * alpha;
+ }
+}
+
+
+
+template<typename NumericT>
+void matrix_diagonal_assign(matrix_base<NumericT> & mat, NumericT s)
+{
+ typedef NumericT value_type;
+
+ value_type * data_A = detail::extract_raw_pointer<value_type>(mat);
+ value_type alpha = static_cast<value_type>(s);
+
+ vcl_size_t A_start1 = viennacl::traits::start1(mat);
+ vcl_size_t A_start2 = viennacl::traits::start2(mat);
+ vcl_size_t A_inc1 = viennacl::traits::stride1(mat);
+ vcl_size_t A_inc2 = viennacl::traits::stride2(mat);
+ vcl_size_t A_size1 = viennacl::traits::size1(mat);
+ //vcl_size_t A_size2 = viennacl::traits::size2(mat);
+ vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat);
+ vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat);
+
+ if (mat.row_major())
+ {
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size1) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A_size1); ++row)
+ wrapper_A(row, row) = alpha;
+ }
+ else
+ {
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size1) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A_size1); ++row)
+ wrapper_A(row, row) = alpha;
+ }
+}
+
+template<typename NumericT>
+void matrix_diag_from_vector(const vector_base<NumericT> & vec, int k, matrix_base<NumericT> & mat)
+{
+ typedef NumericT value_type;
+
+ value_type *data_A = detail::extract_raw_pointer<value_type>(mat);
+ value_type const *data_vec = detail::extract_raw_pointer<value_type>(vec);
+
+ vcl_size_t A_start1 = viennacl::traits::start1(mat);
+ vcl_size_t A_start2 = viennacl::traits::start2(mat);
+ vcl_size_t A_inc1 = viennacl::traits::stride1(mat);
+ vcl_size_t A_inc2 = viennacl::traits::stride2(mat);
+ //vcl_size_t A_size1 = viennacl::traits::size1(mat);
+ //vcl_size_t A_size2 = viennacl::traits::size2(mat);
+ vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat);
+ vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat);
+
+ vcl_size_t v_start = viennacl::traits::start(vec);
+ vcl_size_t v_inc = viennacl::traits::stride(vec);
+ vcl_size_t v_size = viennacl::traits::size(vec);
+
+ vcl_size_t row_start = 0;
+ vcl_size_t col_start = 0;
+
+ if (k >= 0)
+ col_start = static_cast<vcl_size_t>(k);
+ else
+ row_start = static_cast<vcl_size_t>(-k);
+
+ matrix_assign(mat, NumericT(0));
+
+ if (mat.row_major())
+ {
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+ for (vcl_size_t i = 0; i < v_size; ++i)
+ wrapper_A(row_start + i, col_start + i) = data_vec[v_start + i * v_inc];
+ }
+ else
+ {
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+ for (vcl_size_t i = 0; i < v_size; ++i)
+ wrapper_A(row_start + i, col_start + i) = data_vec[v_start + i * v_inc];
+ }
+}
+
+template<typename NumericT>
+void matrix_diag_to_vector(const matrix_base<NumericT> & mat, int k, vector_base<NumericT> & vec)
+{
+ typedef NumericT value_type;
+
+ value_type const * data_A = detail::extract_raw_pointer<value_type>(mat);
+ value_type * data_vec = detail::extract_raw_pointer<value_type>(vec);
+
+ vcl_size_t A_start1 = viennacl::traits::start1(mat);
+ vcl_size_t A_start2 = viennacl::traits::start2(mat);
+ vcl_size_t A_inc1 = viennacl::traits::stride1(mat);
+ vcl_size_t A_inc2 = viennacl::traits::stride2(mat);
+ //vcl_size_t A_size1 = viennacl::traits::size1(mat);
+ //vcl_size_t A_size2 = viennacl::traits::size2(mat);
+ vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat);
+ vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat);
+
+ vcl_size_t v_start = viennacl::traits::start(vec);
+ vcl_size_t v_inc = viennacl::traits::stride(vec);
+ vcl_size_t v_size = viennacl::traits::size(vec);
+
+ vcl_size_t row_start = 0;
+ vcl_size_t col_start = 0;
+
+ if (k >= 0)
+ col_start = static_cast<vcl_size_t>(k);
+ else
+ row_start = static_cast<vcl_size_t>(-k);
+
+ if (mat.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+ for (vcl_size_t i = 0; i < v_size; ++i)
+ data_vec[v_start + i * v_inc] = wrapper_A(row_start + i, col_start + i);
+ }
+ else
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+ for (vcl_size_t i = 0; i < v_size; ++i)
+ data_vec[v_start + i * v_inc] = wrapper_A(row_start + i, col_start + i);
+ }
+}
+
+template<typename NumericT>
+void matrix_row(const matrix_base<NumericT> & mat, unsigned int i, vector_base<NumericT> & vec)
+{
+ typedef NumericT value_type;
+
+ value_type const * data_A = detail::extract_raw_pointer<value_type>(mat);
+ value_type * data_vec = detail::extract_raw_pointer<value_type>(vec);
+
+ vcl_size_t A_start1 = viennacl::traits::start1(mat);
+ vcl_size_t A_start2 = viennacl::traits::start2(mat);
+ vcl_size_t A_inc1 = viennacl::traits::stride1(mat);
+ vcl_size_t A_inc2 = viennacl::traits::stride2(mat);
+ //vcl_size_t A_size1 = viennacl::traits::size1(mat);
+ //vcl_size_t A_size2 = viennacl::traits::size2(mat);
+ vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat);
+ vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat);
+
+ vcl_size_t v_start = viennacl::traits::start(vec);
+ vcl_size_t v_inc = viennacl::traits::stride(vec);
+ vcl_size_t v_size = viennacl::traits::size(vec);
+
+ if (mat.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+ for (vcl_size_t j = 0; j < v_size; ++j)
+ data_vec[v_start + j * v_inc] = wrapper_A(static_cast<vcl_size_t>(i), j);
+ }
+ else
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+ for (vcl_size_t j = 0; j < v_size; ++j)
+ data_vec[v_start + j * v_inc] = wrapper_A(static_cast<vcl_size_t>(i), j);
+ }
+}
+
+template<typename NumericT>
+void matrix_column(const matrix_base<NumericT> & mat, unsigned int j, vector_base<NumericT> & vec)
+{
+ typedef NumericT value_type;
+
+ value_type const * data_A = detail::extract_raw_pointer<value_type>(mat);
+ value_type * data_vec = detail::extract_raw_pointer<value_type>(vec);
+
+ vcl_size_t A_start1 = viennacl::traits::start1(mat);
+ vcl_size_t A_start2 = viennacl::traits::start2(mat);
+ vcl_size_t A_inc1 = viennacl::traits::stride1(mat);
+ vcl_size_t A_inc2 = viennacl::traits::stride2(mat);
+ //vcl_size_t A_size1 = viennacl::traits::size1(mat);
+ //vcl_size_t A_size2 = viennacl::traits::size2(mat);
+ vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat);
+ vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat);
+
+ vcl_size_t v_start = viennacl::traits::start(vec);
+ vcl_size_t v_inc = viennacl::traits::stride(vec);
+ vcl_size_t v_size = viennacl::traits::size(vec);
+
+ if (mat.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+ for (vcl_size_t i = 0; i < v_size; ++i)
+ data_vec[v_start + i * v_inc] = wrapper_A(i, static_cast<vcl_size_t>(j));
+ }
+ else
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+ for (vcl_size_t i = 0; i < v_size; ++i)
+ data_vec[v_start + i * v_inc] = wrapper_A(i, static_cast<vcl_size_t>(j));
+ }
+}
+
+//
+///////////////////////// Element-wise operation //////////////////////////////////
+//
+
+// Binary operations A = B .* C and A = B ./ C
+
+/** @brief Implementation of the element-wise operations A = B .* C and A = B ./ C (using MATLAB syntax)
+*
+* @param A The result matrix (or -range, or -slice)
+* @param proxy The proxy object holding B, C, and the operation
+*/
+template<typename NumericT, typename OpT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_binary<OpT> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type;
+ typedef viennacl::linalg::detail::op_applier<op_element_binary<OpT> > OpFunctor;
+
+ value_type * data_A = detail::extract_raw_pointer<value_type>(A);
+ value_type const * data_B = detail::extract_raw_pointer<value_type>(proxy.lhs());
+ value_type const * data_C = detail::extract_raw_pointer<value_type>(proxy.rhs());
+
+ vcl_size_t A_start1 = viennacl::traits::start1(A);
+ vcl_size_t A_start2 = viennacl::traits::start2(A);
+ vcl_size_t A_inc1 = viennacl::traits::stride1(A);
+ vcl_size_t A_inc2 = viennacl::traits::stride2(A);
+ vcl_size_t A_size1 = viennacl::traits::size1(A);
+ vcl_size_t A_size2 = viennacl::traits::size2(A);
+ vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(A);
+ vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(A);
+
+ vcl_size_t B_start1 = viennacl::traits::start1(proxy.lhs());
+ vcl_size_t B_start2 = viennacl::traits::start2(proxy.lhs());
+ vcl_size_t B_inc1 = viennacl::traits::stride1(proxy.lhs());
+ vcl_size_t B_inc2 = viennacl::traits::stride2(proxy.lhs());
+ vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(proxy.lhs());
+ vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(proxy.lhs());
+
+ vcl_size_t C_start1 = viennacl::traits::start1(proxy.rhs());
+ vcl_size_t C_start2 = viennacl::traits::start2(proxy.rhs());
+ vcl_size_t C_inc1 = viennacl::traits::stride1(proxy.rhs());
+ vcl_size_t C_inc2 = viennacl::traits::stride2(proxy.rhs());
+ vcl_size_t C_internal_size1 = viennacl::traits::internal_size1(proxy.rhs());
+ vcl_size_t C_internal_size2 = viennacl::traits::internal_size2(proxy.rhs());
+
+ if (A.row_major())
+ {
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A_size1); ++row)
+ for (vcl_size_t col = 0; col < A_size2; ++col)
+ OpFunctor::apply(wrapper_A(row, col), wrapper_B(row, col), wrapper_C(row, col));
+ //data_A[index_generator_A::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)]
+ // = data_B[index_generator_B::mem_index(row * B_inc1 + B_start1, col * B_inc2 + B_start2, B_internal_size1, B_internal_size2)] * alpha
+ // + data_C[index_generator_C::mem_index(row * C_inc1 + C_start1, col * C_inc2 + C_start2, C_internal_size1, C_internal_size2)] * beta;
+ }
+ else
+ {
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long col = 0; col < static_cast<long>(A_size2); ++col)
+ for (vcl_size_t row = 0; row < A_size1; ++row)
+ OpFunctor::apply(wrapper_A(row, col), wrapper_B(row, col), wrapper_C(row, col));
+
+ //data_A[index_generator_A::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)]
+ // = data_B[index_generator_B::mem_index(row * B_inc1 + B_start1, col * B_inc2 + B_start2, B_internal_size1, B_internal_size2)] * alpha
+ // + data_C[index_generator_C::mem_index(row * C_inc1 + C_start1, col * C_inc2 + C_start2, C_internal_size1, C_internal_size2)] * beta;
+ }
+}
+
+// Unary operations
+
+// A = op(B)
+template<typename NumericT, typename OpT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<OpT> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type;
+ typedef viennacl::linalg::detail::op_applier<op_element_unary<OpT> > OpFunctor;
+
+ value_type * data_A = detail::extract_raw_pointer<value_type>(A);
+ value_type const * data_B = detail::extract_raw_pointer<value_type>(proxy.lhs());
+
+ vcl_size_t A_start1 = viennacl::traits::start1(A);
+ vcl_size_t A_start2 = viennacl::traits::start2(A);
+ vcl_size_t A_inc1 = viennacl::traits::stride1(A);
+ vcl_size_t A_inc2 = viennacl::traits::stride2(A);
+ vcl_size_t A_size1 = viennacl::traits::size1(A);
+ vcl_size_t A_size2 = viennacl::traits::size2(A);
+ vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(A);
+ vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(A);
+
+ vcl_size_t B_start1 = viennacl::traits::start1(proxy.lhs());
+ vcl_size_t B_start2 = viennacl::traits::start2(proxy.lhs());
+ vcl_size_t B_inc1 = viennacl::traits::stride1(proxy.lhs());
+ vcl_size_t B_inc2 = viennacl::traits::stride2(proxy.lhs());
+ vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(proxy.lhs());
+ vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(proxy.lhs());
+
+ if (A.row_major())
+ {
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A_size1); ++row)
+ for (vcl_size_t col = 0; col < A_size2; ++col)
+ OpFunctor::apply(wrapper_A(row, col), wrapper_B(row, col));
+ }
+ else
+ {
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long col = 0; col < static_cast<long>(A_size2); ++col)
+ for (vcl_size_t row = 0; row < A_size1; ++row)
+ OpFunctor::apply(wrapper_A(row, col), wrapper_B(row, col));
+ }
+}
+
+
+
+//
+///////////////////////// matrix-vector products /////////////////////////////////
+//
+
+// A * x
+
+/** @brief Carries out matrix-vector multiplication
+*
+* Implementation of the convenience expression result = prod(mat, vec);
+*
+* @param mat The matrix
+* @param trans Flag whether mat is to be transposed
+* @param vec The vector
+* @param result The result vector
+*/
+template<typename NumericT>
+void prod_impl(const matrix_base<NumericT> & mat, bool trans,
+ const vector_base<NumericT> & vec,
+ vector_base<NumericT> & result)
+{
+ typedef NumericT value_type;
+
+ value_type const * data_A = detail::extract_raw_pointer<value_type>(mat);
+ value_type const * data_x = detail::extract_raw_pointer<value_type>(vec);
+ value_type * data_result = detail::extract_raw_pointer<value_type>(result);
+
+ vcl_size_t A_start1 = viennacl::traits::start1(mat);
+ vcl_size_t A_start2 = viennacl::traits::start2(mat);
+ vcl_size_t A_inc1 = viennacl::traits::stride1(mat);
+ vcl_size_t A_inc2 = viennacl::traits::stride2(mat);
+ vcl_size_t A_size1 = viennacl::traits::size1(mat);
+ vcl_size_t A_size2 = viennacl::traits::size2(mat);
+ vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat);
+ vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat);
+
+ vcl_size_t start1 = viennacl::traits::start(vec);
+ vcl_size_t inc1 = viennacl::traits::stride(vec);
+
+ vcl_size_t start2 = viennacl::traits::start(result);
+ vcl_size_t inc2 = viennacl::traits::stride(result);
+
+ if (mat.row_major())
+ {
+ if (trans)
+ {
+ vcl_size_t thread_count = 1;
+#ifdef VIENNACL_WITH_OPENMP
+ if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+ thread_count = omp_get_max_threads();
+#endif
+ std::vector<value_type> temp_array(A_size2*thread_count, 0);
+ detail::vector_array_wrapper<value_type> wrapper_res(data_result, start2, inc2);
+
+ for (vcl_size_t col = 0; col < A_size2; ++col)
+ wrapper_res(col) = 0;
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ {
+ vcl_size_t id = 0;
+#ifdef VIENNACL_WITH_OPENMP
+ if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+ id = omp_get_thread_num();
+#endif
+ vcl_size_t begin = (A_size1 * id) / thread_count;
+ vcl_size_t end = (A_size1 * (id + 1)) / thread_count;
+
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_mat(data_A, A_start1 + A_inc1 * begin, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::vector_array_wrapper<value_type const> wrapper_vec(data_x, start1 + inc1 * begin, inc1);
+
+ for (vcl_size_t row = 0; row < (end - begin); ++row) //run through matrix sequentially
+ {
+ value_type temp = wrapper_vec(row);
+ for (vcl_size_t col = 0; col < A_size2; ++col)
+ temp_array[A_size2 * id + col] += wrapper_mat(row , col) * temp;
+ }
+ }
+ for (vcl_size_t id = 0; id < thread_count; ++id)
+ for (vcl_size_t col = 0; col < A_size2; ++col)
+ wrapper_res(col) += temp_array[A_size2 * id + col];
+ }
+
+ else
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A_size1); ++row)
+ {
+ value_type temp = 0;
+ for (vcl_size_t col = 0; col < A_size2; ++col)
+ temp += data_A[viennacl::row_major::mem_index(static_cast<vcl_size_t>(row) * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] * data_x[col * inc1 + start1];
+
+ data_result[static_cast<vcl_size_t>(row) * inc2 + start2] = temp;
+ }
+ }
+ }
+ else
+ {
+ if (!trans)
+ {
+ vcl_size_t thread_count = 1;
+#ifdef VIENNACL_WITH_OPENMP
+ if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+ thread_count = omp_get_max_threads();
+#endif
+ std::vector<value_type> temp_array(A_size1*thread_count, 0);
+ detail::vector_array_wrapper<value_type> wrapper_res(data_result, start2, inc2);
+
+ for (vcl_size_t row = 0; row < A_size1; ++row)
+ wrapper_res(row) = 0;
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ {
+ vcl_size_t id = 0;
+#ifdef VIENNACL_WITH_OPENMP
+ if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+ id = omp_get_thread_num();
+#endif
+ vcl_size_t begin = (A_size2 * id) / thread_count;
+ vcl_size_t end = (A_size2 * (id + 1)) / thread_count;
+
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_mat(data_A, A_start1, A_start2 + A_inc2 * begin, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::vector_array_wrapper<value_type const> wrapper_vec(data_x, start1 + inc1 * begin, inc1);
+
+ for (vcl_size_t col = 0; col < (end - begin); ++col) //run through matrix sequentially
+ {
+ value_type temp = wrapper_vec(col);
+ for (vcl_size_t row = 0; row < A_size1; ++row)
+ temp_array[A_size1 * id + row] += wrapper_mat(row , col) * temp;
+ }
+ }
+ for (vcl_size_t id = 0; id < thread_count; ++id)
+ for (vcl_size_t row = 0; row < A_size1; ++row)
+ wrapper_res(row) += temp_array[A_size1 * id + row];
+ }
+ else
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A_size2); ++row)
+ {
+ value_type temp = 0;
+ for (vcl_size_t col = 0; col < A_size1; ++col)
+ temp += data_A[viennacl::column_major::mem_index(col * A_inc1 + A_start1, static_cast<vcl_size_t>(row) * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] * data_x[col * inc1 + start1];
+
+ data_result[static_cast<vcl_size_t>(row) * inc2 + start2] = temp;
+ }
+ }
+ }
+}
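In the transposed, row-major case the code streams through the matrix row by row (contiguous reads) and accumulates one partial copy of the result per OpenMP thread, summing the copies at the end; the non-transposed case is a plain row-wise dot product. A stripped-down, sequential sketch of the transposed accumulation pattern on a raw buffer (hypothetical helper, not part of ViennaCL):

  #include <cstddef>
  #include <vector>

  // y = A^T * x for a row-major (rows x cols) matrix, using the same
  // "stream the rows, accumulate into y" order as the branch above.
  std::vector<double> gemv_trans_rowmajor(std::vector<double> const & A,
                                          std::size_t rows, std::size_t cols,
                                          std::vector<double> const & x)
  {
    std::vector<double> y(cols, 0.0);
    for (std::size_t row = 0; row < rows; ++row)
    {
      double xi = x[row];
      for (std::size_t col = 0; col < cols; ++col)
        y[col] += A[row * cols + col] * xi;    // contiguous access within each row
    }
    return y;
  }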
+
+
+
+//
+///////////////////////// matrix-matrix products /////////////////////////////////
+//
+
+namespace detail
+{
+ template<typename MatrixAccT1, typename MatrixAccT2, typename MatrixAccT3, typename NumericT>
+ void prod(MatrixAccT1 & A, MatrixAccT2 & B, MatrixAccT3 & C,
+ vcl_size_t C_size1, vcl_size_t C_size2, vcl_size_t A_size2,
+ NumericT alpha, NumericT beta)
+ {
+ if (C_size1 == 0 || C_size2 == 0 || A_size2 == 0)
+ return;
+
+ static const vcl_size_t blocksize = 64;
+
+ vcl_size_t num_blocks_C1 = (C_size1 - 1) / blocksize + 1;
+ vcl_size_t num_blocks_C2 = (C_size2 - 1) / blocksize + 1;
+ vcl_size_t num_blocks_A2 = (A_size2 - 1) / blocksize + 1;
+
+ //
+ // outer loop pair: Run over all blocks with indices (block_idx_i, block_idx_j) of the result matrix C:
+ //
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((C_size1*C_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long block_idx_i2=0; block_idx_i2<static_cast<long>(num_blocks_C1); ++block_idx_i2)
+ {
+ // thread-local auxiliary buffers
+ std::vector<NumericT> buffer_A(blocksize * blocksize); // row-major
+ std::vector<NumericT> buffer_B(blocksize * blocksize); // column-major
+ std::vector<NumericT> buffer_C(blocksize * blocksize); // row-major
+
+ vcl_size_t block_idx_i = static_cast<vcl_size_t>(block_idx_i2);
+ for (vcl_size_t block_idx_j=0; block_idx_j<num_blocks_C2; ++block_idx_j)
+ {
+ // Reset block matrix:
+ std::fill(buffer_C.begin(), buffer_C.end(), NumericT(0));
+
+ vcl_size_t offset_i = block_idx_i*blocksize;
+ vcl_size_t offset_j = block_idx_j*blocksize;
+
+        // C(block_idx_i, block_idx_j) += A(block_idx_i, block_idx_k) * B(block_idx_k, block_idx_j)
+ for (vcl_size_t block_idx_k=0; block_idx_k<num_blocks_A2; ++block_idx_k)
+ {
+ // flush buffers:
+ std::fill(buffer_A.begin(), buffer_A.end(), NumericT(0));
+ std::fill(buffer_B.begin(), buffer_B.end(), NumericT(0));
+
+ vcl_size_t offset_k = block_idx_k*blocksize;
+
+ // load current data:
+ for (vcl_size_t i = offset_i; i < std::min(offset_i + blocksize, C_size1); ++i)
+ for (vcl_size_t k = offset_k; k < std::min(offset_k + blocksize, A_size2); ++k)
+ buffer_A[(i - offset_i) * blocksize + (k - offset_k)] = A(i, k);
+
+ for (vcl_size_t j = offset_j; j < std::min(offset_j + blocksize, C_size2); ++j)
+ for (vcl_size_t k = offset_k; k < std::min(offset_k + blocksize, A_size2); ++k)
+ buffer_B[(k - offset_k) + (j - offset_j) * blocksize] = B(k, j);
+
+ // multiply (this is the hot spot in terms of flops)
+ for (vcl_size_t i = 0; i < blocksize; ++i)
+ {
+ NumericT const * ptrA = &(buffer_A[i*blocksize]);
+ for (vcl_size_t j = 0; j < blocksize; ++j)
+ {
+ NumericT const * ptrB = &(buffer_B[j*blocksize]);
+
+ NumericT temp = NumericT(0);
+ for (vcl_size_t k = 0; k < blocksize; ++k)
+ temp += ptrA[k] * ptrB[k]; // buffer_A[i*blocksize + k] * buffer_B[k + j*blocksize];
+
+ buffer_C[i*blocksize + j] += temp;
+ }
+ }
+ }
+
+ // write result:
+ if (beta > 0 || beta < 0)
+ {
+ for (vcl_size_t i = offset_i; i < std::min(offset_i + blocksize, C_size1); ++i)
+ for (vcl_size_t j = offset_j; j < std::min(offset_j + blocksize, C_size2); ++j)
+ C(i,j) = beta * C(i,j) + alpha * buffer_C[(i - offset_i) * blocksize + (j - offset_j)];
+ }
+ else
+ {
+ for (vcl_size_t i = offset_i; i < std::min(offset_i + blocksize, C_size1); ++i)
+ for (vcl_size_t j = offset_j; j < std::min(offset_j + blocksize, C_size2); ++j)
+ C(i,j) = alpha * buffer_C[(i - offset_i) * blocksize + (j - offset_j)];
+ }
+
+ } // for block j
+ } // for block i
+
+ } // prod()
+
+} // namespace detail
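detail::prod() is a cache-blocked GEMM: C is processed in 64 x 64 panels, and for each panel the corresponding blocks of A and B are first packed into small contiguous buffers (A row-major, B column-major) so that the innermost k-loop becomes a contiguous dot product. A condensed sketch of the same blocking idea without the packing buffers, alpha/beta handling, or OpenMP (hypothetical helper, not ViennaCL code):

  #include <algorithm>
  #include <cstddef>
  #include <vector>

  // C (m x n) = A (m x k) * B (k x n), all row-major, blocked for cache reuse.
  void gemm_blocked(std::vector<double> const & A, std::vector<double> const & B,
                    std::vector<double> & C,
                    std::size_t m, std::size_t n, std::size_t k)
  {
    const std::size_t bs = 64;                          // same block size as detail::prod()
    std::fill(C.begin(), C.end(), 0.0);
    for (std::size_t i0 = 0; i0 < m; i0 += bs)
      for (std::size_t j0 = 0; j0 < n; j0 += bs)
        for (std::size_t k0 = 0; k0 < k; k0 += bs)
        {
          const std::size_t i_end = std::min(i0 + bs, m);
          const std::size_t j_end = std::min(j0 + bs, n);
          const std::size_t k_end = std::min(k0 + bs, k);
          for (std::size_t i = i0; i < i_end; ++i)
            for (std::size_t j = j0; j < j_end; ++j)
            {
              double sum = 0.0;
              for (std::size_t kk = k0; kk < k_end; ++kk)
                sum += A[i * k + kk] * B[kk * n + j];   // hot inner loop
              C[i * n + j] += sum;
            }
        }
  }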
+
+/** @brief Carries out matrix-matrix multiplication
+*
+* Implementation of C = alpha * prod(op(A), op(B)) + beta * C, where op() is the identity or the transpose depending on trans_A and trans_B
+*
+*/
+template<typename NumericT, typename ScalarT1, typename ScalarT2 >
+void prod_impl(const matrix_base<NumericT> & A, bool trans_A,
+ const matrix_base<NumericT> & B, bool trans_B,
+ matrix_base<NumericT> & C,
+ ScalarT1 alpha,
+ ScalarT2 beta)
+{
+ typedef NumericT value_type;
+
+ value_type const * data_A = detail::extract_raw_pointer<value_type>(A);
+ value_type const * data_B = detail::extract_raw_pointer<value_type>(B);
+ value_type * data_C = detail::extract_raw_pointer<value_type>(C);
+
+ vcl_size_t A_start1 = viennacl::traits::start1(A);
+ vcl_size_t A_start2 = viennacl::traits::start2(A);
+ vcl_size_t A_inc1 = viennacl::traits::stride1(A);
+ vcl_size_t A_inc2 = viennacl::traits::stride2(A);
+ vcl_size_t A_size1 = viennacl::traits::size1(A);
+ vcl_size_t A_size2 = viennacl::traits::size2(A);
+ vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(A);
+ vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(A);
+
+ vcl_size_t B_start1 = viennacl::traits::start1(B);
+ vcl_size_t B_start2 = viennacl::traits::start2(B);
+ vcl_size_t B_inc1 = viennacl::traits::stride1(B);
+ vcl_size_t B_inc2 = viennacl::traits::stride2(B);
+ vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(B);
+ vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(B);
+
+ vcl_size_t C_start1 = viennacl::traits::start1(C);
+ vcl_size_t C_start2 = viennacl::traits::start2(C);
+ vcl_size_t C_inc1 = viennacl::traits::stride1(C);
+ vcl_size_t C_inc2 = viennacl::traits::stride2(C);
+ vcl_size_t C_size1 = viennacl::traits::size1(C);
+ vcl_size_t C_size2 = viennacl::traits::size2(C);
+ vcl_size_t C_internal_size1 = viennacl::traits::internal_size1(C);
+ vcl_size_t C_internal_size2 = viennacl::traits::internal_size2(C);
+
+ if (!trans_A && !trans_B)
+ {
+ if (A.row_major() && B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (A.row_major() && B.row_major() && !C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (A.row_major() && !B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (A.row_major() && !B.row_major() && !C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (!A.row_major() && B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (!A.row_major() && B.row_major() && !C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (!A.row_major() && !B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ }
+ else if (!trans_A && trans_B)
+ {
+ if (A.row_major() && B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (A.row_major() && B.row_major() && !C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (A.row_major() && !B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (A.row_major() && !B.row_major() && !C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (!A.row_major() && B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (!A.row_major() && B.row_major() && !C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (!A.row_major() && !B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ }
+ else if (trans_A && !trans_B)
+ {
+ if (A.row_major() && B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (A.row_major() && B.row_major() && !C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (A.row_major() && !B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (A.row_major() && !B.row_major() && !C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (!A.row_major() && B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (!A.row_major() && B.row_major() && !C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (!A.row_major() && !B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ }
+ else if (trans_A && trans_B)
+ {
+ if (A.row_major() && B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (A.row_major() && B.row_major() && !C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (A.row_major() && !B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (A.row_major() && !B.row_major() && !C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (!A.row_major() && B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (!A.row_major() && B.row_major() && !C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (!A.row_major() && !B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ }
+}
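The if/else ladder above only selects the right matrix_array_wrapper instantiation for the 32 combinations of trans_A, trans_B and the row-/column-major layouts of A, B and C; the arithmetic is always delegated to detail::prod(). User code reaches it through the convenience API; a minimal sketch, assuming the standard viennacl::linalg::prod() and viennacl::trans() front ends:

  #include "viennacl/matrix.hpp"
  #include "viennacl/linalg/prod.hpp"

  void gemm_example()
  {
    viennacl::matrix<float> A(64, 32), B(32, 48), C(64, 48), D(32, 48);
    // ... fill A and B ...
    C = viennacl::linalg::prod(A, B);                     // C = A * B
    D = viennacl::linalg::prod(viennacl::trans(A), C);    // D = A^T * C, exercises a trans_A branch
  }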
+
+
+
+
+//
+///////////////////////// miscellaneous operations /////////////////////////////////
+//
+
+
+/** @brief The implementation of the operation mat += alpha * vec1 * vec2^T, i.e. a scaled rank 1 update
+*
+* Implementation of the convenience expression result += alpha * outer_prod(vec1, vec2);
+*
+* @param mat1 The matrix to be updated
+* @param alpha The scaling factor (either a viennacl::scalar<>, float, or double)
+* @param reciprocal_alpha Use 1/alpha instead of alpha
+* @param flip_sign_alpha Use -alpha instead of alpha
+* @param vec1 The first vector
+* @param vec2 The second vector
+*/
+template<typename NumericT, typename ScalarT>
+void scaled_rank_1_update(matrix_base<NumericT> & mat1,
+ ScalarT const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha,
+ const vector_base<NumericT> & vec1,
+ const vector_base<NumericT> & vec2)
+{
+ typedef NumericT value_type;
+
+ value_type * data_A = detail::extract_raw_pointer<value_type>(mat1);
+ value_type const * data_v1 = detail::extract_raw_pointer<value_type>(vec1);
+ value_type const * data_v2 = detail::extract_raw_pointer<value_type>(vec2);
+
+ vcl_size_t A_start1 = viennacl::traits::start1(mat1);
+ vcl_size_t A_start2 = viennacl::traits::start2(mat1);
+ vcl_size_t A_inc1 = viennacl::traits::stride1(mat1);
+ vcl_size_t A_inc2 = viennacl::traits::stride2(mat1);
+ vcl_size_t A_size1 = viennacl::traits::size1(mat1);
+ vcl_size_t A_size2 = viennacl::traits::size2(mat1);
+ vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat1);
+ vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat1);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+
+ vcl_size_t start2 = viennacl::traits::start(vec2);
+ vcl_size_t inc2 = viennacl::traits::stride(vec2);
+
+ value_type data_alpha = alpha;
+ if (flip_sign_alpha)
+ data_alpha = -data_alpha;
+
+ if (mat1.row_major())
+ {
+ if(reciprocal_alpha)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A_size1); ++row)
+ {
+ value_type value_v1 = data_v1[static_cast<vcl_size_t>(row) * inc1 + start1] / data_alpha;
+ for (vcl_size_t col = 0; col < A_size2; ++col)
+ data_A[viennacl::row_major::mem_index(static_cast<vcl_size_t>(row) * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] += value_v1 * data_v2[col * inc2 + start2];
+ }
+ }
+ else
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A_size1); ++row)
+ {
+ value_type value_v1 = data_v1[static_cast<vcl_size_t>(row) * inc1 + start1] * data_alpha;
+ for (vcl_size_t col = 0; col < A_size2; ++col)
+ data_A[viennacl::row_major::mem_index(static_cast<vcl_size_t>(row) * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] += value_v1 * data_v2[col * inc2 + start2];
+ }
+ }
+ }
+ else
+ {
+ if(reciprocal_alpha)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long col = 0; col < static_cast<long>(A_size2); ++col) //run through matrix sequentially
+ {
+ value_type value_v2 = data_v2[static_cast<vcl_size_t>(col) * inc2 + start2] / data_alpha;
+ for (vcl_size_t row = 0; row < A_size1; ++row)
+ data_A[viennacl::column_major::mem_index(row * A_inc1 + A_start1, static_cast<vcl_size_t>(col) * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] += data_v1[row * inc1 + start1] * value_v2;
+ }
+ }
+ else
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long col = 0; col < static_cast<long>(A_size2); ++col) //run through matrix sequentially
+ {
+ value_type value_v2 = data_v2[static_cast<vcl_size_t>(col) * inc2 + start2] * data_alpha;
+ for (vcl_size_t row = 0; row < A_size1; ++row)
+ data_A[viennacl::column_major::mem_index(row * A_inc1 + A_start1, static_cast<vcl_size_t>(col) * A_inc2 + A_start2, A_intern
<TRUNCATED>
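The column-major tail of scaled_rank_1_update() is cut off above, but it mirrors the row-major branch with the loop nest swapped. The operation being implemented is mat1 += alpha * vec1 * vec2^T (or with 1/alpha and/or -alpha, per the flags). A minimal dense row-major sketch of the same update (hypothetical helper, not ViennaCL code):

  #include <cstddef>
  #include <vector>

  // A (rows x cols, row-major) += alpha * v1 * v2^T
  void rank1_update(std::vector<double> & A, std::size_t rows, std::size_t cols,
                    double alpha,
                    std::vector<double> const & v1, std::vector<double> const & v2)
  {
    for (std::size_t i = 0; i < rows; ++i)
    {
      const double scaled = alpha * v1[i];     // hoisted, as in the row-major branch above
      for (std::size_t j = 0; j < cols; ++j)
        A[i * cols + j] += scaled * v2[j];
    }
  }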
[43/51] [partial] mahout git commit: (nojira) add native-viennaCL
module to codebase. closes apache/mahout#241
Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/mapped_objects.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/mapped_objects.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/mapped_objects.hpp
new file mode 100644
index 0000000..19f7993
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/mapped_objects.hpp
@@ -0,0 +1,512 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_MAPPED_TYPE_HPP
+#define VIENNACL_DEVICE_SPECIFIC_MAPPED_TYPE_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/device_specific/mapped_objects.hpp
+ @brief Map ViennaCL objects to generator wrappers
+*/
+
+#include <string>
+
+#include "viennacl/scheduler/forwards.h"
+#include "viennacl/device_specific/forwards.h"
+#include "viennacl/device_specific/utils.hpp"
+
+namespace viennacl
+{
+
+namespace device_specific
+{
+
+/** @brief Mapped Object
+*
+* This object populates the symbolic mapping associated with a statement. (root_id, LHS|RHS|PARENT) => mapped_object
+* The tree can then be reconstructed in its symbolic form
+*/
+class mapped_object
+{
+private:
+ virtual void postprocess(std::string &) const { }
+
+protected:
+ struct MorphBase { virtual ~MorphBase(){} };
+ struct MorphBase1D : public MorphBase { public: virtual std::string operator()(std::string const & i) const = 0; };
+ struct MorphBase2D : public MorphBase { public: virtual std::string operator()(std::string const & i, std::string const & j) const = 0; };
+
+ static void replace_offset(std::string & str, MorphBase const & morph)
+ {
+ vcl_size_t pos = 0;
+ while ((pos=str.find("$OFFSET", pos))!=std::string::npos)
+ {
+ std::string postprocessed;
+ vcl_size_t pos_po = str.find('{', pos);
+ vcl_size_t pos_pe = str.find('}', pos_po);
+
+ if (MorphBase2D const * p2d = dynamic_cast<MorphBase2D const *>(&morph))
+ {
+ vcl_size_t pos_comma = str.find(',', pos_po);
+ std::string i = str.substr(pos_po + 1, pos_comma - pos_po - 1);
+ std::string j = str.substr(pos_comma + 1, pos_pe - pos_comma - 1);
+ postprocessed = (*p2d)(i, j);
+ }
+ else if (MorphBase1D const * p1d = dynamic_cast<MorphBase1D const *>(&morph))
+ {
+ std::string i = str.substr(pos_po + 1, pos_pe - pos_po - 1);
+ postprocessed = (*p1d)(i);
+ }
+
+ str.replace(pos, pos_pe + 1 - pos, postprocessed);
+ pos = pos_pe;
+ }
+ }
+
+ void register_attribute(std::string & attribute, std::string const & key, std::string const & value)
+ {
+ attribute = value;
+ keywords_[key] = attribute;
+ }
+
+public:
+ struct node_info
+ {
+ node_info(mapping_type const * _mapping, scheduler::statement const * _statement, vcl_size_t _root_idx) :
+ mapping(_mapping), statement(_statement), root_idx(_root_idx) { }
+ mapping_type const * mapping;
+ scheduler::statement const * statement;
+ vcl_size_t root_idx;
+ };
+
+public:
+ mapped_object(std::string const & scalartype, unsigned int id, std::string const & type_key) : type_key_(type_key)
+ {
+ register_attribute(scalartype_, "#scalartype", scalartype);
+ register_attribute(name_, "#name", "obj" + tools::to_string(id));
+ }
+
+ virtual ~mapped_object(){ }
+
+ virtual std::string & append_kernel_arguments(std::set<std::string> &, std::string & str, unsigned int) const { return str; }
+
+ std::string type_key() const { return type_key_; }
+
+ std::string const & name() const { return name_; }
+
+ std::string process(std::string const & in) const
+ {
+ std::string res(in);
+ for (std::map<std::string,std::string>::const_iterator it = keywords_.begin(); it != keywords_.end(); ++it)
+ tools::find_and_replace(res, it->first, it->second);
+ postprocess(res);
+ return res;
+ }
+
+ std::string evaluate(std::map<std::string, std::string> const & accessors) const
+ {
+ if (accessors.find(type_key_)==accessors.end())
+ return name_;
+ return process(at(accessors, type_key_));
+ }
+
+
+protected:
+ std::string name_;
+ std::string scalartype_;
+ std::string type_key_;
+ std::map<std::string, std::string> keywords_;
+};
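process() is plain keyword substitution: every "#key" registered through register_attribute() (e.g. "#name", "#scalartype") is replaced by its value, after which postprocess() may rewrite layout-dependent placeholders such as $OFFSET{i,j}. A self-contained sketch of the substitution step using only the standard library (tools::find_and_replace is assumed to behave like this):

  #include <map>
  #include <string>

  // Replace every occurrence of each registered keyword by its value.
  std::string substitute_keywords(std::string str,
                                  std::map<std::string, std::string> const & keywords)
  {
    for (std::map<std::string, std::string>::const_iterator it = keywords.begin();
         it != keywords.end(); ++it)
    {
      std::string::size_type pos = 0;
      while ((pos = str.find(it->first, pos)) != std::string::npos)
      {
        str.replace(pos, it->first.size(), it->second);
        pos += it->second.size();              // skip past the replacement
      }
    }
    return str;
  }

For example, substitute_keywords("#scalartype #name;", keywords) with keywords = { {"#scalartype", "float"}, {"#name", "obj0"} } yields "float obj0;".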
+
+
+/** @brief Binary leaf interface
+*
+* Some subtrees have to be interpreted as leaves when reconstructing the final expression. This is the case for trans(), diag(), prod(), etc.
+* This interface stores basic information about those subtrees.
+*/
+class binary_leaf
+{
+public:
+ binary_leaf(mapped_object::node_info info) : info_(info){ }
+
+ void process_recursive(utils::kernel_generation_stream & stream, leaf_t leaf, std::string const & key, std::string const & process_str, std::set<std::string> & already_fetched)
+ {
+ tree_parsing::process(stream, leaf, key, process_str, *info_.statement, info_.root_idx, *info_.mapping, already_fetched);
+ }
+
+ std::string evaluate_recursive(leaf_t leaf, std::map<std::string, std::string> const & accessors)
+ {
+ return tree_parsing::evaluate(leaf, accessors, *info_.statement, info_.root_idx, *info_.mapping);
+ }
+
+protected:
+ mapped_object::node_info info_;
+};
+
+/** @brief Matrix product
+ *
+ * Maps prod(matrix_expression, matrix_expression)
+ */
+class mapped_matrix_product : public mapped_object, public binary_leaf
+{
+public:
+ mapped_matrix_product(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "matrix_product"), binary_leaf(info) { }
+};
+
+/** @brief Reduction
+*
+* Base class for mapping a reduction
+*/
+class mapped_reduction : public mapped_object, public binary_leaf
+{
+public:
+ mapped_reduction(std::string const & scalartype, unsigned int id, node_info info, std::string const & type_key) : mapped_object(scalartype, id, type_key), binary_leaf(info){ }
+
+ vcl_size_t root_idx() const { return info_.root_idx; }
+ scheduler::statement const & statement() const { return *info_.statement; }
+ scheduler::statement_node root_node() const { return statement().array()[root_idx()]; }
+ bool is_index_reduction() const { return utils::is_index_reduction(info_.statement->array()[info_.root_idx].op); }
+
+ scheduler::op_element root_op() const
+ {
+ scheduler::op_element res = info_.statement->array()[info_.root_idx].op;
+ if (res.type==scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE
+ ||res.type==scheduler::OPERATION_BINARY_INNER_PROD_TYPE)
+ res.type = scheduler::OPERATION_BINARY_ADD_TYPE;
+ return res;
+ }
+};
+
+/** @brief Scalar reduction
+*
+* Maps a scalar reduction (max, min, argmax, inner_prod, etc.)
+*/
+class mapped_scalar_reduction : public mapped_reduction
+{
+public:
+ mapped_scalar_reduction(std::string const & scalartype, unsigned int id, node_info info) : mapped_reduction(scalartype, id, info, "scalar_reduction"){ }
+};
+
+/** @brief Vector reduction
+*
+* Maps a row-wise reduction (max, min, argmax, matrix-vector product, etc.)
+*/
+class mapped_row_wise_reduction : public mapped_reduction
+{
+public:
+ mapped_row_wise_reduction(std::string const & scalartype, unsigned int id, node_info info) : mapped_reduction(scalartype, id, info, "row_wise_reduction") { }
+};
+
+/** @brief Host scalar
+ *
+ * Maps a host scalar (passed by value)
+ */
+class mapped_host_scalar : public mapped_object
+{
+public:
+ mapped_host_scalar(std::string const & scalartype, unsigned int id) : mapped_object(scalartype, id, "host_scalar"){ }
+
+ std::string & append_kernel_arguments(std::set<std::string> & already_generated, std::string & str, unsigned int width) const
+ {
+ if (already_generated.insert(name_).second)
+ str += generate_value_kernel_argument(utils::append_width(scalartype_, width), name_);
+ return str;
+ }
+};
+
+/** @brief Handle
+*
+* Maps an object passed by pointer
+*/
+class mapped_handle : public mapped_object
+{
+private:
+ virtual void append_optional_arguments(std::string &) const = 0;
+
+public:
+ mapped_handle(std::string const & scalartype, unsigned int id, std::string const & type_key) : mapped_object(scalartype, id, type_key)
+ {
+ register_attribute(pointer_, "#pointer", name_ + "_pointer");
+ }
+
+ std::string & append_kernel_arguments(std::set<std::string> & already_generated, std::string & str, unsigned int width) const
+ {
+ if (already_generated.insert(name_).second)
+ {
+ str += generate_pointer_kernel_argument("__global", utils::append_width(scalartype_, width), pointer_);
+ append_optional_arguments(str);
+ }
+ return str;
+ }
+
+private:
+ std::string pointer_;
+};
+
+
+/** @brief Scalar
+ *
+ * Maps a scalar passed by pointer
+ */
+class mapped_scalar : public mapped_handle
+{
+private:
+ void append_optional_arguments(std::string &) const{ }
+
+public:
+ mapped_scalar(std::string const & scalartype, unsigned int id) : mapped_handle(scalartype, id, "scalar") { }
+};
+
+/** @brief Buffered
+ *
+ * Maps a buffered object (vector, matrix)
+ */
+class mapped_buffer : public mapped_handle
+{
+public:
+ mapped_buffer(std::string const & scalartype, unsigned int id, std::string const & type_key) : mapped_handle(scalartype, id, type_key){ }
+};
+
+/** @brief Vector
+ *
+ * Maps a vector
+ */
+class mapped_vector : public mapped_buffer
+{
+ void append_optional_arguments(std::string & str) const
+ {
+ str += generate_value_kernel_argument("unsigned int", start_);
+ str += generate_value_kernel_argument("unsigned int", stride_);
+ }
+
+public:
+ mapped_vector(std::string const & scalartype, unsigned int id) : mapped_buffer(scalartype, id, "vector")
+ {
+ register_attribute(start_, "#start", name_ + "_start");
+ register_attribute(stride_, "#stride", name_ + "_stride");
+ }
+
+private:
+ std::string start_;
+ std::string stride_;
+};
+
+/** @brief Matrix
+ *
+ * Maps a matrix
+ */
+class mapped_matrix : public mapped_buffer
+{
+private:
+ void append_optional_arguments(std::string & str) const
+ {
+ str += generate_value_kernel_argument("unsigned int", ld_);
+ str += generate_value_kernel_argument("unsigned int", start1_);
+ str += generate_value_kernel_argument("unsigned int", start2_);
+ str += generate_value_kernel_argument("unsigned int", stride1_);
+ str += generate_value_kernel_argument("unsigned int", stride2_);
+ }
+
+ void postprocess(std::string & str) const
+ {
+ struct Morph : public MorphBase2D
+ {
+ Morph(bool _is_row_major, std::string const & _ld) : is_row_major(_is_row_major), ld(_ld){ }
+ std::string operator()(std::string const & i, std::string const & j) const
+ {
+ if (is_row_major)
+ return "(" + i + ") * " + ld + " + (" + j + ")";
+ return "(" + i + ") + (" + j + ") * " + ld;
+ }
+ private:
+ bool is_row_major;
+ std::string const & ld;
+ };
+ replace_offset(str, Morph(row_major_, ld_));
+ }
+
+public:
+ mapped_matrix(std::string const & scalartype, unsigned int id, bool row_major) : mapped_buffer(scalartype, id, "matrix"), row_major_(row_major)
+ {
+ register_attribute(ld_, "#ld", name_ + "_ld");
+ register_attribute(start1_, "#start1", name_ + "_start1");
+ register_attribute(start2_, "#start2", name_ + "_start2");
+ register_attribute(stride1_, "#stride1", name_ + "_stride1");
+ register_attribute(stride2_, "#stride2", name_ + "_stride2");
+ if (row_major_)
+ keywords_["#nldstride"] = "#stride1";
+ else
+ keywords_["#nldstride"] = "#stride2";
+
+ if (row_major_)
+ {
+ std::swap(start1_, start2_);
+ std::swap(stride1_, stride2_);
+ }
+ }
+
+ bool row_major() const
+ {
+ return row_major_;
+ }
+
+private:
+ std::string ld_;
+ std::string start1_;
+ std::string start2_;
+ std::string stride1_;
+ std::string stride2_;
+ bool row_major_;
+};
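The Morph in mapped_matrix::postprocess() is where the storage layout becomes an index expression: $OFFSET{i,j} expands to "(i) * ld + (j)" for row-major buffers and to "(i) + (j) * ld" for column-major ones. The equivalent host-side computation is simply:

  #include <cstddef>

  // Linear offset of element (i, j) in a buffer with leading dimension ld.
  inline std::size_t element_offset(std::size_t i, std::size_t j,
                                    std::size_t ld, bool row_major)
  {
    return row_major ? i * ld + j    // row-major: consecutive j are contiguous
                     : i + j * ld;   // column-major: consecutive i are contiguous
  }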
+
+/** @brief Vector diag
+*
+* Maps a diag(vector_expression) node into a diagonal matrix
+*/
+class mapped_vector_diag : public mapped_object, public binary_leaf
+{
+private:
+ void postprocess(std::string &res) const
+ {
+ std::map<std::string, std::string> accessors;
+ tools::find_and_replace(res, "#diag_offset", tree_parsing::evaluate(RHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping));
+ accessors["vector"] = res;
+ res = tree_parsing::evaluate(LHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping);
+ }
+
+public:
+ mapped_vector_diag(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "vector_diag"), binary_leaf(info){ }
+};
+
+
+/** @brief Trans
+*
+* Maps trans(matrix_expression) to the transpose of matrix_expression
+*/
+class mapped_trans: public mapped_object, public binary_leaf
+{
+private:
+ void postprocess(std::string &res) const
+ {
+ std::map<std::string, std::string> accessors;
+ accessors["matrix"] = res;
+ res = tree_parsing::evaluate(LHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping);
+ }
+
+public:
+ mapped_trans(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "matrix_trans"), binary_leaf(info){ }
+};
+
+/** @brief Matrix row
+*
+* Maps row(matrix_expression, scalar_expression) to the row of matrix_expression selected by scalar_expression
+*/
+class mapped_matrix_row : public mapped_object, binary_leaf
+{
+private:
+ void postprocess(std::string &res) const
+ {
+ std::map<std::string, std::string> accessors;
+ tools::find_and_replace(res, "#row", tree_parsing::evaluate(RHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping));
+ accessors["matrix"] = res;
+ res = tree_parsing::evaluate(LHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping);
+ }
+
+public:
+ mapped_matrix_row(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "matrix_row"), binary_leaf(info)
+ { }
+};
+
+
+/** @brief Matrix column
+*
+* Maps column(matrix_expression, scalar_expression) to the column of matrix_expression selected by scalar_expression
+*/
+class mapped_matrix_column : public mapped_object, binary_leaf
+{
+private:
+ void postprocess(std::string &res) const
+ {
+ std::map<std::string, std::string> accessors;
+ tools::find_and_replace(res, "#column", tree_parsing::evaluate(RHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping));
+ accessors["matrix"] = res;
+ res = tree_parsing::evaluate(LHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping);
+ }
+
+public:
+ mapped_matrix_column(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "matrix_column"), binary_leaf(info)
+ { }
+};
+
+/** @brief Matrix diag
+*
+* Maps a diag(matrix_expression) node into the vector of its diagonal elements
+*/
+class mapped_matrix_diag : public mapped_object, binary_leaf
+{
+private:
+ void postprocess(std::string &res) const
+ {
+ std::map<std::string, std::string> accessors;
+ tools::find_and_replace(res, "#diag_offset", tree_parsing::evaluate(RHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping));
+ accessors["matrix"] = res;
+ res = tree_parsing::evaluate(LHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping);
+ }
+
+public:
+ mapped_matrix_diag(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "matrix_diag"), binary_leaf(info)
+ { }
+};
+
+/** @brief Implicit vector
+ *
+ * Maps an implicit vector
+ */
+class mapped_implicit_vector : public mapped_object
+{
+public:
+ mapped_implicit_vector(std::string const & scalartype, unsigned int id) : mapped_object(scalartype, id, "implicit_vector")
+ { }
+
+ std::string & append_kernel_arguments(std::set<std::string> & /*already_generated*/, std::string & str, unsigned int width) const
+ {
+ str += generate_value_kernel_argument(utils::append_width(scalartype_, width), name_);
+ return str;
+ }
+};
+
+/** @brief Implicit matrix
+ *
+ * Maps an implicit matrix
+ */
+class mapped_implicit_matrix : public mapped_object
+{
+public:
+ mapped_implicit_matrix(std::string const & scalartype, unsigned int id) : mapped_object(scalartype, id, "implicit_matrix")
+ { }
+
+ std::string & append_kernel_arguments(std::set<std::string> & /*already_generated*/, std::string & str, unsigned int width) const
+ {
+ str += generate_value_kernel_argument(utils::append_width(scalartype_, width), name_);
+ return str;
+ }
+};
+
+}
+
+}
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/matrix_product_template.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/matrix_product_template.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/matrix_product_template.hpp
new file mode 100644
index 0000000..1f082ac
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/matrix_product_template.hpp
@@ -0,0 +1,859 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_TEMPLATES_MATRIX_PRODUCT_HPP
+#define VIENNACL_DEVICE_SPECIFIC_TEMPLATES_MATRIX_PRODUCT_HPP
+
+/* =========================================================================
+Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+(A list of authors and contributors can be found in the manual)
+
+License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/device_specific/templates/matrix_product_template.hpp
+*
+* Kernel template for the matrix product operation
+*/
+
+#include <vector>
+
+#include "viennacl/scheduler/forwards.h"
+
+#include "viennacl/detail/matrix_def.hpp"
+#include "viennacl/matrix_proxy.hpp"
+
+#include "viennacl/device_specific/templates/template_base.hpp"
+#include "viennacl/device_specific/mapped_objects.hpp"
+#include "viennacl/device_specific/utils.hpp"
+#include "viennacl/device_specific/tree_parsing.hpp"
+#include "viennacl/forwards.h"
+
+#include "viennacl/tools/tools.hpp"
+
+namespace viennacl
+{
+namespace device_specific
+{
+
+struct matrix_product_parameters : public template_base::parameters_type
+{
+ matrix_product_parameters(unsigned int simd_width
+ , unsigned int local_size_0, unsigned int KL, unsigned int local_size_1
+ , unsigned int ms, unsigned int ks, unsigned int ns
+ , fetching_policy_type A_fetching_policy_param, fetching_policy_type B_fetching_policy_param
+ , unsigned int local_fetch_0_param, unsigned int local_fetch_1_param): template_base::parameters_type(simd_width, local_size_0, local_size_1, 1),
+ kL(KL), mS(ms), kS(ks), nS(ns), A_fetching_policy(A_fetching_policy_param), B_fetching_policy(B_fetching_policy_param),
+ local_fetch_0(local_fetch_0_param), local_fetch_1(local_fetch_1_param),
+ mL(ms*local_size_0), nL(ns*local_size_1){}
+
+ unsigned int kL;
+
+ unsigned int mS;
+ unsigned int kS;
+ unsigned int nS;
+
+ fetching_policy_type A_fetching_policy;
+ fetching_policy_type B_fetching_policy;
+
+ unsigned int local_fetch_0;
+ unsigned int local_fetch_1;
+
+ unsigned int mL;
+ unsigned int nL;
+};
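Reading the derived members back: a work-group of local_size_0 x local_size_1 work items computes an mL x nL tile of C, with mL = mS * local_size_0 and nL = nS * local_size_1, stepping through the K dimension in slices of kL (with kS <= kL per register-level step, see the check below). For example, with local_size_0 = local_size_1 = 16 and mS = nS = 4, each work-group covers a 64 x 64 tile of C while each work item accumulates a 4 x 4 register tile.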
+
+class matrix_product_template : public template_base_impl<matrix_product_template, matrix_product_parameters>
+{
+
+private:
+ unsigned int n_lmem_elements() const
+ {
+ unsigned int N = 0;
+ if (p_.A_fetching_policy==FETCH_FROM_LOCAL)
+ N += p_.kL * (p_.mL+1);
+ if (p_.B_fetching_policy==FETCH_FROM_LOCAL)
+ N += p_.nL * (p_.kL+1);
+ return N;
+ }
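As a worked example of the local-memory estimate above: with kL = 8, mL = nL = 64 and both operands fetched through local memory, N = 8 * (64 + 1) + 64 * (8 + 1) = 520 + 576 = 1096 elements, i.e. roughly 4.3 KB of local memory for float data (the "+1" pads each tile's leading dimension).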
+
+ int check_invalid_impl(viennacl::ocl::device const & /*device*/) const
+ {
+    if (p_.A_fetching_policy!=FETCH_FROM_LOCAL && p_.B_fetching_policy!=FETCH_FROM_LOCAL && (p_.local_fetch_0!=0 || p_.local_fetch_1!=0))
+ return TEMPLATE_GLOBAL_MEMORY_REQUIRES_ZERO_LOCAL_FETCH;
+
+ if ((p_.mS % p_.simd_width) > 0 || (p_.nS % p_.simd_width) > 0)
+ return TEMPLATE_MS_NS_MUST_BE_SIMD_WIDTH_MULTIPLE;
+
+ if (p_.kS > p_.kL)
+ return TEMPLATE_KS_MUST_BE_SMALLER_THAN_KL;
+
+ if (!(A_trans_=='N' && B_trans_=='T') && p_.simd_width>1)
+ return TEMPLATE_SIMD_WIDTH_MUST_BE_ONE;
+
+ if (p_.A_fetching_policy==FETCH_FROM_LOCAL || p_.B_fetching_policy==FETCH_FROM_LOCAL)
+ {
+ if ((p_.local_fetch_0*p_.local_fetch_1) !=(p_.local_size_0*p_.local_size_1))
+ return TEMPLATE_LOCAL_FETCH_PRODUCT_MUST_MATCH_LOCAL_SIZE_PRODUCT;
+ }
+
+ if (p_.A_fetching_policy==FETCH_FROM_LOCAL)
+ {
+ unsigned int bound1 = (A_trans_=='N')?p_.kL:p_.mL;
+ unsigned int bound0 = (A_trans_=='N')?p_.mL:p_.kL;
+
+ if (p_.local_fetch_1>0 && (bound1 % p_.local_fetch_1)> 0)
+ return A_trans_=='N'?TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE:TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE;
+
+ if (p_.local_fetch_0>0 && (bound0 % (p_.local_fetch_0*p_.simd_width)) > 0)
+ return A_trans_=='N'?TEMPLATE_LOCAL_FETCH_0_MUST_BE_NL_MULTIPLE:TEMPLATE_LOCAL_FETCH_0_MUST_BE_KL_MULTIPLE;
+
+ }
+ if (p_.B_fetching_policy==FETCH_FROM_LOCAL)
+ {
+ unsigned int bound1 = (B_trans_=='T')?p_.kL:p_.nL;
+ unsigned int bound0 = (B_trans_=='T')?p_.nL:p_.kL;
+
+ if (p_.local_fetch_1>0 && (bound1 % p_.local_fetch_1)> 0)
+ return B_trans_=='T'?TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE:TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE;
+
+ if (p_.local_fetch_0>0 && (bound0 % (p_.local_fetch_0*p_.simd_width)) > 0)
+        return B_trans_=='T'?TEMPLATE_LOCAL_FETCH_0_MUST_BE_NL_MULTIPLE:TEMPLATE_LOCAL_FETCH_0_MUST_BE_KL_MULTIPLE;
+
+ }
+
+ return TEMPLATE_VALID;
+ }
+
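+  /** @brief Extracts the node indices, leaf types and transposition flags of C, alpha, A, B and beta from a statement of the form C = alpha*prod(op(A), op(B)) + beta*C */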
+ static void parse(scheduler::statement const & s,
+ vcl_size_t & C_idx, leaf_t & C_leaf, vcl_size_t & alpha_idx, leaf_t & alpha_leaf,
+ vcl_size_t & A_idx, leaf_t & A_leaf, bool& A_trans, vcl_size_t & B_idx, leaf_t & B_leaf, bool& B_trans,
+ vcl_size_t & beta_idx, leaf_t & beta_leaf)
+ {
+ using namespace tree_parsing;
+ using namespace scheduler;
+
+ scheduler::statement::container_type const & array = s.array();
+ vcl_size_t root_idx = s.root();
+
+ C_idx = root_idx;
+ C_leaf = LHS_NODE_TYPE;
+
+ vcl_size_t node_add_idx = array[root_idx].rhs.node_index;
+
+ vcl_size_t node_1_idx = array[node_add_idx].lhs.node_index;
+ alpha_idx = node_1_idx;
+ alpha_leaf = RHS_NODE_TYPE;
+
+ vcl_size_t mat_prod_idx = array[node_1_idx].lhs.node_index;
+ if (array[mat_prod_idx].lhs.type_family==MATRIX_TYPE_FAMILY)
+ {
+ A_trans = false;
+ A_idx = mat_prod_idx;
+ }
+ else
+ {
+ A_trans = true;
+ A_idx = array[mat_prod_idx].lhs.node_index;
+ }
+ A_leaf = LHS_NODE_TYPE;
+
+ if (array[mat_prod_idx].rhs.type_family==MATRIX_TYPE_FAMILY)
+ {
+ B_trans = false;
+ B_idx = mat_prod_idx;
+ B_leaf = RHS_NODE_TYPE;
+ }
+ else
+ {
+ B_trans = true;
+ B_idx = array[mat_prod_idx].rhs.node_index;
+ B_leaf = LHS_NODE_TYPE;
+ }
+
+ vcl_size_t node_2_idx = array[node_add_idx].rhs.node_index;
+ beta_idx = node_2_idx;
+ beta_leaf = RHS_NODE_TYPE;
+ }
+
+ void VIENNACL_HANDLE_BOUNDS(bool fallback, utils::kernel_generation_stream & stream, std::string const & inbounds, std::string const & do_if, std::string do_else) const
+ {
+ if (fallback)
+ {
+ stream << "if (" << inbounds << ")" << std::endl;
+ stream.inc_tab();
+ stream << do_if << ";" << std::endl;
+ stream.dec_tab();
+ stream << "else" << std::endl;
+ stream.inc_tab();
+ stream << do_else << ";" << std::endl;
+ stream.dec_tab();
+ }
+ else
+ stream << do_if << ";" << std::endl;
+ }
+
+
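+  /** @brief Generates the OpenCL source of one GEMM kernel; if fallback is true, the SIMD width (and kS) are forced to 1 and bounds checks are emitted so that arbitrary matrix sizes can be handled */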
+ std::string generate_impl(const std::string &kernel_prefix, const statements_container &statements, const std::vector<mapping_type> &mappings, bool fallback) const
+ {
+ using std::string;
+ using tools::to_string;
+
+ parameters_type pfallback(1, p_.local_size_0, p_.kL, p_.local_size_1, p_.mS, 1, p_.nS, p_.A_fetching_policy, p_.B_fetching_policy, p_.local_fetch_0, p_.local_fetch_1);
+ parameters_type const & p = fallback?pfallback:p_;
+
+#define VIENNACL_MUL_STRIDE1 string(fallback?"*#stride1":"")
+#define VIENNACL_HANDLE_BOUNDS(in_bounds, to_load) (!fallback?string(to_load):string( string(in_bounds) + "?" + string(to_load) + ":0"))
+#define VIENNACL_VSTORE(value, offset, ptr) vstore(p.simd_width, value, offset, ptr)
+
+ string widthstr = tools::to_string(p.simd_width);
+
+    //////////////////
+    /// INIT
+    //////////////////
+ utils::kernel_generation_stream stream;
+ scheduler::statement const & st = statements.data().front();
+ mapping_type const & mapping = mappings.front();
+
+ bool A_trans = false, B_trans = false;
+ vcl_size_t C_idx=0, alpha_idx=0, A_idx=0, B_idx=0, beta_idx=0;
+ leaf_t C_leaf=LHS_NODE_TYPE, alpha_leaf=LHS_NODE_TYPE, A_leaf=LHS_NODE_TYPE, B_leaf=LHS_NODE_TYPE, beta_leaf=LHS_NODE_TYPE;
+ parse(st, C_idx, C_leaf, alpha_idx, alpha_leaf, A_idx, A_leaf, A_trans, B_idx, B_leaf, B_trans, beta_idx, beta_leaf);
+
+ mapped_matrix * C = (mapped_matrix* )at(mapping, mapping_key( C_idx, C_leaf)).get();
+ mapped_host_scalar * alpha = (mapped_host_scalar*)at(mapping, mapping_key(alpha_idx, alpha_leaf)).get();
+ mapped_matrix * A = (mapped_matrix* )at(mapping, mapping_key( A_idx, A_leaf)).get();
+ mapped_matrix * B = (mapped_matrix* )at(mapping, mapping_key( B_idx, B_leaf)).get();
+ mapped_host_scalar * beta = (mapped_host_scalar*)at(mapping, mapping_key( beta_idx, beta_leaf)).get();
+
+    //////////////////
+    /// DECLARATIONS
+    //////////////////
+
+ stream << " __attribute__((reqd_work_group_size(" << p.local_size_0 << "," << p.local_size_1 << ",1)))" << std::endl;
+ std::map<std::string, unsigned int> widths;
+ widths[A->name()] = p.simd_width;
+ widths[B->name()] = p.simd_width;
+ generate_prototype(stream, kernel_prefix, "unsigned int M, unsigned int N, unsigned int K, ", mappings, statements, widths);
+ stream << "{" << std::endl;
+ stream.inc_tab();
+ if(!fallback)
+ {
+ stream << A->process("#start1 /= " + to_string(p.simd_width) + ";") << std::endl;
+ stream << A->process("#ld /= " + to_string(p.simd_width) + ";") << std::endl;
+ stream << B->process("#start1/= " + to_string(p.simd_width) + ";") << std::endl;
+ stream << B->process("#ld /= " + to_string(p.simd_width) + ";") << std::endl;
+ }
+ tree_parsing::process(stream, PARENT_NODE_TYPE, "matrix", "#pointer += $OFFSET{#start1, #start2};", statements, mappings);
+ tree_parsing::process(stream, PARENT_NODE_TYPE, "matrix", "#ld *= #nldstride;", statements, mappings);
+
+ ///Result Values
+ stream << C->process("#scalartype rC[" + to_string(p.mS) + "][" + to_string(p.nS) + "] = {{(#scalartype)0}};") << std::endl;
+ if (p.A_fetching_policy==FETCH_FROM_LOCAL)
+ stream << A->process("#scalartype rA[" + to_string(p.kS) + "][" + to_string(p.mS) + "];") << std::endl;
+ else
+ stream << A->process(utils::append_width("#scalartype",p.simd_width) + " rA[" + to_string(p.kS) + "][" + to_string(p.mS/p.simd_width) + "];") << std::endl;
+ if (p.B_fetching_policy==FETCH_FROM_LOCAL)
+ stream << B->process("#scalartype rB[" + to_string(p.kS) + "][" + to_string(p.nS) + "];");
+ else
+ stream << B->process(utils::append_width("#scalartype",p.simd_width) + " rB[" + to_string(p.kS) + "][" + to_string(p.nS/p.simd_width) + "];") << std::endl;
+
+
+ if (p.A_fetching_policy==FETCH_FROM_LOCAL)
+ stream << A->process("__local #scalartype lA[" + to_string(p.kL*(p.mL+1)) + "];");
+ if (p.B_fetching_policy==FETCH_FROM_LOCAL)
+ stream << B->process("__local #scalartype lB[" + to_string(p.kL*(p.nL+1)) + "];");
+ stream << std::endl;
+
+ stream << "size_t gidx = get_group_id(0);" << std::endl;
+ stream << "size_t gidy = get_group_id(1);" << std::endl;
+ stream << "size_t idx = get_local_id(0);" << std::endl;
+ stream << "size_t idy = get_local_id(1);" << std::endl;
+
+ if (p.A_fetching_policy==FETCH_FROM_LOCAL || p.B_fetching_policy==FETCH_FROM_LOCAL)
+ {
+ stream << std::endl;
+ stream << "size_t idt = " << p.local_size_0 << "*idy + idx;" << std::endl;
+ stream << "size_t idxT = idt % " << p.local_fetch_0 << ";" << std::endl;
+ stream << "size_t idyT = idt / " << p.local_fetch_0 << ";" << std::endl;
+ }
+ stream << std::endl;
+
+ if (fallback)
+ {
+ //Bounds checking for M (in A, C)
+ stream << "bool in_bounds_m[" << p.mS << "];" << std::endl;
+ stream << "for(size_t m = 0; m < " << p.mS << "; m++)" << std::endl;
+ stream.inc_tab();
+ switch (p.A_fetching_policy)
+ {
+ case FETCH_FROM_GLOBAL_CONTIGUOUS:
+ stream << "in_bounds_m[m] = gidx*" << p.mL << " + idx*" << p.mS << " + m < M;" << std::endl;
+ break;
+ default:
+ stream << "in_bounds_m[m] = gidx*" << p.mL << " + idx + m*" << p.local_size_0 << " < M;" << std::endl;
+ break;
+ }
+ stream.dec_tab();
+
+ //Bounds checking for A if Local
+ if (p.A_fetching_policy==FETCH_FROM_LOCAL)
+ {
+ unsigned int fetch_size = (A_trans_=='N'?p.local_fetch_0*p.simd_width:p.local_fetch_1);
+ stream << "bool in_bounds_m_local[" << p.mL/fetch_size << "];" << std::endl;
+ stream << "for(size_t m = 0; m < " << p.mL/fetch_size << "; m++)" << std::endl;
+ stream.inc_tab();
+ stream << "in_bounds_m_local[m] = gidx*" << p.mL << " + " << (A_trans_=='N'?"idxT":"idyT") << " + m*" << fetch_size << " < M;" << std::endl;
+ stream.dec_tab();
+ }
+
+ //Bounds checking for N (in B, C)
+ stream << "bool in_bounds_n[" << p.nS << "];" << std::endl;
+ stream << "for(size_t n = 0; n < " << p.nS << "; n++)" << std::endl;
+ stream.inc_tab();
+ switch (p.B_fetching_policy)
+ {
+ case FETCH_FROM_GLOBAL_CONTIGUOUS:
+ stream << "in_bounds_n[n] = gidy*" << p.nL << " + idy*" << p.nS << " + n < N;" << std::endl;
+ break;
+ default:
+ stream << "in_bounds_n[n] = gidy*" << p.nL << " + idy + n*" << p.local_size_1 << " < N;" << std::endl;
+ break;
+ }
+ stream.dec_tab();
+
+ //Bounds checking for B if Local
+ if (p.B_fetching_policy==FETCH_FROM_LOCAL)
+ {
+ unsigned int fetch_size = (B_trans_=='T'?p.local_fetch_0*p.simd_width:p.local_fetch_1);
+ stream << "bool in_bounds_n_local[" << p.nL/fetch_size << "];" << std::endl;
+ stream << "for(size_t n = 0; n < " << p.nL/fetch_size << "; n++)" << std::endl;
+ stream.inc_tab();
+ stream << "in_bounds_n_local[n] = gidy*" << p.nL << " + " << (B_trans_=='T'?"idxT":"idyT") << " + n*" << fetch_size << " < N;" << std::endl;
+ stream.dec_tab();
+ }
+ }
+
+ switch (p.A_fetching_policy)
+ {
+ case FETCH_FROM_LOCAL:
+ if (A_trans_=='N')
+ stream << A->process("#pointer += (gidx*" + to_string(p.mL/p.simd_width) + " + idxT)" + VIENNACL_MUL_STRIDE1 + " + idyT*#ld;") << std::endl;
+ else
+ stream << A->process("#pointer += idxT" + VIENNACL_MUL_STRIDE1 + " + gidx*" + to_string(p.mL/p.simd_width) + "*#ld + idyT*#ld;") << std::endl;
+ break;
+
+ case FETCH_FROM_GLOBAL_CONTIGUOUS:
+ if (A_trans_=='N')
+ stream << A->process("#pointer += (gidx*" + to_string(p.mL/p.simd_width) + "+ idx*" + to_string(p.mS/p.simd_width) + ")" + VIENNACL_MUL_STRIDE1 + ";") << std::endl;
+ else
+ stream << A->process("#pointer += (gidx*" + to_string(p.mL/p.simd_width) + "+ idx*" + to_string(p.mS/p.simd_width) + ")*#ld;") << std::endl;
+ break;
+
+ case FETCH_FROM_GLOBAL_STRIDED:
+ if (A_trans_=='N')
+ stream << A->process("#pointer += (gidx*" + to_string(p.mL/p.simd_width) + "+ idx" + ")" + VIENNACL_MUL_STRIDE1 + ";") << std::endl;
+ else
+ stream << A->process("#pointer += (gidx*" + to_string(p.mL/p.simd_width) + "+ idx)*#ld;") << std::endl;
+ break;
+
+ //default: break;
+ }
+
+ switch (p.B_fetching_policy)
+ {
+ case FETCH_FROM_LOCAL:
+ if (B_trans_=='T')
+ stream << B->process("#pointer += (gidy*" + to_string(p.nL/p.simd_width) + " + idxT" + ")" + VIENNACL_MUL_STRIDE1 + " + idyT*#ld;") << std::endl;
+ else
+ stream << B->process("#pointer += idxT" + VIENNACL_MUL_STRIDE1 + " + gidy*" + to_string(p.nL/p.simd_width) + "*#ld + idyT*#ld;") << std::endl;
+ break;
+
+ case FETCH_FROM_GLOBAL_CONTIGUOUS:
+ if (B_trans_=='T')
+ stream << B->process("#pointer += (gidy*" + to_string(p.nL/p.simd_width) + "+ idy*" + to_string(p.nS/p.simd_width) + ")" + VIENNACL_MUL_STRIDE1 + ";") << std::endl;
+ else
+ stream << B->process("#pointer += (gidy*" + to_string(p.nL/p.simd_width) + "+ idy*" + to_string(p.nS/p.simd_width) + ")*#ld;") << std::endl;
+ break;
+
+ case FETCH_FROM_GLOBAL_STRIDED:
+ if (B_trans_=='T')
+ stream << B->process("#pointer += (gidy*" + to_string(p.nL/p.simd_width) + "+ idy" + ")" + VIENNACL_MUL_STRIDE1 + ";") << std::endl;
+ else
+ stream << B->process("#pointer += (gidy*" + to_string(p.nL/p.simd_width) + "+ idy)*#ld;") << std::endl;
+ break;
+
+ //default: break;
+ }
+
+ stream << std::endl;
+ stream << "size_t K_size_t = K;" << std::endl;
+ stream << "for(size_t block_k=0; block_k < K_size_t; block_k+=" << p.kL << "){" << std::endl;
+ stream.inc_tab();
+
+ if (p.A_fetching_policy==FETCH_FROM_LOCAL)
+ {
+ if (A_trans_=='N')
+ stream << A->process("__local #scalartype* plA = lA + idyT*" + to_string(p.mL + 1) + " + " + to_string(p.simd_width) + "*idxT;") << std::endl;
+ else
+ stream << A->process("__local #scalartype* plA = lA + idxT*" + to_string(p.mL + 1) + " + idyT;") << std::endl;
+ }
+
+
+ if (p.B_fetching_policy==FETCH_FROM_LOCAL)
+ {
+ if (B_trans_=='T')
+ stream << B->process("__local #scalartype* plB = lB + idyT*" + to_string(p.nL+1) + " + " + to_string(p.simd_width) + "*idxT;") << std::endl;
+ else
+ stream << B->process("__local #scalartype* plB = lB + idxT*" + to_string(p.nL+1) + "+ idyT;") <<std::endl;
+ }
+
+
+ if (p.A_fetching_policy==FETCH_FROM_LOCAL || p.B_fetching_policy==FETCH_FROM_LOCAL)
+ stream << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
+
+ ///Fetch LHS to Local Memory
+ if (p.A_fetching_policy==FETCH_FROM_LOCAL && A_trans_=='N')
+ for (unsigned int k = 0; k < p.kL; k += p.local_fetch_1)
+ for (unsigned int m = 0; m < p.mL; m += p.local_fetch_0*p.simd_width)
+ {
+ string in_bounds = "in_bounds_m_local[" + to_string(m/(p.local_fetch_0*p.simd_width)) + "]";
+ string to_load = "#pointer[" + to_string(k) + "*#ld + " + to_string(m/p.simd_width) + VIENNACL_MUL_STRIDE1 + "]";
+ stream << A->process(VIENNACL_VSTORE(VIENNACL_HANDLE_BOUNDS(in_bounds, to_load), "0", "plA + " + to_string(k*(p.mL+1)+m))) << ";" << std::endl;
+ }
+ else if (p.A_fetching_policy==FETCH_FROM_LOCAL && A_trans_=='T')
+ for (unsigned int k = 0; k < p.mL; k += p.local_fetch_1)
+ for (unsigned int m = 0; m < p.kL; m += p.local_fetch_0*p.simd_width)
+ {
+ string in_bounds = "in_bounds_m_local[" + to_string(k/p.local_fetch_1) + "]";
+ string to_load = "#pointer[" + to_string(k) + "*#ld + " + to_string(m/p.simd_width) + VIENNACL_MUL_STRIDE1 + "]";
+ stream << A->process(VIENNACL_VSTORE(VIENNACL_HANDLE_BOUNDS(in_bounds, to_load), "0", "plA + " + to_string(m*(p.mL+1)+k))) << ";" << std::endl;
+ }
+
+ if (p.B_fetching_policy==FETCH_FROM_LOCAL && B_trans_=='T')
+ for (unsigned int k = 0; k < p.kL; k += p.local_fetch_1)
+ for (unsigned int n = 0; n < p.nL; n += p.local_fetch_0*p.simd_width)
+ {
+ string in_bounds = "in_bounds_n_local[" + to_string(n/(p.local_fetch_0*p.simd_width)) + "]";
+ string to_load = "#pointer[" + to_string(k) + "*#ld + " + to_string(n/p.simd_width) + VIENNACL_MUL_STRIDE1 + "]";
+ stream << B->process(VIENNACL_VSTORE(VIENNACL_HANDLE_BOUNDS(in_bounds, to_load), "0", "plB + " + to_string(k*(p.nL+1)+n))) << ";" << std::endl;
+ }
+ else if (p.B_fetching_policy==FETCH_FROM_LOCAL && B_trans_=='N')
+ for (unsigned int k = 0; k < p.nL; k += p.local_fetch_1)
+ for (unsigned int n = 0; n < p.kL; n += p.local_fetch_0*p.simd_width)
+ {
+ string in_bounds = "in_bounds_n_local[" + to_string(k/p.local_fetch_1) + "]";
+ string to_load = "#pointer[" + to_string(k) + "*#ld + " + to_string(n/p.simd_width) + VIENNACL_MUL_STRIDE1 + "]";
+ stream << B->process(VIENNACL_VSTORE(VIENNACL_HANDLE_BOUNDS(in_bounds, to_load), "0", "plB + " + to_string(n*(p.nL+1)+k))) << ";" << std::endl;
+ }
+
+ if (p.A_fetching_policy==FETCH_FROM_LOCAL || p.B_fetching_policy == FETCH_FROM_LOCAL)
+ {
+ stream << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
+ stream << "size_t offA = " << p.simd_width << "*idx;" << std::endl;
+ stream << "size_t offB = " << p.simd_width << "*idy;" << std::endl;
+ }
+
+ if (fallback)
+ stream << "for(size_t k = 0; k < " << p.kL << " && (block_k + k < K_size_t); k+=" << p.kS << "){" << std::endl;
+ else
+ stream << "for(size_t k = 0; k < " << p.kL << "; k+=" << p.kS << "){" << std::endl;
+ stream.inc_tab();
+
+ ///Fetch LHS to registers
+ stream << "#pragma unroll " << p.kS << std::endl;
+ stream << "for(size_t kk = 0; kk < " << p.kS << "; kk++)" << std::endl;
+ stream << "#pragma unroll " << p.mS/p.simd_width << std::endl;
+ stream << "for(size_t mm = 0; mm < " << p.mS/p.simd_width << "; mm++)" << std::endl;
+ stream << "{" << std::endl;
+ stream.inc_tab();
+ switch (p.A_fetching_policy)
+ {
+ case FETCH_FROM_LOCAL:
+ for (unsigned int ss = 0; ss < p.simd_width; ++ss)
+ stream << "rA[kk][mm*" << p.simd_width << "+" << ss << "] = lA[offA + mm*" << p.local_size_0*p.simd_width << "+" << ss << "+ kk*" << (p.mL+1) << "];" << std::endl;
+ break;
+
+ case FETCH_FROM_GLOBAL_CONTIGUOUS:
+ {
+ if (A_trans_=='N')
+ stream << "rA[kk][mm] = " << A->process(VIENNACL_HANDLE_BOUNDS("in_bounds_m[mm]", "#pointer[kk*#ld + mm" + VIENNACL_MUL_STRIDE1 + "]")) << ";" << std::endl;
+ else
+ stream << "rA[kk][mm] = " << A->process(VIENNACL_HANDLE_BOUNDS("in_bounds_m[mm]", "#pointer[mm*#ld + kk" + VIENNACL_MUL_STRIDE1 + "]")) << ";" << std::endl;
+ break;
+ }
+
+ case FETCH_FROM_GLOBAL_STRIDED:
+ {
+ if (A_trans_=='N')
+ stream << "rA[kk][mm] = " << A->process(VIENNACL_HANDLE_BOUNDS("in_bounds_m[mm]", "#pointer[kk*#ld + mm*" + to_string(p.local_size_0) + VIENNACL_MUL_STRIDE1 + "]")) << ";" << std::endl;
+ else
+ stream << "rA[kk][mm] = " << A->process(VIENNACL_HANDLE_BOUNDS("in_bounds_m[mm]", "#pointer[mm*#ld*" + to_string(p.local_size_0) + " + kk" + VIENNACL_MUL_STRIDE1 + "]")) << ";" << std::endl;
+ break;
+ }
+
+ //default: break;
+ }
+ stream.dec_tab();
+ stream << "}" << std::endl;
+
+ stream << "#pragma unroll " << p.kS << std::endl;
+ stream << "for(size_t kk = 0; kk < " << p.kS << "; kk++)" << std::endl;
+ stream << "#pragma unroll " << p.nS/p.simd_width << std::endl;
+ stream << "for(size_t nn = 0; nn < " << p.nS/p.simd_width << "; nn++)" << std::endl;
+ stream << "{" << std::endl;
+ stream.inc_tab();
+ switch (p.B_fetching_policy)
+ {
+ case FETCH_FROM_LOCAL:
+ for (unsigned int ss = 0; ss < p.simd_width; ++ss)
+ stream << "rB[kk][nn*" << p.simd_width << "+" << ss << "] = lB[offB + nn*" << p.local_size_1*p.simd_width << "+" << ss << "+ kk*" << (p.nL+1) << "];" << std::endl;
+ break;
+
+ case FETCH_FROM_GLOBAL_CONTIGUOUS:
+ {
+ if (B_trans_=='T')
+ stream << "rB[kk][nn] = " << B->process(VIENNACL_HANDLE_BOUNDS("in_bounds_n[nn]", "#pointer[kk*#ld + nn" + VIENNACL_MUL_STRIDE1 + "]")) << ";" << std::endl;
+ else
+ stream << "rB[kk][nn] = " << B->process(VIENNACL_HANDLE_BOUNDS("in_bounds_n[nn]", "#pointer[nn*#ld + kk" + VIENNACL_MUL_STRIDE1 + "]")) << ";" << std::endl;
+ break;
+ }
+
+ case FETCH_FROM_GLOBAL_STRIDED:
+ {
+ if (B_trans_=='T')
+ stream << "rB[kk][nn] = " << B->process(VIENNACL_HANDLE_BOUNDS("in_bounds_n[nn]", "#pointer[kk*#ld + nn*" + to_string(p.local_size_1) + VIENNACL_MUL_STRIDE1 + "]")) << ";" << std::endl;
+ else
+ stream << "rB[kk][nn] = " << B->process(VIENNACL_HANDLE_BOUNDS("in_bounds_n[nn]", "#pointer[nn*#ld*" + to_string(p.local_size_1) + " + kk" + VIENNACL_MUL_STRIDE1 + "]")) << ";" << std::endl;
+ break;
+ }
+
+ //default: break;
+ }
+ stream.dec_tab();
+ stream << "}" << std::endl;
+
+
+ ///Increment pointers
+ switch (p.A_fetching_policy)
+ {
+ case FETCH_FROM_LOCAL:
+ stream << "offA += " << p.kS*(p.mL+1) << ";" << std::endl;
+ break;
+
+ default:
+ if (A_trans_=='N')
+ stream << A->process("#pointer += " + to_string(p.kS) + "*#ld;") << std::endl;
+ else
+ stream << A->process("#pointer += " + to_string(p.kS) + "" + VIENNACL_MUL_STRIDE1 + ";") << std::endl;
+ break;
+ }
+
+
+ switch (p.B_fetching_policy)
+ {
+ case FETCH_FROM_LOCAL:
+ stream << "offB += " << p.kS*(p.nL+1) << ";" << std::endl;
+ break;
+
+ default:
+ if (B_trans_=='T')
+ stream << B->process("#pointer += " + to_string(p.kS) + "*#ld;") << std::endl;
+ else
+ stream << B->process("#pointer += " + to_string(p.kS) + "" + VIENNACL_MUL_STRIDE1 + ";") << std::endl;
+ break;
+ }
+
+
+ stream << "#pragma unroll " << p.kS << std::endl;
+ stream << "for(size_t kk = 0; kk <" << p.kS << "; ++kk)" << std::endl;
+ stream << "{" << std::endl;
+ stream.inc_tab();
+ for (unsigned int nn=0; nn < p.nS; ++nn)
+ for (unsigned int mm=0; mm < p.mS; ++mm)
+ {
+ string res_str, lhs_str, rhs_str;
+ res_str = "rC[" + tools::to_string(mm) + "][" + tools::to_string(nn) + "]";
+ if (p.A_fetching_policy==FETCH_FROM_LOCAL || p.simd_width==1)
+ lhs_str = "rA[kk][" + tools::to_string(mm) + "]";
+ else
+ lhs_str = "rA[kk][" + tools::to_string(mm/p.simd_width) + "].s" + tools::to_string(mm%p.simd_width);
+ if (p.B_fetching_policy==FETCH_FROM_LOCAL || p.simd_width==1)
+ rhs_str = "rB[kk]["+tools::to_string(nn)+"]";
+ else
+ rhs_str = "rB[kk]["+tools::to_string(nn/p.simd_width)+"].s"+tools::to_string(nn%p.simd_width);
+ stream << res_str << "=" << "fma(" << lhs_str << "," << rhs_str << "," << res_str << ");" << std::endl;
+ }
+ stream.dec_tab();
+ stream << "}" << std::endl;
+
+
+
+
+ stream.dec_tab();
+ stream << "}" << std::endl;
+
+ //Increment global pointer if local memory is used
+ //Else, it's incremented directly when fetching
+ if (p.A_fetching_policy==FETCH_FROM_LOCAL)
+ {
+ if (A_trans_=='N')
+ stream << A->process("#pointer += " + to_string(p.kL) + "*#ld;") << std::endl;
+ else
+ stream << A->process("#pointer += " + to_string(p.kL) + "" + VIENNACL_MUL_STRIDE1 + ";") << std::endl;
+ }
+
+ if (p.B_fetching_policy==FETCH_FROM_LOCAL)
+ {
+ if (B_trans_=='T')
+ stream << B->process("#pointer += " + to_string(p.kL) + "*#ld;") << std::endl;
+ else
+ stream << B->process("#pointer += " + to_string(p.kL) + "" + VIENNACL_MUL_STRIDE1 + ";") << std::endl;
+ }
+
+ stream.dec_tab();
+ stream << "}" << std::endl;
+
+
+ if (C->row_major())
+ {
+ unsigned int ministartstride0 = p.A_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?p.mS:p.simd_width;
+ unsigned int ministartstride1 = p.B_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?p.nS:p.simd_width;
+
+ stream << C->process("#pointer += gidx*" + to_string(p.mL) + "*#ld;") << std::endl;
+ stream << C->process("#pointer += idx*" + to_string(ministartstride0) + "*#ld;") << std::endl;
+ stream << C->process("#pointer += gidy*" + to_string(p.nL) + "*#stride2;") << std::endl;
+ stream << C->process("#pointer += idy*" + to_string(ministartstride1) + "*#stride2;") << std::endl;
+
+ for (unsigned int n=0; n < p.nS; ++n)
+ {
+ for (unsigned int m=0; m < p.mS; ++m)
+ {
+ unsigned int ministride1 = p.A_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?1:p.local_size_0;
+ string Cj = to_string((m/p.simd_width)*(ministride1*p.simd_width) + m%p.simd_width);
+ if (fallback)
+ {
+ stream << "if (in_bounds_m[" + to_string(m) + "] && in_bounds_n[" + to_string(n) + "])" << std::endl;
+ stream.inc_tab();
+ }
+ stream << C->process("#pointer[" + Cj + "*#ld] = rC[" + to_string(m) + "][" + to_string(n) + "]*" + alpha->name() + "+ #pointer[" + Cj + "*#ld]*" + beta->name() + ";") << std::endl;
+ if (fallback)
+ stream.dec_tab();
+ }
+ if ((n+1)%p.simd_width>0 || p.B_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS)
+ stream << C->process("#pointer += #stride2;") << std::endl;
+ else
+ stream << C->process("#pointer += " + to_string((p.local_size_1*p.simd_width) - (p.simd_width-1)) + "*#stride2;") << std::endl;
+ }
+
+ }
+ else
+ {
+ unsigned int ministartstride0 = p.A_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?p.mS:p.simd_width;
+ unsigned int ministartstride1 = p.B_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?p.nS:p.simd_width;
+
+ stream << C->process("#pointer += gidx*" + to_string(p.mL) + "*#stride1;") << std::endl;
+ stream << C->process("#pointer += idx*" + to_string(ministartstride0) + "*#stride1;") << std::endl;
+ stream << C->process("#pointer += gidy*" + to_string(p.nL) + "*#ld;") << std::endl;
+ stream << C->process("#pointer += idy*" + to_string(ministartstride1) + "*#ld;") << std::endl;
+
+ for (unsigned int m=0; m < p.mS; ++m)
+ {
+ for (unsigned int n=0; n < p.nS; ++n)
+ {
+ unsigned int ministride1 = p.B_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?1:p.local_size_1;
+ string Cj = to_string((n/p.simd_width)*(ministride1*p.simd_width) + n%p.simd_width);
+ if (fallback)
+ {
+ stream << "if (in_bounds_m[" + to_string(m) + "] && in_bounds_n[" + to_string(n) + "])" << std::endl;
+ stream.inc_tab();
+ }
+ stream << C->process("#pointer[" + Cj + "*#ld] = rC[" + to_string(m) + "][" + to_string(n) + "]*" + alpha->name() + " + #pointer[" + Cj + "*#ld]*" + beta->name() + ";") << std::endl;
+ if (fallback)
+ stream.dec_tab();
+ }
+
+ if ((m+1)%p.simd_width>0 || p.A_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS)
+ stream << C->process("#pointer += #stride1;") << std::endl;
+ else
+ stream << C->process("#pointer += " + to_string((p.local_size_0*p.simd_width) - (p.simd_width-1)) + "*#stride1;") << std::endl;
+ }
+ }
+
+ stream.dec_tab();
+ stream << "}" << std::endl;
+
+ return stream.str();
+
+#undef VIENNACL_MUL_STRIDE1
+#undef VIENNACL_HANDLE_BOUNDS
+#undef VIENNACL_VSTORE
+ }
+
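+  /** @brief Returns the two kernel sources: index 0 holds the optimized kernel, index 1 the bounds-checked fallback */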
+ std::vector<std::string> generate_impl(std::string const & kernel_prefix, statements_container const & statements, std::vector<mapping_type> const & mappings) const
+ {
+ std::vector<std::string> res;
+ res.push_back(generate_impl(kernel_prefix, statements, mappings, false));
+ res.push_back(generate_impl(kernel_prefix, statements, mappings, true));
+ return res;
+ }
+
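+  /** @brief Binds A, B, C and beta into the statement, sets the work sizes (rounded up for the fallback kernel, id==1) and enqueues one GEMM kernel on the given sub-matrices */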
+ template<class NumericT>
+ void enqueue_block(scheduler::statement & statement,
+ scheduler::lhs_rhs_element& eA, scheduler::lhs_rhs_element& eB, scheduler::lhs_rhs_element& eC, scheduler::lhs_rhs_element& ebeta,
+ matrix_base<NumericT> const & A, matrix_base<NumericT> const & B, matrix_base<NumericT> const & C, NumericT beta,
+ std::vector<lazy_program_compiler> & programs, std::string const & kernel_prefix, vcl_size_t id)
+ {
+ if (A.size1()==0 || A.size2()==0 || B.size1()==0 || B.size2()==0 || C.size1()==0 || C.size2()==0)
+ return;
+
+ viennacl::ocl::kernel& kernel = programs[id].program().get_kernel(kernel_prefix);
+
+ kernel.local_work_size(0, p_.local_size_0);
+ kernel.local_work_size(1, p_.local_size_1);
+
+ scheduler::statement::assign_element(eA, A);
+ scheduler::statement::assign_element(eB, B);
+ scheduler::statement::assign_element(eC, C);
+ scheduler::statement::assign_element(ebeta, beta);
+
+ if (id==1)
+ {
+ kernel.global_work_size(0, tools::align_to_multiple(tools::align_to_multiple((unsigned int)C.size1(),p_.mS)/p_.mS, p_.local_size_0));
+ kernel.global_work_size(1, tools::align_to_multiple(tools::align_to_multiple((unsigned int)C.size2(),p_.nS)/p_.nS, p_.local_size_1));
+ }
+ else
+ {
+ kernel.global_work_size(0, C.size1()/p_.mS);
+ kernel.global_work_size(1, C.size2()/p_.nS);
+ }
+ unsigned int current_arg = 0;
+ kernel.arg(current_arg++, cl_uint(C.size1()));
+ kernel.arg(current_arg++, cl_uint(C.size2()));
+ if (A.row_major())
+ kernel.arg(current_arg++, cl_uint(A_trans_=='T'?A.size2():A.size1()));
+ else
+ kernel.arg(current_arg++, cl_uint(A_trans_=='N'?A.size2():A.size1()));
+ set_arguments(statement, kernel, current_arg);
+ viennacl::ocl::enqueue(kernel);
+
+ }
+
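+  /** @brief Builds a matrix_slice covering rows [s0_0, s0_1) and columns [s1_0, s1_1) of the given element, swapping the two ranges if requested */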
+ template<class NumericT>
+ matrix_slice< viennacl::matrix_base<NumericT> > create_slice(viennacl::matrix_base<NumericT>* scheduler::lhs_rhs_element::*ptr, scheduler::lhs_rhs_element const & element,
+ vcl_size_t s0_0, vcl_size_t s0_1, vcl_size_t s1_0, vcl_size_t s1_1, bool swap)
+ {
+ matrix_base<NumericT> & M = *(element.*ptr);
+ slice s0(s0_0, 1, s0_1 - s0_0);
+ slice s1(s1_0, 1, s1_1 - s1_0);
+ if (swap)
+ std::swap(s0, s1);
+ return matrix_slice<viennacl::matrix_base<NumericT> >(M, s0, s1);
+ }
+
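+  /** @brief Dispatches the product: small or strided problems go entirely to the fallback kernel; otherwise M, N and K are split at the largest multiples of mL, nL and kL, the optimized kernel handles the main block and fallback kernels handle the remainders */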
+ template<class NumericT>
+ void enqueue_impl(viennacl::matrix_base<NumericT>* scheduler::lhs_rhs_element::*ptr_matrix,
+ scheduler::statement & statement, scheduler::lhs_rhs_element & A, scheduler::lhs_rhs_element & B, scheduler::lhs_rhs_element & C, scheduler::lhs_rhs_element & beta,
+ NumericT beta_value, std::vector<lazy_program_compiler> & programs, std::string const & kernel_prefix)
+ {
+ using namespace device_specific::utils;
+ vcl_size_t ldstrideA = call_on_matrix(A, leading_stride());
+ vcl_size_t ldstrideB = call_on_matrix(B, leading_stride());
+ vcl_size_t ldstrideC = call_on_matrix(C, leading_stride());
+ vcl_size_t ldstartA = call_on_matrix(A, leading_start());
+ vcl_size_t ldstartB = call_on_matrix(B, leading_start());
+ bool swap_A = ((A_trans_=='T') ^ utils::call_on_matrix(A, row_major_fun()));
+ bool swap_B = ((B_trans_=='T') ^ utils::call_on_matrix(B, row_major_fun()));
+
+ vcl_size_t M = call_on_matrix(C, size1_fun());
+ vcl_size_t N = call_on_matrix(C, size2_fun());
+ vcl_size_t K;
+ if (utils::call_on_matrix(A, row_major_fun()))
+ K = A_trans_=='T'?call_on_matrix(A, size2_fun()):call_on_matrix(A, size1_fun());
+ else
+ K = A_trans_=='N'?call_on_matrix(A, size2_fun()):call_on_matrix(A, size1_fun());
+
+ if (M < p_.mL || N < p_.nL || K < p_.kL || ldstrideA> 1 || ldstrideB > 1 || ldstrideC > 1 ||
+ (p_.simd_width>1 && (ldstartA % p_.simd_width > 0 || ldstartB % p_.simd_width > 0)))
+ {
+ enqueue_block(statement, A, B, C, beta, create_slice(ptr_matrix, A, 0, M, 0, K, swap_A),
+ create_slice(ptr_matrix, B, 0, K, 0, N, swap_B),
+ create_slice(ptr_matrix, C, 0, M, 0, N, false), beta_value, programs, kernel_prefix, 1);
+ return;
+ }
+
+
+ scheduler::lhs_rhs_element Acopy = A;
+ scheduler::lhs_rhs_element Bcopy = B;
+ scheduler::lhs_rhs_element Ccopy = C;
+
+ vcl_size_t lM = M / p_.mL * p_.mL;
+ vcl_size_t lN = N / p_.nL * p_.nL;
+ vcl_size_t lK = K / p_.kL * p_.kL;
+
+
+ enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, 0, lM, 0, lK, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, 0, lK, 0, lN, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, 0, lM, 0, lN, false), beta_value, programs, kernel_prefix, 0);
+ enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, 0, lM, lK, K, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, lK, K, 0, lN, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, 0, lM, 0, lN, false), (NumericT)1, programs, kernel_prefix, 1);
+
+ enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, 0, lM, 0, lK, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, 0, lK, lN, N, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, 0, lM, lN, N, false), beta_value, programs, kernel_prefix, 1);
+ enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, 0, lM, lK, K, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, lK, K, lN, N, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, 0, lM, lN, N, false), (NumericT)1, programs, kernel_prefix, 1);
+
+ enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, lM, M, 0, lK, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, 0, lK, 0, lN, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, lM, M, 0, lN, false), beta_value, programs, kernel_prefix, 1);
+ enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, lM, M, lK, K, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, lK, K, 0, lN, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, lM, M, 0, lN, false), (NumericT)1, programs, kernel_prefix, 1);
+
+ enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, lM, M, 0, lK, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, 0, lK, lN, N, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, lM, M, lN, N, false), beta_value, programs, kernel_prefix, 1);
+ enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, lM, M, lK, K, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, lK, K, lN, N, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, lM, M, lN, N, false), (NumericT)1, programs, kernel_prefix, 1);
+ }
+
+public:
+ matrix_product_template(matrix_product_template::parameters_type const & parameters, char A_trans, char B_trans) : template_base_impl<matrix_product_template, matrix_product_parameters>(parameters, BIND_ALL_UNIQUE), A_trans_(A_trans), B_trans_(B_trans){ }
+
+ virtual void enqueue(std::string const & kernel_prefix, std::vector<lazy_program_compiler> & programs, statements_container const & statements)
+ {
+ using namespace device_specific::utils;
+ using namespace tree_parsing;
+
+ scheduler::statement const & st = statements.data().front();
+ bool A_trans, B_trans;
+ vcl_size_t C_idx=0, A_idx=0, B_idx=0, alpha_idx=0, beta_idx = 0;
+ leaf_t C_leaf=LHS_NODE_TYPE, A_leaf=LHS_NODE_TYPE, B_leaf=LHS_NODE_TYPE, alpha_leaf=LHS_NODE_TYPE, beta_leaf=LHS_NODE_TYPE;
+ parse(st, C_idx, C_leaf, alpha_idx, alpha_leaf, A_idx, A_leaf, A_trans, B_idx, B_leaf, B_trans, beta_idx, beta_leaf);
+
+ scheduler::statement stcopy = st;
+ scheduler::lhs_rhs_element& A = utils::lhs_rhs_element(stcopy, A_idx, A_leaf);
+ scheduler::lhs_rhs_element& B = utils::lhs_rhs_element(stcopy, B_idx, B_leaf);
+ scheduler::lhs_rhs_element& C = utils::lhs_rhs_element(stcopy, C_idx, C_leaf);
+ scheduler::lhs_rhs_element& beta = utils::lhs_rhs_element(stcopy, beta_idx, beta_leaf);
+
+
+
+
+
+
+ if (C.numeric_type==scheduler::FLOAT_TYPE)
+ enqueue_impl<float>(&scheduler::lhs_rhs_element::matrix_float, stcopy, A, B, C, beta, beta.host_float, programs, kernel_prefix);
+ else if (C.numeric_type==scheduler::DOUBLE_TYPE)
+ enqueue_impl<double>(&scheduler::lhs_rhs_element::matrix_double, stcopy, A, B, C, beta, beta.host_double, programs, kernel_prefix);
+ else
+ throw generator_not_supported_exception("GEMM only supported for float/double");
+
+ }
+
+private:
+ const char A_trans_;
+ const char B_trans_;
+};
+
+}
+
+}
+
+#endif
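
For reference, the blocking scheme encoded by the parameters above reduces to two derived tile sizes, mL = mS*local_size_0 and nL = nS*local_size_1, which check_invalid_impl then constrains. The following minimal sketch (plain C++, independent of ViennaCL; the struct, its field names and the numeric values are purely illustrative) recomputes those tile sizes and replays the main checks:

    // Illustrative sketch only -- not part of the ViennaCL sources above.
    #include <cstdio>

    struct gemm_params
    {
      unsigned int simd_width;
      unsigned int local_size_0, kL, local_size_1;
      unsigned int mS, kS, nS;
      unsigned int local_fetch_0, local_fetch_1;
    };

    int main()
    {
      gemm_params p = {4, 8, 16, 8, 8, 8, 8, 8, 8};   // hypothetical profile

      unsigned int mL = p.mS * p.local_size_0;   // rows of C handled per work group
      unsigned int nL = p.nS * p.local_size_1;   // columns of C handled per work group

      bool simd_ok  = (p.mS % p.simd_width == 0) && (p.nS % p.simd_width == 0);
      bool ks_ok    = (p.kS <= p.kL);
      // Only enforced when A or B is fetched through local memory.
      bool fetch_ok = (p.local_fetch_0 * p.local_fetch_1 == p.local_size_0 * p.local_size_1);

      std::printf("mL=%u nL=%u simd_ok=%d ks_ok=%d fetch_ok=%d\n",
                  mL, nL, (int)simd_ok, (int)ks_ok, (int)fetch_ok);
      return 0;
    }
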
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/template_base.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/template_base.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/template_base.hpp
new file mode 100644
index 0000000..40e3168
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/template_base.hpp
@@ -0,0 +1,596 @@
+#ifndef VIENNACL_DEVICE_SPECIFIC_TEMPLATES_TEMPLATE_BASE_
+#define VIENNACL_DEVICE_SPECIFIC_TEMPLATES_TEMPLATE_BASE_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/device_specific/templates/template_base.hpp
+ *
+ * Base classes for the profiles
+*/
+
+#include <cassert>
+#include <list>
+#include <set>
+
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/device_utils.hpp"
+
+#include "viennacl/scheduler/forwards.h"
+#include "viennacl/scheduler/io.hpp"
+
+#include "viennacl/device_specific/lazy_program_compiler.hpp"
+#include "viennacl/device_specific/mapped_objects.hpp"
+#include "viennacl/device_specific/tree_parsing.hpp"
+#include "viennacl/device_specific/utils.hpp"
+
+namespace viennacl
+{
+namespace device_specific
+{
+
+enum fetching_policy_type
+{
+ FETCH_FROM_LOCAL,
+ FETCH_FROM_GLOBAL_STRIDED,
+ FETCH_FROM_GLOBAL_CONTIGUOUS
+};
+
+class template_base
+{
+public:
+ struct parameters_type
+ {
+ parameters_type(unsigned int _simd_width, unsigned int _local_size_1, unsigned int _local_size_2, unsigned int _num_kernels) : simd_width(_simd_width), local_size_0(_local_size_1), local_size_1(_local_size_2), num_kernels(_num_kernels){ }
+
+ unsigned int simd_width;
+ unsigned int local_size_0;
+ unsigned int local_size_1;
+ unsigned int num_kernels;
+ };
+
+private:
+ /** @brief Functor to map the statements to the types defined in mapped_objects.hpp */
+ class map_functor : public tree_parsing::traversal_functor
+ {
+
+ scheduler::statement_node_numeric_type numeric_type(scheduler::statement const * statement, vcl_size_t root_idx) const
+ {
+ scheduler::statement_node const * root_node = &statement->array()[root_idx];
+ while (root_node->lhs.numeric_type==scheduler::INVALID_NUMERIC_TYPE)
+ root_node = &statement->array()[root_node->lhs.node_index];
+ return root_node->lhs.numeric_type;
+ }
+
+ public:
+ typedef tools::shared_ptr<mapped_object> result_type;
+
+ map_functor(symbolic_binder & binder, mapping_type & mapping) : binder_(binder), mapping_(mapping){ }
+
+ /** @brief Binary leaf */
+ template<class T>
+ result_type binary_leaf(scheduler::statement const * statement, vcl_size_t root_idx, mapping_type const * mapping) const
+ {
+ return result_type(new T(utils::numeric_type_to_string(numeric_type(statement,root_idx)), binder_.get(NULL), mapped_object::node_info(mapping, statement, root_idx)));
+ }
+
+ template<class NumericT>
+ result_type operator()(NumericT const & /*scalar*/) const
+ {
+ return result_type(new mapped_host_scalar(utils::type_to_string<NumericT>::value(), binder_.get(NULL)));
+ }
+
+ /** @brief Scalar mapping */
+ template<class NumericT>
+ result_type operator()(scalar<NumericT> const & scal) const
+ {
+ return result_type(new mapped_scalar(utils::type_to_string<NumericT>::value(), binder_.get(&viennacl::traits::handle(scal))));
+ }
+
+ /** @brief Vector mapping */
+ template<class NumericT>
+ result_type operator()(vector_base<NumericT> const & vec) const
+ {
+ return result_type(new mapped_vector(utils::type_to_string<NumericT>::value(), binder_.get(&viennacl::traits::handle(vec))));
+ }
+
+ /** @brief Implicit vector mapping */
+ template<class NumericT>
+ result_type operator()(implicit_vector_base<NumericT> const & /*vec*/) const
+ {
+ return result_type(new mapped_implicit_vector(utils::type_to_string<NumericT>::value(), binder_.get(NULL)));
+ }
+
+ /** @brief Matrix mapping */
+ template<class NumericT>
+ result_type operator()(matrix_base<NumericT> const & mat) const
+ {
+ return result_type(new mapped_matrix(utils::type_to_string<NumericT>::value(), binder_.get(&viennacl::traits::handle(mat)),
+ viennacl::traits::row_major(mat)));
+ }
+
+ /** @brief Implicit matrix mapping */
+ template<class NumericT>
+ result_type operator()(implicit_matrix_base<NumericT> const & /*mat*/) const
+ {
+ return result_type(new mapped_implicit_matrix(utils::type_to_string<NumericT>::value(), binder_.get(NULL)));
+ }
+
+ /** @brief Traversal functor */
+ void operator()(scheduler::statement const & statement, vcl_size_t root_idx, leaf_t leaf_t) const {
+ mapping_type::key_type key(root_idx, leaf_t);
+ scheduler::statement_node const & root_node = statement.array()[root_idx];
+
+ if (leaf_t == LHS_NODE_TYPE && root_node.lhs.type_family != scheduler::COMPOSITE_OPERATION_FAMILY)
+ mapping_.insert(mapping_type::value_type(key, utils::call_on_element(root_node.lhs, *this)));
+ else if (leaf_t == RHS_NODE_TYPE && root_node.rhs.type_family != scheduler::COMPOSITE_OPERATION_FAMILY)
+ mapping_.insert(mapping_type::value_type(key, utils::call_on_element(root_node.rhs, *this)));
+ else if ( leaf_t== PARENT_NODE_TYPE)
+ {
+ if (root_node.op.type==scheduler::OPERATION_BINARY_VECTOR_DIAG_TYPE)
+ mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_vector_diag>(&statement, root_idx, &mapping_)));
+ else if (root_node.op.type==scheduler::OPERATION_BINARY_MATRIX_DIAG_TYPE)
+ mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_matrix_diag>(&statement, root_idx, &mapping_)));
+ else if (root_node.op.type==scheduler::OPERATION_BINARY_MATRIX_ROW_TYPE)
+ mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_matrix_row>(&statement, root_idx, &mapping_)));
+ else if (root_node.op.type==scheduler::OPERATION_BINARY_MATRIX_COLUMN_TYPE)
+ mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_matrix_column>(&statement, root_idx, &mapping_)));
+ else if (is_scalar_reduction(root_node))
+ mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_scalar_reduction>(&statement, root_idx, &mapping_)));
+ else if (is_vector_reduction(root_node))
+ mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_row_wise_reduction>(&statement, root_idx, &mapping_)));
+ else if (root_node.op.type == scheduler::OPERATION_BINARY_MAT_MAT_PROD_TYPE)
+ mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_matrix_product>(&statement, root_idx, &mapping_)));
+ else if (root_node.op.type == scheduler::OPERATION_UNARY_TRANS_TYPE)
+ mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_trans>(&statement, root_idx, &mapping_)));
+ }
+ }
+
+ private:
+ symbolic_binder & binder_;
+ mapping_type & mapping_;
+ };
+
+ /** @brief functor for generating the prototype of a statement */
+ class prototype_generation_traversal : public tree_parsing::traversal_functor
+ {
+ private:
+ std::set<std::string> & already_generated_;
+ std::string & str_;
+ mapping_type const & mapping_;
+ std::map<std::string, unsigned int> const & widths_;
+ public:
+ prototype_generation_traversal(std::set<std::string> & already_generated, std::string & str, mapping_type const & mapping, std::map<std::string, unsigned int> const & widths) :
+ already_generated_(already_generated), str_(str), mapping_(mapping), widths_(widths){ }
+
+ void operator()(scheduler::statement const & statement, vcl_size_t root_idx, leaf_t leaf) const
+ {
+ scheduler::statement_node const & root_node = statement.array()[root_idx];
+ if ( (leaf==LHS_NODE_TYPE && root_node.lhs.type_family!=scheduler::COMPOSITE_OPERATION_FAMILY)
+ ||(leaf==RHS_NODE_TYPE && root_node.rhs.type_family!=scheduler::COMPOSITE_OPERATION_FAMILY) )
+ {
+ mapped_object * obj = at(mapping_, std::make_pair(root_idx,leaf)).get();
+ if(widths_.find(obj->name())!=widths_.end())
+ obj->append_kernel_arguments(already_generated_, str_, at(widths_, obj->name()));
+ else
+ obj->append_kernel_arguments(already_generated_, str_, 1);
+ }
+ }
+ };
+
+
+
+ /** @brief functor for setting the arguments of a kernel */
+ class set_arguments_functor : public tree_parsing::traversal_functor
+ {
+ public:
+ typedef void result_type;
+
+ set_arguments_functor(symbolic_binder & binder, unsigned int & current_arg, viennacl::ocl::kernel & kernel) : binder_(binder), current_arg_(current_arg), kernel_(kernel){ }
+
+ template<class NumericT>
+ result_type operator()(NumericT const & scal) const {
+ typedef typename viennacl::result_of::cl_type<NumericT>::type cl_scalartype;
+ kernel_.arg(current_arg_++, cl_scalartype(scal));
+ }
+
+ /** @brief Scalar mapping */
+ template<class NumericT>
+ result_type operator()(scalar<NumericT> const & scal) const {
+ if (binder_.bind(&viennacl::traits::handle(scal)))
+ kernel_.arg(current_arg_++, scal.handle().opencl_handle());
+ }
+
+ /** @brief Vector mapping */
+ template<class NumericT>
+ result_type operator()(vector_base<NumericT> const & vec) const {
+ if (binder_.bind(&viennacl::traits::handle(vec)))
+ {
+ kernel_.arg(current_arg_++, vec.handle().opencl_handle());
+ kernel_.arg(current_arg_++, cl_uint(viennacl::traits::start(vec)));
+ kernel_.arg(current_arg_++, cl_uint(viennacl::traits::stride(vec)));
+ }
+ }
+
+ /** @brief Implicit vector mapping */
+ template<class NumericT>
+ result_type operator()(implicit_vector_base<NumericT> const & vec) const
+ {
+ typedef typename viennacl::result_of::cl_type<NumericT>::type cl_scalartype;
+ kernel_.arg(current_arg_++, cl_scalartype(vec.value()));
+ if (vec.has_index())
+ kernel_.arg(current_arg_++, cl_uint(vec.index()));
+ }
+
+ /** @brief Matrix mapping */
+ template<class NumericT>
+ result_type operator()(matrix_base<NumericT> const & mat) const
+ {
+ if (binder_.bind(&viennacl::traits::handle(mat)))
+ {
+ kernel_.arg(current_arg_++, mat.handle().opencl_handle());
+ kernel_.arg(current_arg_++, cl_uint(viennacl::traits::ld(mat)));
+ if (mat.row_major())
+ {
+ kernel_.arg(current_arg_++, cl_uint(viennacl::traits::start2(mat)));
+ kernel_.arg(current_arg_++, cl_uint(viennacl::traits::start1(mat)));
+ kernel_.arg(current_arg_++, cl_uint(viennacl::traits::stride2(mat)));
+ kernel_.arg(current_arg_++, cl_uint(viennacl::traits::stride1(mat)));
+ }
+ else
+ {
+ kernel_.arg(current_arg_++, cl_uint(viennacl::traits::start1(mat)));
+ kernel_.arg(current_arg_++, cl_uint(viennacl::traits::start2(mat)));
+ kernel_.arg(current_arg_++, cl_uint(viennacl::traits::stride1(mat)));
+ kernel_.arg(current_arg_++, cl_uint(viennacl::traits::stride2(mat)));
+ }
+ }
+ }
+
+ /** @brief Implicit matrix mapping */
+ template<class NumericT>
+ result_type operator()(implicit_matrix_base<NumericT> const & mat) const
+ {
+ kernel_.arg(current_arg_++, typename viennacl::result_of::cl_type<NumericT>::type(mat.value()));
+ }
+
+ /** @brief Traversal functor: */
+ void operator()(scheduler::statement const & statement, vcl_size_t root_idx, leaf_t leaf_t) const
+ {
+ scheduler::statement_node const & root_node = statement.array()[root_idx];
+ if (leaf_t==LHS_NODE_TYPE && root_node.lhs.type_family != scheduler::COMPOSITE_OPERATION_FAMILY)
+ utils::call_on_element(root_node.lhs, *this);
+ else if (leaf_t==RHS_NODE_TYPE && root_node.rhs.type_family != scheduler::COMPOSITE_OPERATION_FAMILY)
+ utils::call_on_element(root_node.rhs, *this);
+ }
+
+ private:
+ symbolic_binder & binder_;
+ unsigned int & current_arg_;
+ viennacl::ocl::kernel & kernel_;
+ };
+
+protected:
+
+ static void generate_prototype(utils::kernel_generation_stream & stream, std::string const & name, std::string const & first_arguments, std::vector<mapping_type> const & mappings, statements_container const &statements,
+ std::map<std::string, unsigned int> const & widths)
+ {
+ statements_container::data_type::const_iterator sit;
+ std::vector<mapping_type>::const_iterator mit;
+ std::set<std::string> already_generated;
+
+ std::string arguments = first_arguments;
+ for (mit = mappings.begin(), sit = statements.data().begin(); sit != statements.data().end(); ++sit, ++mit)
+ tree_parsing::traverse(*sit, sit->root(), prototype_generation_traversal(already_generated, arguments, *mit, widths), true);
+ arguments.erase(arguments.size()-1); //Last comma pruned
+ stream << "__kernel " << "void " << name << "(" << arguments << ")" << std::endl;
+ }
+
+ static void generate_prototype(utils::kernel_generation_stream & stream, std::string const & name, std::string const & first_arguments, std::vector<mapping_type> const & mappings, statements_container const & statements)
+ {
+ generate_prototype(stream, name, first_arguments, mappings, statements, std::map<std::string, unsigned int>());
+ }
+
+ void set_arguments(statements_container const & statements, viennacl::ocl::kernel & kernel, unsigned int & current_arg)
+ {
+ tools::shared_ptr<symbolic_binder> binder = make_binder(binding_policy_);
+ for (statements_container::data_type::const_iterator itt = statements.data().begin(); itt != statements.data().end(); ++itt)
+ tree_parsing::traverse(*itt, itt->root(), set_arguments_functor(*binder,current_arg,kernel), true);
+ }
+
+ class invalid_template_exception : public std::exception
+ {
+ public:
+ invalid_template_exception() : message_() {}
+ invalid_template_exception(std::string message) :
+ message_("ViennaCL: Internal error: The generator cannot apply the given template to the given statement: " + message + "\n"
+ "If you are using a builtin template, please report on viennacl-support@lists.sourceforge.net! We will provide a fix as soon as possible\n"
+ "If you are using your own template, please try using other parameters") {}
+ virtual const char* what() const throw() { return message_.c_str(); }
+ virtual ~invalid_template_exception() throw() {}
+ private:
+ std::string message_;
+ };
+
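+  /** @brief Computes the loop start, bound and increment strings for the two global fetching policies: strided (work items interleave with stride domain_size) or contiguous (each work item owns one chunk) */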
+ static void fetching_loop_info(fetching_policy_type policy, std::string const & bound, utils::kernel_generation_stream & stream, std::string & init, std::string & upper_bound, std::string & inc, std::string const & domain_id, std::string const & domain_size)
+ {
+ if (policy==FETCH_FROM_GLOBAL_STRIDED)
+ {
+ init = domain_id;
+ upper_bound = bound;
+ inc = domain_size;
+ }
+ else if (policy==FETCH_FROM_GLOBAL_CONTIGUOUS)
+ {
+ std::string chunk_size = "chunk_size";
+ std::string chunk_start = "chunk_start";
+ std::string chunk_end = "chunk_end";
+
+ stream << "unsigned int " << chunk_size << " = (" << bound << "+" << domain_size << "-1)/" << domain_size << ";" << std::endl;
+ stream << "unsigned int " << chunk_start << " =" << domain_id << "*" << chunk_size << ";" << std::endl;
+ stream << "unsigned int " << chunk_end << " = min(" << chunk_start << "+" << chunk_size << ", " << bound << ");" << std::endl;
+ init = chunk_start;
+ upper_bound = chunk_end;
+ inc = "1";
+ }
+ }
+
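+  /** @brief Walks the composite lhs/rhs chain below root_idx and returns whether the reached leaf is effectively transposed (the flag toggles once per unary transpose node) */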
+ static bool is_node_trans(scheduler::statement::container_type const & array, vcl_size_t root_idx, leaf_t leaf_type)
+ {
+ bool res = false;
+ scheduler::lhs_rhs_element scheduler::statement_node::*ptr;
+ if (leaf_type==LHS_NODE_TYPE)
+ ptr = &scheduler::statement_node::lhs;
+ else
+ ptr = &scheduler::statement_node::rhs;
+ scheduler::statement_node const * node = &array[root_idx];
+ while ((node->*ptr).type_family==scheduler::COMPOSITE_OPERATION_FAMILY)
+ {
+ if (array[(node->*ptr).node_index].op.type==scheduler::OPERATION_UNARY_TRANS_TYPE)
+ res = !res;
+ node = &array[(node->*ptr).node_index];
+ }
+ return res;
+ }
+
+protected:
+
+ static std::string append_simd_suffix(std::string const & str, unsigned int i)
+ {
+ assert(i < 16);
+ static char suffixes[] = {'0','1','2','3','4','5','6','7','8','9',
+ 'a','b','c','d','e','f'};
+ return str + tools::to_string(suffixes[i]);
+ }
+
+ static bool is_striding_operator(scheduler::statement_node const & node)
+ {
+ return node.op.type==scheduler::OPERATION_BINARY_MATRIX_COLUMN_TYPE
+ || node.op.type==scheduler::OPERATION_BINARY_MATRIX_ROW_TYPE
+ || node.op.type==scheduler::OPERATION_BINARY_MATRIX_DIAG_TYPE;
+ }
+
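+  /** @brief Returns true if any vector or matrix operand uses a non-unit stride, or if a row/column/diag extraction operator occurs in the statements */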
+ static bool has_strided_access(statements_container const & statements)
+ {
+ for (statements_container::data_type::const_iterator it = statements.data().begin(); it != statements.data().end(); ++it)
+ {
+ //checks for vectors
+ std::vector<scheduler::lhs_rhs_element> vectors;
+ tree_parsing::traverse(*it, it->root(), tree_parsing::filter_elements(scheduler::DENSE_VECTOR_TYPE, vectors), true);
+ for (std::vector<scheduler::lhs_rhs_element>::iterator itt = vectors.begin(); itt != vectors.end(); ++itt)
+ if (utils::call_on_vector(*itt, utils::stride_fun())>1)
+ return true;
+
+ //checks for matrix
+ std::vector<scheduler::lhs_rhs_element> matrices;
+ tree_parsing::traverse(*it, it->root(), tree_parsing::filter_elements(scheduler::DENSE_MATRIX_TYPE, matrices), true);
+ for (std::vector<scheduler::lhs_rhs_element>::iterator itt = matrices.begin(); itt != matrices.end(); ++itt)
+        if (utils::call_on_matrix(*itt, utils::stride1_fun())>1 || utils::call_on_matrix(*itt, utils::stride2_fun())>1)
+ return true;
+
+ std::vector<vcl_size_t> striding_operators;
+ tree_parsing::traverse(*it, it->root(), tree_parsing::filter(&is_striding_operator, striding_operators), false);
+ if(striding_operators.size() > 0)
+ return true;
+ }
+ return false;
+ }
+
+ static vcl_size_t vector_size(scheduler::statement_node const & node, bool up_to_internal_size)
+ {
+ using namespace scheduler;
+ using namespace utils;
+ if (node.op.type==OPERATION_BINARY_MATRIX_DIAG_TYPE)
+ {
+ vcl_size_t size1 = up_to_internal_size?call_on_matrix(node.lhs, internal_size1_fun()):call_on_matrix(node.lhs, size1_fun());
+ vcl_size_t size2 = up_to_internal_size?call_on_matrix(node.lhs, internal_size2_fun()):call_on_matrix(node.lhs, size2_fun());
+ return std::min<vcl_size_t>(size1, size2);
+ }
+ else if (node.op.type==OPERATION_BINARY_MATRIX_ROW_TYPE)
+ return up_to_internal_size?call_on_matrix(node.lhs, internal_size2_fun()):call_on_matrix(node.lhs, size2_fun());
+ else if (node.op.type==OPERATION_BINARY_MATRIX_COLUMN_TYPE)
+ return up_to_internal_size?call_on_matrix(node.lhs, internal_size1_fun()):call_on_matrix(node.lhs, size1_fun());
+ else
+ return up_to_internal_size?call_on_vector(node.lhs, internal_size_fun()):call_on_vector(node.lhs, size_fun());
+ }
+
+  //NB: templates are not used here because they would require declaring the functor outside the generate() functions, which would be harder to read
+ struct loop_body_base
+ {
+ virtual void operator()(utils::kernel_generation_stream & stream, unsigned int simd_width) const = 0;
+ virtual ~loop_body_base() {}
+ };
+
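+  /** @brief Emits a vectorized loop over bound/simd_width elements followed, for simd_width > 1, by a scalar remainder loop */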
+ static void element_wise_loop_1D(utils::kernel_generation_stream & stream, loop_body_base const & loop_body,
+ fetching_policy_type fetch, unsigned int simd_width, std::string const & i, std::string const & bound, std::string const & domain_id, std::string const & domain_size)
+ {
+ std::string strwidth = tools::to_string(simd_width);
+ std::string boundround = bound + "/" + strwidth;
+
+ std::string init, upper_bound, inc;
+ fetching_loop_info(fetch, boundround, stream, init, upper_bound, inc, domain_id, domain_size);
+ stream << "for(unsigned int " << i << " = " << init << "; " << i << " < " << upper_bound << "; " << i << " += " << inc << ")" << std::endl;
+ stream << "{" << std::endl;
+ stream.inc_tab();
+ loop_body(stream, simd_width);
+ stream.dec_tab();
+ stream << "}" << std::endl;
+
+ if (simd_width>1)
+ {
+ stream << "for(unsigned int " << i << " = " << boundround << "*" << strwidth << " + " << domain_id << "; " << i << " < " << bound << "; " << i << " += " + domain_size + ")" << std::endl;
+ stream << "{" << std::endl;
+ stream.inc_tab();
+ loop_body(stream, 1);
+ stream.dec_tab();
+ stream << "}" << std::endl;
+ }
+ }
+
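+  /** @brief Returns the OpenCL expression storing a (possibly vectorized) value: plain indexed assignment for simd_width==1, vstoreN otherwise */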
+ static std::string vstore(unsigned int simd_width, std::string const & value, std::string const & offset, std::string const & ptr)
+ {
+ if (simd_width==1)
+ return "(" + ptr + ")[" + offset + "] = " + value;
+ else
+ return utils::append_width("vstore", simd_width) + "(" + value + ", " + offset + ", " + ptr + ")";
+ }
+
+ static std::string vload(unsigned int simd_width, std::string const & offset, std::string const & ptr)
+ {
+ if (simd_width==1)
+ return "(" + ptr + ")[" + offset + "]";
+ else
+ return utils::append_width("vload", simd_width) + "(" + offset + ", " + ptr + ")";
+ }
+
+private:
+ /** @brief Generates the body of the associated kernel function */
+ virtual std::vector<std::string> generate_impl(std::string const & kernel_prefix, statements_container const & statements, std::vector<mapping_type> const & mapping) const = 0;
+
+public:
+ template_base(binding_policy_t binding_policy) : binding_policy_(binding_policy) {}
+
+ virtual ~template_base(){ }
+
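+  /** @brief Validates the statements for the given device, builds the symbolic mappings and returns the generated kernel sources */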
+ std::vector<std::string> generate(std::string const & kernel_prefix, statements_container const & statements, viennacl::ocl::device const & device)
+ {
+ statements_container::data_type::const_iterator sit;
+ std::vector<mapping_type>::iterator mit;
+
+ if(int err = check_invalid(statements, device))
+ throw generator_not_supported_exception("The supplied parameters for this template are invalid : err " + tools::to_string(err));
+
+ //Create mapping
+ std::vector<mapping_type> mappings(statements.data().size());
+ tools::shared_ptr<symbolic_binder> binder = make_binder(binding_policy_);
+ for (mit = mappings.begin(), sit = statements.data().begin(); sit != statements.data().end(); ++sit, ++mit)
+ tree_parsing::traverse(*sit, sit->root(), map_functor(*binder,*mit), true);
+
+ return generate_impl(kernel_prefix, statements, mappings);
+ }
+
+  /** @brief Returns whether or not the profile has undefined behavior on a particular device */
+ virtual int check_invalid(statements_container const & statements, viennacl::ocl::device const & device) const = 0;
+
+ virtual void enqueue(std::string const & kernel_prefix, std::vector<lazy_program_compiler> & programs, statements_container const & statements) = 0;
+
+ virtual tools::shared_ptr<template_base> clone() const = 0;
+private:
+ binding_policy_t binding_policy_;
+};
+
+
+template<class TemplateType, class ParametersType>
+class template_base_impl : public template_base
+{
+private:
+ virtual int check_invalid_impl(viennacl::ocl::device const & /*dev*/) const { return TEMPLATE_VALID; }
+
+ virtual unsigned int n_lmem_elements() const { return 0; }
+
+public:
+ typedef ParametersType parameters_type;
+
+ /** @brief The constructor */
+ template_base_impl(parameters_type const & parameters, binding_policy_t binding_policy) : template_base(binding_policy), p_(parameters){ }
+
+ parameters_type const & parameters() const
+ {
+ return p_;
+ }
+
+ tools::shared_ptr<template_base> clone() const
+ {
+ return tools::shared_ptr<template_base>(new TemplateType(*dynamic_cast<TemplateType const *>(this)));
+ }
+
+ /** @brief returns whether or not the profile has undefined behavior on a particular device */
+ int check_invalid(statements_container const & statements, viennacl::ocl::device const & device) const
+ {
+ using namespace viennacl::tools;
+
+ scheduler::statement const & statement = statements.data().front();
+ unsigned int scalartype_size = utils::size_of(lhs_most(statement.array(), statement.root()).lhs.numeric_type);
+
+ //Query device information
+ vcl_size_t lmem_available = static_cast<vcl_size_t>(device.local_mem_size());
+ vcl_size_t lmem_usage = scalartype_size*n_lmem_elements();
+ if (lmem_usage>lmem_available)
+ return TEMPLATE_LOCAL_MEMORY_OVERFLOW;
+
+ //Invalid work group size
+ vcl_size_t max_workgroup_size = device.max_work_group_size();
+ std::vector<vcl_size_t> max_work_item_sizes = device.max_work_item_sizes();
+ if (p_.local_size_0*p_.local_size_1 > max_workgroup_size)
+ return TEMPLATE_WORK_GROUP_SIZE_OVERFLOW;
+ if (p_.local_size_0 > max_work_item_sizes[0])
+ return TEMPLATE_LOCAL_SIZE_0_OVERFLOW;
+
+ if (p_.local_size_1 > max_work_item_sizes[1])
+ return TEMPLATE_LOCAL_SIZE_1_OVERFLOW;
+
+ //Advice from the Intel guide
+ unsigned int warp_size = 8;
+ if (device.type()==CL_DEVICE_TYPE_GPU)
+ {
+ //Advice from the nvidia guide
+ warp_size = 32;
+ //Advice from the AMD guide
+ if (device.vendor_id()==4098)
+ warp_size = 64;
+ }
+ if (((p_.local_size_0*p_.local_size_1)%warp_size)>0)
+ return TEMPLATE_LOCAL_SIZE_NOT_WARP_MULTIPLE;
+
+ //Invalid SIMD Width
+ if (p_.simd_width!=1 && p_.simd_width!=2 &&
+ p_.simd_width!=4 && p_.simd_width!=8 &&
+ p_.simd_width!=16)
+ return TEMPLATE_INVALID_SIMD_WIDTH;
+
+ return check_invalid_impl(device);
+ }
+
+protected:
+ parameters_type p_;
+};
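The check_invalid() member above rejects a profile before any kernel is compiled: it checks local memory usage, the overall work group size, the per-dimension work item limits, warp-multiple alignment, and the SIMD width. A compact stand-alone sketch of the same rejection logic with hypothetical device limits and parameter values (the numeric error codes stand in for the TEMPLATE_* enumerators; illustrative only):

  #include <cstddef>

  // Hypothetical parameter/device values; mirrors the checks in check_invalid() above.
  struct Params { std::size_t local_size_0, local_size_1, simd_width; };

  int check(Params p,
            std::size_t lmem_available,      // device.local_mem_size()
            std::size_t lmem_usage,          // scalartype_size * n_lmem_elements()
            std::size_t max_wg,              // device.max_work_group_size()
            std::size_t max_wi0, std::size_t max_wi1,
            unsigned warp_size)              // 8 (CPU), 32 (NVIDIA), 64 (AMD)
  {
    if (lmem_usage > lmem_available)                   return 1; // local memory overflow
    if (p.local_size_0 * p.local_size_1 > max_wg)      return 2; // work group too large
    if (p.local_size_0 > max_wi0)                      return 3;
    if (p.local_size_1 > max_wi1)                      return 4;
    if ((p.local_size_0 * p.local_size_1) % warp_size) return 5; // not a warp multiple
    if (p.simd_width != 1 && p.simd_width != 2 && p.simd_width != 4 &&
        p.simd_width != 8 && p.simd_width != 16)       return 6; // invalid SIMD width
    return 0; // valid
  }

  int main() { return check({16, 8, 4}, 48*1024, 4096, 1024, 1024, 1024, 32); } // returns 0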
+
+}
+}
+
+#endif
[10/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241
Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/compressed_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/compressed_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/compressed_matrix.hpp
new file mode 100644
index 0000000..8645e7d
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/compressed_matrix.hpp
@@ -0,0 +1,1703 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_COMPRESSED_MATRIX_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_COMPRESSED_MATRIX_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+#include "viennacl/linalg/opencl/common.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/compressed_matrix.hpp
+ * @brief OpenCL kernel file for compressed_matrix operations */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+//////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+template<typename StringT>
+void generate_compressed_matrix_block_trans_lu_backward(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void block_trans_lu_backward( \n");
+ source.append(" __global const unsigned int * row_jumper_U, \n"); //U part (note that U is transposed in memory)
+ source.append(" __global const unsigned int * column_indices_U, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements_U, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * diagonal_U, \n");
+ source.append(" __global const unsigned int * block_offsets, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
+ source.append(" unsigned int size) \n");
+ source.append("{ \n");
+ source.append(" unsigned int col_start = block_offsets[2*get_group_id(0)]; \n");
+ source.append(" unsigned int col_stop = block_offsets[2*get_group_id(0)+1]; \n");
+ source.append(" unsigned int row_start; \n");
+ source.append(" unsigned int row_stop; \n");
+ source.append(" "); source.append(numeric_string); source.append(" result_entry = 0; \n");
+
+ source.append(" if (col_start >= col_stop) \n");
+ source.append(" return; \n");
+
+ //backward elimination, using U and diagonal_U
+ source.append(" for (unsigned int iter = 0; iter < col_stop - col_start; ++iter) \n");
+ source.append(" { \n");
+ source.append(" unsigned int col = (col_stop - iter) - 1; \n");
+ source.append(" result_entry = result[col] / diagonal_U[col]; \n");
+ source.append(" row_start = row_jumper_U[col]; \n");
+ source.append(" row_stop = row_jumper_U[col + 1]; \n");
+ source.append(" for (unsigned int buffer_index = row_start + get_local_id(0); buffer_index < row_stop; buffer_index += get_local_size(0)) \n");
+ source.append(" result[column_indices_U[buffer_index]] -= result_entry * elements_U[buffer_index]; \n");
+ source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
+ source.append(" } \n");
+
+ //divide result vector by diagonal:
+ source.append(" for (unsigned int col = col_start + get_local_id(0); col < col_stop; col += get_local_size(0)) \n");
+ source.append(" result[col] /= diagonal_U[col]; \n");
+ source.append("} \n");
+}
+
+template<typename StringT>
+void generate_compressed_matrix_block_trans_unit_lu_forward(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void block_trans_unit_lu_forward( \n");
+ source.append(" __global const unsigned int * row_jumper_L, \n"); //L part (note that L is transposed in memory)
+ source.append(" __global const unsigned int * column_indices_L, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements_L, \n");
+ source.append(" __global const unsigned int * block_offsets, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
+ source.append(" unsigned int size) \n");
+ source.append("{ \n");
+ source.append(" unsigned int col_start = block_offsets[2*get_group_id(0)]; \n");
+ source.append(" unsigned int col_stop = block_offsets[2*get_group_id(0)+1]; \n");
+ source.append(" unsigned int row_start = row_jumper_L[col_start]; \n");
+ source.append(" unsigned int row_stop; \n");
+ source.append(" "); source.append(numeric_string); source.append(" result_entry = 0; \n");
+
+ source.append(" if (col_start >= col_stop) \n");
+ source.append(" return; \n");
+
+ //forward elimination, using L:
+ source.append(" for (unsigned int col = col_start; col < col_stop; ++col) \n");
+ source.append(" { \n");
+ source.append(" result_entry = result[col]; \n");
+ source.append(" row_stop = row_jumper_L[col + 1]; \n");
+ source.append(" for (unsigned int buffer_index = row_start + get_local_id(0); buffer_index < row_stop; buffer_index += get_local_size(0)) \n");
+ source.append(" result[column_indices_L[buffer_index]] -= result_entry * elements_L[buffer_index]; \n");
+ source.append(" row_start = row_stop; \n"); //for next iteration (avoid unnecessary loads from GPU RAM)
+ source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
+ source.append(" } \n");
+
+ source.append("}; \n");
+}
+
+namespace detail
+{
+ /** @brief Generate kernel for C = A * B with A being a compressed_matrix, B and C dense */
+ template<typename StringT>
+ void generate_compressed_matrix_dense_matrix_mult(StringT & source, std::string const & numeric_string,
+ bool B_transposed, bool B_row_major, bool C_row_major)
+ {
+ source.append("__kernel void ");
+ source.append(viennacl::linalg::opencl::detail::sparse_dense_matmult_kernel_name(B_transposed, B_row_major, C_row_major));
+ source.append("( \n");
+ source.append(" __global const unsigned int * sp_mat_row_indices, \n");
+ source.append(" __global const unsigned int * sp_mat_col_indices, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * sp_mat_elements, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * d_mat, \n");
+ source.append(" unsigned int d_mat_row_start, \n");
+ source.append(" unsigned int d_mat_col_start, \n");
+ source.append(" unsigned int d_mat_row_inc, \n");
+ source.append(" unsigned int d_mat_col_inc, \n");
+ source.append(" unsigned int d_mat_row_size, \n");
+ source.append(" unsigned int d_mat_col_size, \n");
+ source.append(" unsigned int d_mat_internal_rows, \n");
+ source.append(" unsigned int d_mat_internal_cols, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
+ source.append(" unsigned int result_row_start, \n");
+ source.append(" unsigned int result_col_start, \n");
+ source.append(" unsigned int result_row_inc, \n");
+ source.append(" unsigned int result_col_inc, \n");
+ source.append(" unsigned int result_row_size, \n");
+ source.append(" unsigned int result_col_size, \n");
+ source.append(" unsigned int result_internal_rows, \n");
+ source.append(" unsigned int result_internal_cols) { \n");
+
+ // split work rows (sparse matrix rows) to thread groups
+ source.append(" for (unsigned int row = get_group_id(0); row < result_row_size; row += get_num_groups(0)) { \n");
+
+ source.append(" unsigned int row_start = sp_mat_row_indices[row]; \n");
+ source.append(" unsigned int row_end = sp_mat_row_indices[row+1]; \n");
+
+ // split result cols between threads in a thread group
+ source.append(" for ( unsigned int col = get_local_id(0); col < result_col_size; col += get_local_size(0) ) { \n");
+
+ source.append(" "); source.append(numeric_string); source.append(" r = 0; \n");
+
+ source.append(" for (unsigned int k = row_start; k < row_end; k ++) { \n");
+
+ source.append(" unsigned int j = sp_mat_col_indices[k]; \n");
+ source.append(" "); source.append(numeric_string); source.append(" x = sp_mat_elements[k]; \n");
+
+ source.append(" "); source.append(numeric_string);
+ if (B_transposed && B_row_major)
+ source.append(" y = d_mat[ (d_mat_row_start + col * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + j * d_mat_col_inc ]; \n");
+ else if (B_transposed && !B_row_major)
+ source.append(" y = d_mat[ (d_mat_row_start + col * d_mat_row_inc) + (d_mat_col_start + j * d_mat_col_inc) * d_mat_internal_rows ]; \n");
+ else if (!B_transposed && B_row_major)
+ source.append(" y = d_mat[ (d_mat_row_start + j * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + col * d_mat_col_inc ]; \n");
+ else
+ source.append(" y = d_mat[ (d_mat_row_start + j * d_mat_row_inc) + (d_mat_col_start + col * d_mat_col_inc) * d_mat_internal_rows ]; \n");
+ source.append(" r += x * y; \n");
+ source.append(" } \n");
+
+ if (C_row_major)
+ source.append(" result[ (result_row_start + row * result_row_inc) * result_internal_cols + result_col_start + col * result_col_inc ] = r; \n");
+ else
+ source.append(" result[ (result_row_start + row * result_row_inc) + (result_col_start + col * result_col_inc) * result_internal_rows ] = r; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ source.append("} \n");
+
+ }
+}
+template<typename StringT>
+void generate_compressed_matrix_dense_matrix_multiplication(StringT & source, std::string const & numeric_string)
+{
+ detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false, false, false);
+ detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false, false, true);
+ detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false, true, false);
+ detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false, true, true);
+
+ detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true, false, false);
+ detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true, false, true);
+ detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true, true, false);
+ detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true, true, true);
+}
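Each of the eight kernels enumerated above realizes the same triple loop; only the addressing of B and C changes with the transpose flag and the storage layout. For reference, a host-side sketch of the simplest variant (B not transposed, B and C row-major), assuming plain std::vector CSR storage rather than ViennaCL types (illustrative only):

  #include <vector>
  #include <cstddef>

  // C = A * B with A in CSR format (m x k) and B (k x n), C (m x n) dense row-major.
  void csr_times_dense(std::size_t m, std::size_t n,
                       const std::vector<unsigned>& row_ptr,
                       const std::vector<unsigned>& col_idx,
                       const std::vector<double>& values,
                       const std::vector<double>& B,   // k x n, row-major
                       std::vector<double>& C)         // m x n, row-major
  {
    for (std::size_t row = 0; row < m; ++row)
      for (std::size_t col = 0; col < n; ++col)
      {
        double r = 0;
        for (unsigned k = row_ptr[row]; k < row_ptr[row + 1]; ++k)
          r += values[k] * B[col_idx[k] * n + col];
        C[row * n + col] = r;
      }
  }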
+
+template<typename StringT>
+void generate_compressed_matrix_jacobi(StringT & source, std::string const & numeric_string)
+{
+
+ source.append(" __kernel void jacobi( \n");
+ source.append(" __global const unsigned int * row_indices, \n");
+ source.append(" __global const unsigned int * column_indices, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" "); source.append(numeric_string); source.append(" weight, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * old_result, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * new_result, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * rhs, \n");
+ source.append(" unsigned int size) \n");
+ source.append(" { \n");
+ source.append(" "); source.append(numeric_string); source.append(" sum, diag=1; \n");
+ source.append(" int col; \n");
+ source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n");
+ source.append(" { \n");
+ source.append(" sum = 0; \n");
+ source.append(" for (unsigned int j = row_indices[i]; j<row_indices[i+1]; j++) \n");
+ source.append(" { \n");
+ source.append(" col = column_indices[j]; \n");
+ source.append(" if (i == col) \n");
+ source.append(" diag = elements[j]; \n");
+ source.append(" else \n");
+ source.append(" sum += elements[j] * old_result[col]; \n");
+ source.append(" } \n");
+ source.append(" new_result[i] = weight * (rhs[i]-sum) / diag + (1-weight) * old_result[i]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+}
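The generated jacobi kernel performs one weighted Jacobi sweep, x_new[i] = w * (b[i] - sum_{j != i} a_ij * x_old[j]) / a_ii + (1 - w) * x_old[i], with one work item per row. A sequential host-side sketch of the same update, assuming CSR storage in std::vector containers (illustrative only):

  #include <vector>
  #include <cstddef>

  // One weighted Jacobi sweep over a CSR matrix; diag defaults to 1 as in the kernel.
  void jacobi_sweep(const std::vector<unsigned>& row_ptr,
                    const std::vector<unsigned>& col_idx,
                    const std::vector<double>& values,
                    double weight,
                    const std::vector<double>& x_old,
                    std::vector<double>& x_new,
                    const std::vector<double>& rhs)
  {
    for (std::size_t i = 0; i + 1 < row_ptr.size(); ++i)
    {
      double sum = 0, diag = 1;
      for (unsigned j = row_ptr[i]; j < row_ptr[i + 1]; ++j)
      {
        if (col_idx[j] == i) diag = values[j];
        else                 sum += values[j] * x_old[col_idx[j]];
      }
      x_new[i] = weight * (rhs[i] - sum) / diag + (1 - weight) * x_old[i];
    }
  }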
+
+
+template<typename StringT>
+void generate_compressed_matrix_lu_backward(StringT & source, std::string const & numeric_string)
+{
+ // compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format
+ source.append("__kernel void lu_backward( \n");
+ source.append(" __global const unsigned int * row_indices, \n");
+ source.append(" __global const unsigned int * column_indices, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
+ source.append(" unsigned int size) \n");
+ source.append("{ \n");
+ source.append(" __local unsigned int col_index_buffer[128]; \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");
+
+ source.append(" unsigned int nnz = row_indices[size]; \n");
+ source.append(" unsigned int current_row = size-1; \n");
+ source.append(" unsigned int row_at_window_start = size-1; \n");
+ source.append(" "); source.append(numeric_string); source.append(" current_vector_entry = vector[size-1]; \n");
+ source.append(" "); source.append(numeric_string); source.append(" diagonal_entry = 0; \n");
+ source.append(" unsigned int loop_end = ( (nnz - 1) / get_local_size(0)) * get_local_size(0); \n");
+ source.append(" unsigned int next_row = row_indices[size-1]; \n");
+
+ source.append(" unsigned int i = loop_end + get_local_id(0); \n");
+ source.append(" while (1) \n");
+ source.append(" { \n");
+ //load into shared memory (coalesced access):
+ source.append(" if (i < nnz) \n");
+ source.append(" { \n");
+ source.append(" element_buffer[get_local_id(0)] = elements[i]; \n");
+ source.append(" unsigned int tmp = column_indices[i]; \n");
+ source.append(" col_index_buffer[get_local_id(0)] = tmp; \n");
+ source.append(" vector_buffer[get_local_id(0)] = vector[tmp]; \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ //now a single thread does the remaining work in shared memory:
+ source.append(" if (get_local_id(0) == 0) \n");
+ source.append(" { \n");
+ // traverse through all the loaded data from back to front:
+ source.append(" for (unsigned int k2=0; k2<get_local_size(0); ++k2) \n");
+ source.append(" { \n");
+ source.append(" unsigned int k = (get_local_size(0) - k2) - 1; \n");
+
+ source.append(" if (i+k >= nnz) \n");
+ source.append(" continue; \n");
+
+ source.append(" if (col_index_buffer[k] > row_at_window_start) \n"); //use recently computed results
+ source.append(" current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
+ source.append(" else if (col_index_buffer[k] > current_row) \n"); //use buffered data
+ source.append(" current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
+ source.append(" else if (col_index_buffer[k] == current_row) \n");
+ source.append(" diagonal_entry = element_buffer[k]; \n");
+
+ source.append(" if (i+k == next_row) \n"); //current row is finished. Write back result
+ source.append(" { \n");
+ source.append(" vector[current_row] = current_vector_entry / diagonal_entry; \n");
+ source.append(" if (current_row > 0) //load next row's data \n");
+ source.append(" { \n");
+ source.append(" --current_row; \n");
+ source.append(" next_row = row_indices[current_row]; \n");
+ source.append(" current_vector_entry = vector[current_row]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+
+ source.append(" } \n"); // for k
+
+ source.append(" row_at_window_start = current_row; \n");
+ source.append(" } \n"); // if (get_local_id(0) == 0)
+
+ source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
+
+ source.append(" if (i < get_local_size(0)) \n");
+ source.append(" break; \n");
+
+ source.append(" i -= get_local_size(0); \n");
+ source.append(" } \n"); //for i
+ source.append("} \n");
+
+}
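The lu_backward kernel stages matrix entries in local memory and lets a single work item per group run the recurrence; the underlying computation is an in-place backward substitution U x = y with the diagonal stored explicitly. A sequential sketch of that recurrence, assuming CSR storage (illustrative only):

  #include <vector>

  // Solve U x = y in place; x holds y on entry and the solution on exit.
  void lu_backward_reference(const std::vector<unsigned>& row_ptr,
                             const std::vector<unsigned>& col_idx,
                             const std::vector<double>& values,
                             std::vector<double>& x)
  {
    for (unsigned row = static_cast<unsigned>(x.size()); row-- > 0; )
    {
      double entry = x[row], diag = 1;
      for (unsigned k = row_ptr[row]; k < row_ptr[row + 1]; ++k)
      {
        if (col_idx[k] > row)       entry -= values[k] * x[col_idx[k]]; // already solved rows
        else if (col_idx[k] == row) diag = values[k];
      }
      x[row] = entry / diag;
    }
  }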
+
+template<typename StringT>
+void generate_compressed_matrix_lu_forward(StringT & source, std::string const & numeric_string)
+{
+
+ // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
+ source.append("__kernel void lu_forward( \n");
+ source.append(" __global const unsigned int * row_indices, \n");
+ source.append(" __global const unsigned int * column_indices, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
+ source.append(" unsigned int size) \n");
+ source.append("{ \n");
+ source.append(" __local unsigned int col_index_buffer[128]; \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");
+
+ source.append(" unsigned int nnz = row_indices[size]; \n");
+ source.append(" unsigned int current_row = 0; \n");
+ source.append(" unsigned int row_at_window_start = 0; \n");
+ source.append(" "); source.append(numeric_string); source.append(" current_vector_entry = vector[0]; \n");
+ source.append(" "); source.append(numeric_string); source.append(" diagonal_entry; \n");
+ source.append(" unsigned int loop_end = (nnz / get_local_size(0) + 1) * get_local_size(0); \n");
+ source.append(" unsigned int next_row = row_indices[1]; \n");
+
+ source.append(" for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
+ source.append(" { \n");
+ //load into shared memory (coalesced access):
+ source.append(" if (i < nnz) \n");
+ source.append(" { \n");
+ source.append(" element_buffer[get_local_id(0)] = elements[i]; \n");
+ source.append(" unsigned int tmp = column_indices[i]; \n");
+ source.append(" col_index_buffer[get_local_id(0)] = tmp; \n");
+ source.append(" vector_buffer[get_local_id(0)] = vector[tmp]; \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ //now a single thread does the remaining work in shared memory:
+ source.append(" if (get_local_id(0) == 0) \n");
+ source.append(" { \n");
+ // traverse through all the loaded data:
+ source.append(" for (unsigned int k=0; k<get_local_size(0); ++k) \n");
+ source.append(" { \n");
+ source.append(" if (current_row < size && i+k == next_row) \n"); //current row is finished. Write back result
+ source.append(" { \n");
+ source.append(" vector[current_row] = current_vector_entry / diagonal_entry; \n");
+ source.append(" ++current_row; \n");
+ source.append(" if (current_row < size) \n"); //load next row's data
+ source.append(" { \n");
+ source.append(" next_row = row_indices[current_row+1]; \n");
+ source.append(" current_vector_entry = vector[current_row]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ source.append(" if (current_row < size && col_index_buffer[k] < current_row) \n"); //substitute
+ source.append(" { \n");
+ source.append(" if (col_index_buffer[k] < row_at_window_start) \n"); //use recently computed results
+ source.append(" current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
+ source.append(" else if (col_index_buffer[k] < current_row) \n"); //use buffered data
+ source.append(" current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
+ source.append(" } \n");
+ source.append(" else if (col_index_buffer[k] == current_row) \n");
+ source.append(" diagonal_entry = element_buffer[k]; \n");
+
+ source.append(" } \n"); // for k
+
+ source.append(" row_at_window_start = current_row; \n");
+ source.append(" } \n"); // if (get_local_id(0) == 0)
+
+ source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
+ source.append(" } \n"); //for i
+ source.append("} \n");
+
+}
+
+template<typename StringT>
+void generate_compressed_matrix_row_info_extractor(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void row_info_extractor( \n");
+ source.append(" __global const unsigned int * row_indices, \n");
+ source.append(" __global const unsigned int * column_indices, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
+ source.append(" unsigned int size, \n");
+ source.append(" unsigned int option \n");
+ source.append(" ) \n");
+ source.append("{ \n");
+ source.append(" for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0)) \n");
+ source.append(" { \n");
+ source.append(" "); source.append(numeric_string); source.append(" value = 0; \n");
+ source.append(" unsigned int row_end = row_indices[row+1]; \n");
+
+ source.append(" switch (option) \n");
+ source.append(" { \n");
+ source.append(" case 0: \n"); //inf-norm
+ source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
+ source.append(" value = max(value, fabs(elements[i])); \n");
+ source.append(" break; \n");
+
+ source.append(" case 1: \n"); //1-norm
+ source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
+ source.append(" value += fabs(elements[i]); \n");
+ source.append(" break; \n");
+
+ source.append(" case 2: \n"); //2-norm
+ source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
+ source.append(" value += elements[i] * elements[i]; \n");
+ source.append(" value = sqrt(value); \n");
+ source.append(" break; \n");
+
+ source.append(" case 3: \n"); //diagonal entry
+ source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
+ source.append(" { \n");
+ source.append(" if (column_indices[i] == row) \n");
+ source.append(" { \n");
+ source.append(" value = elements[i]; \n");
+ source.append(" break; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append(" break; \n");
+
+ source.append(" default: \n");
+ source.append(" break; \n");
+ source.append(" } \n");
+ source.append(" result[row] = value; \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+}
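For reference, the per-row quantities computed by row_info_extractor, written as a sequential host-side function over a CSR matrix (option 0: inf-norm, 1: 1-norm, 2: 2-norm, 3: diagonal entry; illustrative only):

  #include <vector>
  #include <cmath>
  #include <algorithm>

  double row_info(const std::vector<unsigned>& row_ptr,
                  const std::vector<unsigned>& col_idx,
                  const std::vector<double>& values,
                  unsigned row, int option)
  {
    double value = 0;
    for (unsigned i = row_ptr[row]; i < row_ptr[row + 1]; ++i)
    {
      switch (option)
      {
        case 0: value = std::max(value, std::fabs(values[i])); break; // inf-norm
        case 1: value += std::fabs(values[i]);                 break; // 1-norm
        case 2: value += values[i] * values[i];                break; // 2-norm (squared so far)
        case 3: if (col_idx[i] == row) return values[i];       break; // diagonal entry
        default: break;
      }
    }
    return (option == 2) ? std::sqrt(value) : value;
  }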
+
+template<typename StringT>
+void generate_compressed_matrix_trans_lu_backward(StringT & source, std::string const & numeric_string)
+{
+
+ // compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format, with U stored transposed
+ source.append("__kernel void trans_lu_backward( \n");
+ source.append(" __global const unsigned int * row_indices, \n");
+ source.append(" __global const unsigned int * column_indices, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * diagonal_entries, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
+ source.append(" unsigned int size) \n");
+ source.append("{ \n");
+ source.append(" __local unsigned int row_index_lookahead[256]; \n");
+ source.append(" __local unsigned int row_index_buffer[256]; \n");
+
+ source.append(" unsigned int row_index; \n");
+ source.append(" unsigned int col_index; \n");
+ source.append(" "); source.append(numeric_string); source.append(" matrix_entry; \n");
+ source.append(" unsigned int nnz = row_indices[size]; \n");
+ source.append(" unsigned int row_at_window_start = size; \n");
+ source.append(" unsigned int row_at_window_end; \n");
+ source.append(" unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");
+
+ source.append(" for (unsigned int i2 = get_local_id(0); i2 < loop_end; i2 += get_local_size(0)) \n");
+ source.append(" { \n");
+ source.append(" unsigned int i = (nnz - i2) - 1; \n");
+ source.append(" col_index = (i2 < nnz) ? column_indices[i] : 0; \n");
+ source.append(" matrix_entry = (i2 < nnz) ? elements[i] : 0; \n");
+ source.append(" row_index_lookahead[get_local_id(0)] = (row_at_window_start >= get_local_id(0)) ? row_indices[row_at_window_start - get_local_id(0)] : 0; \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ source.append(" if (i2 < nnz) \n");
+ source.append(" { \n");
+ source.append(" unsigned int row_index_dec = 0; \n");
+ source.append(" while (row_index_lookahead[row_index_dec] > i) \n");
+ source.append(" ++row_index_dec; \n");
+ source.append(" row_index = row_at_window_start - row_index_dec; \n");
+ source.append(" row_index_buffer[get_local_id(0)] = row_index; \n");
+ source.append(" } \n");
+ source.append(" else \n");
+ source.append(" { \n");
+ source.append(" row_index = size+1; \n");
+ source.append(" row_index_buffer[get_local_id(0)] = 0; \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ source.append(" row_at_window_start = row_index_buffer[0]; \n");
+ source.append(" row_at_window_end = row_index_buffer[get_local_size(0) - 1]; \n");
+
+ //backward elimination
+ source.append(" for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2) \n");
+ source.append(" { \n");
+ source.append(" unsigned int row = row_at_window_start - row2; \n");
+ source.append(" "); source.append(numeric_string); source.append(" result_entry = vector[row] / diagonal_entries[row]; \n");
+
+ source.append(" if ( (row_index == row) && (col_index < row) ) \n");
+ source.append(" vector[col_index] -= result_entry * matrix_entry; \n");
+
+ source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
+ source.append(" } \n");
+
+ source.append(" row_at_window_start = row_at_window_end; \n");
+ source.append(" } \n");
+
+ // final step: Divide vector by diagonal entries:
+ source.append(" for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0)) \n");
+ source.append(" vector[i] /= diagonal_entries[i]; \n");
+ source.append("} \n");
+
+}
+
+template<typename StringT>
+void generate_compressed_matrix_trans_lu_forward(StringT & source, std::string const & numeric_string)
+{
+
+ // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
+ source.append("__kernel void trans_lu_forward( \n");
+ source.append(" __global const unsigned int * row_indices, \n");
+ source.append(" __global const unsigned int * column_indices, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * diagonal_entries, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
+ source.append(" unsigned int size) \n");
+ source.append("{ \n");
+ source.append(" __local unsigned int row_index_lookahead[256]; \n");
+ source.append(" __local unsigned int row_index_buffer[256]; \n");
+
+ source.append(" unsigned int row_index; \n");
+ source.append(" unsigned int col_index; \n");
+ source.append(" "); source.append(numeric_string); source.append(" matrix_entry; \n");
+ source.append(" unsigned int nnz = row_indices[size]; \n");
+ source.append(" unsigned int row_at_window_start = 0; \n");
+ source.append(" unsigned int row_at_window_end = 0; \n");
+ source.append(" unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");
+
+ source.append(" for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
+ source.append(" { \n");
+ source.append(" col_index = (i < nnz) ? column_indices[i] : 0; \n");
+ source.append(" matrix_entry = (i < nnz) ? elements[i] : 0; \n");
+ source.append(" row_index_lookahead[get_local_id(0)] = (row_at_window_start + get_local_id(0) < size) ? row_indices[row_at_window_start + get_local_id(0)] : nnz; \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ source.append(" if (i < nnz) \n");
+ source.append(" { \n");
+ source.append(" unsigned int row_index_inc = 0; \n");
+ source.append(" while (i >= row_index_lookahead[row_index_inc + 1]) \n");
+ source.append(" ++row_index_inc; \n");
+ source.append(" row_index = row_at_window_start + row_index_inc; \n");
+ source.append(" row_index_buffer[get_local_id(0)] = row_index; \n");
+ source.append(" } \n");
+ source.append(" else \n");
+ source.append(" { \n");
+ source.append(" row_index = size+1; \n");
+ source.append(" row_index_buffer[get_local_id(0)] = size - 1; \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ source.append(" row_at_window_start = row_index_buffer[0]; \n");
+ source.append(" row_at_window_end = row_index_buffer[get_local_size(0) - 1]; \n");
+
+ //forward elimination
+ source.append(" for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row) \n");
+ source.append(" { \n");
+ source.append(" "); source.append(numeric_string); source.append(" result_entry = vector[row] / diagonal_entries[row]; \n");
+
+ source.append(" if ( (row_index == row) && (col_index > row) ) \n");
+ source.append(" vector[col_index] -= result_entry * matrix_entry; \n");
+
+ source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
+ source.append(" } \n");
+
+ source.append(" row_at_window_start = row_at_window_end; \n");
+ source.append(" } \n");
+
+ // final step: Divide vector by diagonal entries:
+ source.append(" for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0)) \n");
+ source.append(" vector[i] /= diagonal_entries[i]; \n");
+ source.append("} \n");
+
+}
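Because the matrix is stored transposed, trans_lu_forward is a column-oriented forward substitution: each row of the stored matrix corresponds to one column of L, updates are scattered to entries below the diagonal, and the division by the diagonal is deferred to a final pass. A sequential sketch of that scheme, assuming CSR storage of L^T in std::vector containers (illustrative only):

  #include <vector>

  // Solve L x = y in place, where row_ptr/col_idx/values describe L^T
  // (each stored "row" lists one column of L) and diagonal holds L[i][i].
  void trans_lu_forward_reference(const std::vector<unsigned>& row_ptr,
                                  const std::vector<unsigned>& col_idx,
                                  const std::vector<double>& values,
                                  const std::vector<double>& diagonal,
                                  std::vector<double>& x)   // y on entry, x on exit
  {
    const unsigned n = static_cast<unsigned>(x.size());
    for (unsigned col = 0; col < n; ++col)                  // column of L
    {
      double entry = x[col] / diagonal[col];                // already equals the solution for this column
      for (unsigned k = row_ptr[col]; k < row_ptr[col + 1]; ++k)
        if (col_idx[k] > col)                               // strictly below the diagonal of L
          x[col_idx[k]] -= entry * values[k];
    }
    for (unsigned i = 0; i < n; ++i)                        // deferred division, as in the kernel
      x[i] /= diagonal[i];
  }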
+
+template<typename StringT>
+void generate_compressed_matrix_trans_unit_lu_backward(StringT & source, std::string const & numeric_string)
+{
+
+ // compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format, with unit-diagonal U stored transposed
+ source.append("__kernel void trans_unit_lu_backward( \n");
+ source.append(" __global const unsigned int * row_indices, \n");
+ source.append(" __global const unsigned int * column_indices, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
+ source.append(" unsigned int size) \n");
+ source.append("{ \n");
+ source.append(" __local unsigned int row_index_lookahead[256]; \n");
+ source.append(" __local unsigned int row_index_buffer[256]; \n");
+
+ source.append(" unsigned int row_index; \n");
+ source.append(" unsigned int col_index; \n");
+ source.append(" "); source.append(numeric_string); source.append(" matrix_entry; \n");
+ source.append(" unsigned int nnz = row_indices[size]; \n");
+ source.append(" unsigned int row_at_window_start = size; \n");
+ source.append(" unsigned int row_at_window_end; \n");
+ source.append(" unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");
+
+ source.append(" for (unsigned int i2 = get_local_id(0); i2 < loop_end; i2 += get_local_size(0)) \n");
+ source.append(" { \n");
+ source.append(" unsigned int i = (nnz - i2) - 1; \n");
+ source.append(" col_index = (i2 < nnz) ? column_indices[i] : 0; \n");
+ source.append(" matrix_entry = (i2 < nnz) ? elements[i] : 0; \n");
+ source.append(" row_index_lookahead[get_local_id(0)] = (row_at_window_start >= get_local_id(0)) ? row_indices[row_at_window_start - get_local_id(0)] : 0; \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ source.append(" if (i2 < nnz) \n");
+ source.append(" { \n");
+ source.append(" unsigned int row_index_dec = 0; \n");
+ source.append(" while (row_index_lookahead[row_index_dec] > i) \n");
+ source.append(" ++row_index_dec; \n");
+ source.append(" row_index = row_at_window_start - row_index_dec; \n");
+ source.append(" row_index_buffer[get_local_id(0)] = row_index; \n");
+ source.append(" } \n");
+ source.append(" else \n");
+ source.append(" { \n");
+ source.append(" row_index = size+1; \n");
+ source.append(" row_index_buffer[get_local_id(0)] = 0; \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ source.append(" row_at_window_start = row_index_buffer[0]; \n");
+ source.append(" row_at_window_end = row_index_buffer[get_local_size(0) - 1]; \n");
+
+ //backward elimination
+ source.append(" for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2) \n");
+ source.append(" { \n");
+ source.append(" unsigned int row = row_at_window_start - row2; \n");
+ source.append(" "); source.append(numeric_string); source.append(" result_entry = vector[row]; \n");
+
+ source.append(" if ( (row_index == row) && (col_index < row) ) \n");
+ source.append(" vector[col_index] -= result_entry * matrix_entry; \n");
+
+ source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
+ source.append(" } \n");
+
+ source.append(" row_at_window_start = row_at_window_end; \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+}
+
+
+template<typename StringT>
+void generate_compressed_matrix_trans_unit_lu_forward(StringT & source, std::string const & numeric_string)
+{
+
+ // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
+ source.append("__kernel void trans_unit_lu_forward( \n");
+ source.append(" __global const unsigned int * row_indices, \n");
+ source.append(" __global const unsigned int * column_indices, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
+ source.append(" unsigned int size) \n");
+ source.append("{ \n");
+ source.append(" __local unsigned int row_index_lookahead[256]; \n");
+ source.append(" __local unsigned int row_index_buffer[256]; \n");
+
+ source.append(" unsigned int row_index; \n");
+ source.append(" unsigned int col_index; \n");
+ source.append(" "); source.append(numeric_string); source.append(" matrix_entry; \n");
+ source.append(" unsigned int nnz = row_indices[size]; \n");
+ source.append(" unsigned int row_at_window_start = 0; \n");
+ source.append(" unsigned int row_at_window_end = 0; \n");
+ source.append(" unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");
+
+ source.append(" for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
+ source.append(" { \n");
+ source.append(" col_index = (i < nnz) ? column_indices[i] : 0; \n");
+ source.append(" matrix_entry = (i < nnz) ? elements[i] : 0; \n");
+ source.append(" row_index_lookahead[get_local_id(0)] = (row_at_window_start + get_local_id(0) < size) ? row_indices[row_at_window_start + get_local_id(0)] : nnz; \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ source.append(" if (i < nnz) \n");
+ source.append(" { \n");
+ source.append(" unsigned int row_index_inc = 0; \n");
+ source.append(" while (i >= row_index_lookahead[row_index_inc + 1]) \n");
+ source.append(" ++row_index_inc; \n");
+ source.append(" row_index = row_at_window_start + row_index_inc; \n");
+ source.append(" row_index_buffer[get_local_id(0)] = row_index; \n");
+ source.append(" } \n");
+ source.append(" else \n");
+ source.append(" { \n");
+ source.append(" row_index = size+1; \n");
+ source.append(" row_index_buffer[get_local_id(0)] = size - 1; \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ source.append(" row_at_window_start = row_index_buffer[0]; \n");
+ source.append(" row_at_window_end = row_index_buffer[get_local_size(0) - 1]; \n");
+
+ //forward elimination
+ source.append(" for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row) \n");
+ source.append(" { \n");
+ source.append(" "); source.append(numeric_string); source.append(" result_entry = vector[row]; \n");
+
+ source.append(" if ( (row_index == row) && (col_index > row) ) \n");
+ source.append(" vector[col_index] -= result_entry * matrix_entry; \n");
+
+ source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
+ source.append(" } \n");
+
+ source.append(" row_at_window_start = row_at_window_end; \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+}
+
+template<typename StringT>
+void generate_compressed_matrix_trans_unit_lu_forward_slow(StringT & source, std::string const & numeric_string)
+{
+
+ // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
+ source.append("__kernel void trans_unit_lu_forward_slow( \n");
+ source.append(" __global const unsigned int * row_indices, \n");
+ source.append(" __global const unsigned int * column_indices, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
+ source.append(" unsigned int size) \n");
+ source.append("{ \n");
+ source.append(" for (unsigned int row = 0; row < size; ++row) \n");
+ source.append(" { \n");
+ source.append(" "); source.append(numeric_string); source.append(" result_entry = vector[row]; \n");
+
+ source.append(" unsigned int row_start = row_indices[row]; \n");
+ source.append(" unsigned int row_stop = row_indices[row + 1]; \n");
+ source.append(" for (unsigned int entry_index = row_start + get_local_id(0); entry_index < row_stop; entry_index += get_local_size(0)) \n");
+ source.append(" { \n");
+ source.append(" unsigned int col_index = column_indices[entry_index]; \n");
+ source.append(" if (col_index > row) \n");
+ source.append(" vector[col_index] -= result_entry * elements[entry_index]; \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+}
+
+template<typename StringT>
+void generate_compressed_matrix_unit_lu_backward(StringT & source, std::string const & numeric_string)
+{
+
+ // compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format
+ source.append("__kernel void unit_lu_backward( \n");
+ source.append(" __global const unsigned int * row_indices, \n");
+ source.append(" __global const unsigned int * column_indices, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
+ source.append(" unsigned int size) \n");
+ source.append("{ \n");
+ source.append(" __local unsigned int col_index_buffer[128]; \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");
+
+ source.append(" unsigned int nnz = row_indices[size]; \n");
+ source.append(" unsigned int current_row = size-1; \n");
+ source.append(" unsigned int row_at_window_start = size-1; \n");
+ source.append(" "); source.append(numeric_string); source.append(" current_vector_entry = vector[size-1]; \n");
+ source.append(" unsigned int loop_end = ( (nnz - 1) / get_local_size(0)) * get_local_size(0); \n");
+ source.append(" unsigned int next_row = row_indices[size-1]; \n");
+
+ source.append(" unsigned int i = loop_end + get_local_id(0); \n");
+ source.append(" while (1) \n");
+ source.append(" { \n");
+ //load into shared memory (coalesced access):
+ source.append(" if (i < nnz) \n");
+ source.append(" { \n");
+ source.append(" element_buffer[get_local_id(0)] = elements[i]; \n");
+ source.append(" unsigned int tmp = column_indices[i]; \n");
+ source.append(" col_index_buffer[get_local_id(0)] = tmp; \n");
+ source.append(" vector_buffer[get_local_id(0)] = vector[tmp]; \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ //now a single thread does the remaining work in shared memory:
+ source.append(" if (get_local_id(0) == 0) \n");
+ source.append(" { \n");
+ // traverse through all the loaded data from back to front:
+ source.append(" for (unsigned int k2=0; k2<get_local_size(0); ++k2) \n");
+ source.append(" { \n");
+ source.append(" unsigned int k = (get_local_size(0) - k2) - 1; \n");
+
+ source.append(" if (i+k >= nnz) \n");
+ source.append(" continue; \n");
+
+ source.append(" if (col_index_buffer[k] > row_at_window_start) \n"); //use recently computed results
+ source.append(" current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
+ source.append(" else if (col_index_buffer[k] > current_row) \n"); //use buffered data
+ source.append(" current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
+
+ source.append(" if (i+k == next_row) \n"); //current row is finished. Write back result
+ source.append(" { \n");
+ source.append(" vector[current_row] = current_vector_entry; \n");
+ source.append(" if (current_row > 0) \n"); //load next row's data
+ source.append(" { \n");
+ source.append(" --current_row; \n");
+ source.append(" next_row = row_indices[current_row]; \n");
+ source.append(" current_vector_entry = vector[current_row]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+
+ source.append(" } \n"); // for k
+
+ source.append(" row_at_window_start = current_row; \n");
+ source.append(" } \n"); // if (get_local_id(0) == 0)
+
+ source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
+
+ source.append(" if (i < get_local_size(0)) \n");
+ source.append(" break; \n");
+
+ source.append(" i -= get_local_size(0); \n");
+ source.append(" } \n"); //for i
+ source.append("} \n");
+
+}
+
+template<typename StringT>
+void generate_compressed_matrix_unit_lu_forward(StringT & source, std::string const & numeric_string)
+{
+
+ // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
+ source.append("__kernel void unit_lu_forward( \n");
+ source.append(" __global const unsigned int * row_indices, \n");
+ source.append(" __global const unsigned int * column_indices, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
+ source.append(" unsigned int size) \n");
+ source.append("{ \n");
+ source.append(" __local unsigned int col_index_buffer[128]; \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");
+
+ source.append(" unsigned int nnz = row_indices[size]; \n");
+ source.append(" unsigned int current_row = 0; \n");
+ source.append(" unsigned int row_at_window_start = 0; \n");
+ source.append(" "); source.append(numeric_string); source.append(" current_vector_entry = vector[0]; \n");
+ source.append(" unsigned int loop_end = (nnz / get_local_size(0) + 1) * get_local_size(0); \n");
+ source.append(" unsigned int next_row = row_indices[1]; \n");
+
+ source.append(" for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
+ source.append(" { \n");
+ //load into shared memory (coalesced access):
+ source.append(" if (i < nnz) \n");
+ source.append(" { \n");
+ source.append(" element_buffer[get_local_id(0)] = elements[i]; \n");
+ source.append(" unsigned int tmp = column_indices[i]; \n");
+ source.append(" col_index_buffer[get_local_id(0)] = tmp; \n");
+ source.append(" vector_buffer[get_local_id(0)] = vector[tmp]; \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ //now a single thread does the remaining work in shared memory:
+ source.append(" if (get_local_id(0) == 0) \n");
+ source.append(" { \n");
+ // traverse through all the loaded data:
+ source.append(" for (unsigned int k=0; k<get_local_size(0); ++k) \n");
+ source.append(" { \n");
+ source.append(" if (i+k == next_row) \n"); //current row is finished. Write back result
+ source.append(" { \n");
+ source.append(" vector[current_row] = current_vector_entry; \n");
+ source.append(" ++current_row; \n");
+ source.append(" if (current_row < size) //load next row's data \n");
+ source.append(" { \n");
+ source.append(" next_row = row_indices[current_row+1]; \n");
+ source.append(" current_vector_entry = vector[current_row]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ source.append(" if (current_row < size && col_index_buffer[k] < current_row) \n"); //substitute
+ source.append(" { \n");
+ source.append(" if (col_index_buffer[k] < row_at_window_start) \n"); //use recently computed results
+ source.append(" current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
+ source.append(" else if (col_index_buffer[k] < current_row) \n"); //use buffered data
+ source.append(" current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
+ source.append(" } \n");
+
+ source.append(" } \n"); // for k
+
+ source.append(" row_at_window_start = current_row; \n");
+ source.append(" } \n"); // if (get_local_id(0) == 0)
+
+ source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
+ source.append(" } //for i \n");
+ source.append("} \n");
+
+}
+
+template<typename StringT>
+void generate_compressed_matrix_vec_mul_nvidia(StringT & source, std::string const & numeric_string, unsigned int subwarp_size, bool with_alpha_beta)
+{
+ std::stringstream ss;
+ ss << subwarp_size;
+
+ if (with_alpha_beta)
+ source.append("__kernel void vec_mul_nvidia_alpha_beta( \n");
+ else
+ source.append("__kernel void vec_mul_nvidia( \n");
+ source.append(" __global const unsigned int * row_indices, \n");
+ source.append(" __global const unsigned int * column_indices, \n");
+ source.append(" __global const unsigned int * row_blocks, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" unsigned int num_blocks, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * x, \n");
+ source.append(" uint4 layout_x, \n");
+ if (with_alpha_beta) { source.append(" "); source.append(numeric_string); source.append(" alpha, \n"); }
+ source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
+ source.append(" uint4 layout_result \n");
+ if (with_alpha_beta) { source.append(" , "); source.append(numeric_string); source.append(" beta \n"); }
+ source.append(") { \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" shared_elements[256]; \n");
+
+ source.append(" const unsigned int id_in_row = get_local_id(0) % " + ss.str() + "; \n");
+ source.append(" const unsigned int block_increment = get_local_size(0) * ((layout_result.z - 1) / (get_global_size(0)) + 1); \n");
+ source.append(" const unsigned int block_start = get_group_id(0) * block_increment; \n");
+ source.append(" const unsigned int block_stop = min(block_start + block_increment, layout_result.z); \n");
+
+ source.append(" for (unsigned int row = block_start + get_local_id(0) / " + ss.str() + "; \n");
+ source.append(" row < block_stop; \n");
+ source.append(" row += get_local_size(0) / " + ss.str() + ") \n");
+ source.append(" { \n");
+ source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
+ source.append(" unsigned int row_end = row_indices[row+1]; \n");
+ source.append(" for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += " + ss.str() + ") \n");
+ source.append(" dot_prod += elements[i] * x[column_indices[i] * layout_x.y + layout_x.x]; \n");
+
+ source.append(" shared_elements[get_local_id(0)] = dot_prod; \n");
+ source.append(" #pragma unroll \n");
+ source.append(" for (unsigned int k = 1; k < " + ss.str() + "; k *= 2) \n");
+ source.append(" shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) ^ k]; \n");
+
+ source.append(" if (id_in_row == 0) \n");
+ if (with_alpha_beta)
+ source.append(" result[row * layout_result.y + layout_result.x] = alpha * shared_elements[get_local_id(0)] + ((beta != 0) ? beta * result[row * layout_result.y + layout_result.x] : 0); \n");
+ else
+ source.append(" result[row * layout_result.y + layout_result.x] = shared_elements[get_local_id(0)]; \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+}
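The per-subwarp reduction above uses the XOR (butterfly) pattern: after log2(subwarp_size) steps every slot within a subwarp-sized block holds the block sum, so no barriers are needed within a warp. A stand-alone sketch that emulates the lock-step execution with a copy per step, assuming a power-of-two subwarp_size and a buffer whose length is a multiple of it (illustrative only):

  #include <vector>
  #include <cstddef>

  // After the call, every slot of each subwarp_size-sized block holds that block's sum.
  // Requires: subwarp_size is a power of two, shared.size() is a multiple of subwarp_size.
  void butterfly_reduce(std::vector<double>& shared, std::size_t subwarp_size)
  {
    for (std::size_t k = 1; k < subwarp_size; k *= 2)
    {
      std::vector<double> next(shared);                 // emulate lock-step SIMD execution
      for (std::size_t tid = 0; tid < shared.size(); ++tid)
        next[tid] = shared[tid] + shared[tid ^ k];      // partner stays inside the same block
      shared.swap(next);
    }
  }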
+
+template<typename StringT>
+void generate_compressed_matrix_vec_mul(StringT & source, std::string const & numeric_string, bool with_alpha_beta)
+{
+ if (with_alpha_beta)
+ source.append("__kernel void vec_mul_alpha_beta( \n");
+ else
+ source.append("__kernel void vec_mul( \n");
+ source.append(" __global const unsigned int * row_indices, \n");
+ source.append(" __global const unsigned int * column_indices, \n");
+ source.append(" __global const unsigned int * row_blocks, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" unsigned int num_blocks, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * x, \n");
+ source.append(" uint4 layout_x, \n");
+ if (with_alpha_beta) { source.append(" "); source.append(numeric_string); source.append(" alpha, \n"); }
+ source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
+ source.append(" uint4 layout_result \n");
+ if (with_alpha_beta) { source.append(" , "); source.append(numeric_string); source.append(" beta \n"); }
+ source.append(") { \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" shared_elements[1024]; \n");
+
+ source.append(" unsigned int row_start = row_blocks[get_group_id(0)]; \n");
+ source.append(" unsigned int row_stop = row_blocks[get_group_id(0) + 1]; \n");
+ source.append(" unsigned int rows_to_process = row_stop - row_start; \n");
+ source.append(" unsigned int element_start = row_indices[row_start]; \n");
+ source.append(" unsigned int element_stop = row_indices[row_stop]; \n");
+
+ source.append(" if (rows_to_process > 4) { \n"); // CSR stream
+ // load to shared buffer:
+ source.append(" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
+ source.append(" shared_elements[i - element_start] = elements[i] * x[column_indices[i] * layout_x.y + layout_x.x]; \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ // use one thread per row to sum:
+ source.append(" for (unsigned int row = row_start + get_local_id(0); row < row_stop; row += get_local_size(0)) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
+ source.append(" unsigned int thread_row_start = row_indices[row] - element_start; \n");
+ source.append(" unsigned int thread_row_stop = row_indices[row + 1] - element_start; \n");
+ source.append(" for (unsigned int i = thread_row_start; i < thread_row_stop; ++i) \n");
+ source.append(" dot_prod += shared_elements[i]; \n");
+ if (with_alpha_beta)
+ source.append(" result[row * layout_result.y + layout_result.x] = alpha * dot_prod + ((beta != 0) ? beta * result[row * layout_result.y + layout_result.x] : 0); \n");
+ else
+ source.append(" result[row * layout_result.y + layout_result.x] = dot_prod; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ // use multiple threads for the summation
+ source.append(" else if (rows_to_process > 1) \n"); // CSR stream with local reduction
+ source.append(" {\n");
+ // load to shared buffer:
+ source.append(" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0))\n");
+ source.append(" shared_elements[i - element_start] = elements[i] * x[column_indices[i] * layout_x.y + layout_x.x];\n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ // sum each row separately using a reduction:
+ source.append(" for (unsigned int row = row_start; row < row_stop; ++row)\n");
+ source.append(" {\n");
+ source.append(" unsigned int current_row_start = row_indices[row] - element_start;\n");
+ source.append(" unsigned int current_row_stop = row_indices[row + 1] - element_start;\n");
+ source.append(" unsigned int thread_base_id = current_row_start + get_local_id(0);\n");
+
+ // sum whatever exceeds the current buffer:
+ source.append(" for (unsigned int j = thread_base_id + get_local_size(0); j < current_row_stop; j += get_local_size(0))\n");
+ source.append(" shared_elements[thread_base_id] += shared_elements[j];\n");
+
+ // reduction
+ source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n");
+ source.append(" {\n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE);\n");
+ source.append(" if (get_local_id(0) < stride && thread_base_id < current_row_stop)\n");
+ source.append(" shared_elements[thread_base_id] += (thread_base_id + stride < current_row_stop) ? shared_elements[thread_base_id+stride] : 0;\n");
+ source.append(" }\n");
+ source.append(" "); source.append(numeric_string); source.append(" row_result = 0; \n");
+ source.append(" if (current_row_stop > current_row_start)\n");
+ source.append(" row_result = shared_elements[current_row_start]; \n");
+ source.append(" if (get_local_id(0) == 0)\n");
+ if (with_alpha_beta)
+ source.append(" result[row * layout_result.y + layout_result.x] = alpha * row_result + ((beta != 0) ? beta * result[row * layout_result.y + layout_result.x] : 0);\n");
+ else
+ source.append(" result[row * layout_result.y + layout_result.x] = row_result;\n");
+ source.append(" }\n");
+ source.append(" }\n");
+
+
+ source.append(" else \n"); // CSR vector for a single row
+ source.append(" { \n");
+ // load and sum to shared buffer:
+ source.append(" shared_elements[get_local_id(0)] = 0; \n");
+ source.append(" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
+ source.append(" shared_elements[get_local_id(0)] += elements[i] * x[column_indices[i] * layout_x.y + layout_x.x]; \n");
+
+ // reduction to obtain final result
+ source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) \n");
+ source.append(" shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) + stride]; \n");
+ source.append(" } \n");
+
+ source.append(" if (get_local_id(0) == 0) \n");
+ if (with_alpha_beta)
+ source.append(" result[row_start * layout_result.y + layout_result.x] = alpha * shared_elements[0] + ((beta != 0) ? beta * result[row_start * layout_result.y + layout_result.x] : 0); \n");
+ else
+ source.append(" result[row_start * layout_result.y + layout_result.x] = shared_elements[0]; \n");
+ source.append(" } \n");
+
+ source.append("} \n");
+
+}
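Semantically, the alpha/beta variant computes result = alpha * A * x + beta * result, skipping the read of result when beta is zero, exactly as in the write-back lines above. A sequential host-side reference of that computation over a CSR matrix (illustrative only; the kernel itself picks one of three strategies per row block):

  #include <vector>
  #include <cstddef>

  // result = alpha * A * x + beta * result, A in CSR format.
  void csr_spmv_alpha_beta(const std::vector<unsigned>& row_ptr,
                           const std::vector<unsigned>& col_idx,
                           const std::vector<double>& values,
                           double alpha, const std::vector<double>& x,
                           double beta, std::vector<double>& result)
  {
    for (std::size_t row = 0; row + 1 < row_ptr.size(); ++row)
    {
      double dot = 0;
      for (unsigned i = row_ptr[row]; i < row_ptr[row + 1]; ++i)
        dot += values[i] * x[col_idx[i]];
      result[row] = alpha * dot + ((beta != 0) ? beta * result[row] : 0);
    }
  }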
+
+
+template<typename StringT>
+void generate_compressed_matrix_vec_mul4(StringT & source, std::string const & numeric_string, bool with_alpha_beta)
+{
+ if (with_alpha_beta)
+ source.append("__kernel void vec_mul4_alpha_beta( \n");
+ else
+ source.append("__kernel void vec_mul4( \n");
+ source.append(" __global const unsigned int * row_indices, \n");
+ source.append(" __global const uint4 * column_indices, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append("4 * elements, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * x, \n");
+ source.append(" uint4 layout_x, \n");
+ if (with_alpha_beta) { source.append(" "); source.append(numeric_string); source.append(" alpha, \n"); }
+ source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
+ source.append(" uint4 layout_result \n");
+ if (with_alpha_beta) { source.append(" , "); source.append(numeric_string); source.append(" beta \n"); }
+ source.append(") { \n");
+ source.append(" "); source.append(numeric_string); source.append(" dot_prod; \n");
+ source.append(" unsigned int start, next_stop; \n");
+ source.append(" uint4 col_idx; \n");
+ source.append(" "); source.append(numeric_string); source.append("4 tmp_vec; \n");
+ source.append(" "); source.append(numeric_string); source.append("4 tmp_entries; \n");
+
+ source.append(" for (unsigned int row = get_global_id(0); row < layout_result.z; row += get_global_size(0)) \n");
+ source.append(" { \n");
+ source.append(" dot_prod = 0; \n");
+ source.append(" start = row_indices[row] / 4; \n");
+ source.append(" next_stop = row_indices[row+1] / 4; \n");
+
+ source.append(" for (unsigned int i = start; i < next_stop; ++i) \n");
+ source.append(" { \n");
+ source.append(" col_idx = column_indices[i]; \n");
+
+ source.append(" tmp_entries = elements[i]; \n");
+ source.append(" tmp_vec.x = x[col_idx.x * layout_x.y + layout_x.x]; \n");
+ source.append(" tmp_vec.y = x[col_idx.y * layout_x.y + layout_x.x]; \n");
+ source.append(" tmp_vec.z = x[col_idx.z * layout_x.y + layout_x.x]; \n");
+ source.append(" tmp_vec.w = x[col_idx.w * layout_x.y + layout_x.x]; \n");
+
+ source.append(" dot_prod += dot(tmp_entries, tmp_vec); \n");
+ source.append(" } \n");
+ if (with_alpha_beta)
+ source.append(" result[row * layout_result.y + layout_result.x] = alpha * dot_prod + ((beta != 0) ? beta * result[row * layout_result.y + layout_result.x] : 0); \n");
+ else
+ source.append(" result[row * layout_result.y + layout_result.x] = dot_prod; \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
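
The packed variant above addresses column_indices and elements as uint4 / 4-wide value blocks, which is why the CSR row pointers are divided by 4: the layout presupposes that every row's nonzero storage is padded so that row boundaries fall on multiples of 4. For example, row pointers 0, 8, 20 describe a first row occupying blocks 0-1 (8 packed entries) and a second row occupying blocks 2-4 (12 packed entries). The vec_mul8 generator below applies the same idea with 8-wide blocks.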
+
+template<typename StringT>
+void generate_compressed_matrix_vec_mul8(StringT & source, std::string const & numeric_string, bool with_alpha_beta)
+{
+ if (with_alpha_beta)
+ source.append("__kernel void vec_mul8_alpha_beta( \n");
+ else
+ source.append("__kernel void vec_mul8( \n");
+ source.append(" __global const unsigned int * row_indices, \n");
+ source.append(" __global const uint8 * column_indices, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append("8 * elements, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * x, \n");
+ source.append(" uint4 layout_x, \n");
+ if (with_alpha_beta) { source.append(" "); source.append(numeric_string); source.append(" alpha, \n"); }
+ source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
+ source.append(" uint4 layout_result \n");
+ if (with_alpha_beta) { source.append(" , "); source.append(numeric_string); source.append(" beta \n"); }
+ source.append(") { \n");
+ source.append(" "); source.append(numeric_string); source.append(" dot_prod; \n");
+ source.append(" unsigned int start, next_stop; \n");
+ source.append(" uint8 col_idx; \n");
+ source.append(" "); source.append(numeric_string); source.append("8 tmp_vec; \n");
+ source.append(" "); source.append(numeric_string); source.append("8 tmp_entries; \n");
+
+ source.append(" for (unsigned int row = get_global_id(0); row < layout_result.z; row += get_global_size(0)) \n");
+ source.append(" { \n");
+ source.append(" dot_prod = 0; \n");
+ source.append(" start = row_indices[row] / 8; \n");
+ source.append(" next_stop = row_indices[row+1] / 8; \n");
+
+ source.append(" for (unsigned int i = start; i < next_stop; ++i) \n");
+ source.append(" { \n");
+ source.append(" col_idx = column_indices[i]; \n");
+
+ source.append(" tmp_entries = elements[i]; \n");
+ source.append(" tmp_vec.s0 = x[col_idx.s0 * layout_x.y + layout_x.x]; \n");
+ source.append(" tmp_vec.s1 = x[col_idx.s1 * layout_x.y + layout_x.x]; \n");
+ source.append(" tmp_vec.s2 = x[col_idx.s2 * layout_x.y + layout_x.x]; \n");
+ source.append(" tmp_vec.s3 = x[col_idx.s3 * layout_x.y + layout_x.x]; \n");
+ source.append(" tmp_vec.s4 = x[col_idx.s4 * layout_x.y + layout_x.x]; \n");
+ source.append(" tmp_vec.s5 = x[col_idx.s5 * layout_x.y + layout_x.x]; \n");
+ source.append(" tmp_vec.s6 = x[col_idx.s6 * layout_x.y + layout_x.x]; \n");
+ source.append(" tmp_vec.s7 = x[col_idx.s7 * layout_x.y + layout_x.x]; \n");
+
+ source.append(" dot_prod += dot(tmp_entries.lo, tmp_vec.lo); \n");
+ source.append(" dot_prod += dot(tmp_entries.hi, tmp_vec.hi); \n");
+ source.append(" } \n");
+ if (with_alpha_beta)
+ source.append(" result[row * layout_result.y + layout_result.x] = alpha * dot_prod + ((beta != 0) ? beta * result[row * layout_result.y + layout_result.x] : 0); \n");
+ else
+ source.append(" result[row * layout_result.y + layout_result.x] = dot_prod; \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+template<typename StringT>
+void generate_compressed_matrix_vec_mul_cpu(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void vec_mul_cpu( \n");
+ source.append(" __global const unsigned int * row_indices, \n");
+ source.append(" __global const unsigned int * column_indices, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * vector, \n");
+ source.append(" "); source.append(numeric_string); source.append(" alpha, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
+ source.append(" "); source.append(numeric_string); source.append(" beta, \n");
+ source.append(" unsigned int size) \n");
+ source.append("{ \n");
+ source.append(" unsigned int work_per_item = max((uint) (size / get_global_size(0)), (uint) 1); \n");
+ source.append(" unsigned int row_start = get_global_id(0) * work_per_item; \n");
+ source.append(" unsigned int row_stop = min( (uint) ((get_global_id(0) + 1) * work_per_item), (uint) size); \n");
+ source.append(" for (unsigned int row = row_start; row < row_stop; ++row) \n");
+ source.append(" { \n");
+ source.append(" "); source.append(numeric_string); source.append(" dot_prod = ("); source.append(numeric_string); source.append(")0; \n");
+ source.append(" unsigned int row_end = row_indices[row+1]; \n");
+ source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
+ source.append(" dot_prod += elements[i] * vector[column_indices[i]]; \n");
+ source.append(" result[row] = alpha * dot_prod + ((beta != 0) ? beta * result[row] : 0); \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+}
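
All of the vec_mul* kernels assembled above realize the same CSR product result = alpha * A * x + beta * result and differ only in how rows are mapped to threads and how the nonzeros are packed. For reference, a minimal serial C++ sketch of that semantics (illustrative only; the function and argument names are not part of this commit, and the layout_x / layout_result start-and-stride handling of the GPU kernels is omitted):

    #include <cstddef>
    #include <vector>

    // Reference CSR sparse matrix-vector product: y = alpha * A * x + beta * y.
    // row_indices holds rows+1 offsets into column_indices/elements.
    template<typename T>
    void csr_spmv_reference(std::vector<unsigned int> const & row_indices,
                            std::vector<unsigned int> const & column_indices,
                            std::vector<T>            const & elements,
                            std::vector<T>            const & x,
                            T alpha, T beta,
                            std::vector<T>                  & y)
    {
      for (std::size_t row = 0; row + 1 < row_indices.size(); ++row)
      {
        T dot_prod = 0;
        for (unsigned int i = row_indices[row]; i < row_indices[row + 1]; ++i)
          dot_prod += elements[i] * x[column_indices[i]];
        y[row] = alpha * dot_prod + ((beta != 0) ? beta * y[row] : T(0));
      }
    }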
+
+
+
+/** @brief OpenCL kernel for the first stage of sparse matrix-matrix multiplication.
+ *
+ * Each work group derives the maximum number of nonzero entries per row of A over the rows it processes.
+ **/
+template<typename StringT>
+void generate_compressed_matrix_compressed_matrix_prod_1(StringT & source)
+{
+ source.append("__kernel void spgemm_stage1( \n");
+ source.append(" __global const unsigned int * A_row_indices, \n");
+ source.append(" __global const unsigned int * A_column_indices, \n");
+ source.append(" unsigned int A_size1, \n");
+ source.append(" __global unsigned int * group_nnz_array) \n");
+ source.append("{ \n");
+ source.append(" unsigned int work_per_item = max((uint) ((A_size1 - 1) / get_global_size(0) + 1), (uint) 1); \n");
+ source.append(" unsigned int row_start = get_global_id(0) * work_per_item; \n");
+ source.append(" unsigned int row_stop = min( (uint) ((get_global_id(0) + 1) * work_per_item), (uint) A_size1); \n");
+ source.append(" unsigned int max_A_nnz = 0; \n");
+ source.append(" for (unsigned int row = row_start; row < row_stop; ++row) \n");
+ source.append(" max_A_nnz = max(max_A_nnz, A_row_indices[row + 1] - A_row_indices[row]); \n");
+
+  // store each work item's maximum in the shared buffer:
+ source.append(" __local unsigned int shared_nnz[256]; \n");
+ source.append(" shared_nnz[get_local_id(0)] = max_A_nnz; \n");
+
+ // reduction to obtain final result
+ source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) \n");
+ source.append(" shared_nnz[get_local_id(0)] = max(shared_nnz[get_local_id(0)], shared_nnz[get_local_id(0) + stride]); \n");
+ source.append(" } \n");
+
+ source.append(" if (get_local_id(0) == 0) \n");
+ source.append(" group_nnz_array[get_group_id(0)] = shared_nnz[0]; \n");
+ source.append("} \n");
+}
+
+
+/** @brief OpenCL kernel for decomposing A in C = A * B, such that A = A_2 * G_1 with G_1 containing at most 32 nonzeros per row
+ *
+ * Needed for the RMerge split stage.
+ **/
+template<typename StringT>
+void generate_compressed_matrix_compressed_matrix_prod_decompose_1(StringT & source)
+{
+ source.append("__kernel void spgemm_decompose_1( \n");
+ source.append(" __global const unsigned int * A_row_indices, \n");
+ source.append(" unsigned int A_size1, \n");
+ source.append(" unsigned int max_per_row, \n");
+ source.append(" __global unsigned int * chunks_per_row) \n");
+ source.append("{ \n");
+ source.append(" for (unsigned int row = get_global_id(0); row < A_size1; row += get_global_size(0)) {\n");
+ source.append(" unsigned int num_entries = A_row_indices[row+1] - A_row_indices[row]; \n");
+ source.append(" chunks_per_row[row] = (num_entries < max_per_row) ? 1 : ((num_entries - 1) / max_per_row + 1); \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
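
The chunk count written to chunks_per_row is a ceiling division: with max_per_row = 32, a row holding 70 nonzeros yields (70 - 1) / 32 + 1 = 3 chunks (32 + 32 + 6 entries), while any row with fewer than 32 entries, including an empty one, counts as exactly one chunk.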
+
+
+/** @brief OpenCL kernel for filling A_2 in the decomposition A = A_2 * G_1, with G_1 containing at most 32 nonzeros per row
+ *
+ * Needed for the RMerge split stage.
+ **/
+template<typename StringT>
+void generate_compressed_matrix_compressed_matrix_prod_A2(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void spgemm_A2( \n");
+ source.append(" __global unsigned int *A2_row_indices, \n");
+ source.append(" __global unsigned int *A2_col_indices, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *A2_elements, \n");
+ source.append(" unsigned int A2_size1, \n");
+ source.append(" __global const unsigned int *new_row_buffer) \n");
+ source.append("{ \n");
+ source.append(" for (unsigned int i = get_global_id(0); i < A2_size1; i += get_global_size(0)) {\n");
+ source.append(" unsigned int index_start = new_row_buffer[i]; \n");
+ source.append(" unsigned int index_stop = new_row_buffer[i+1]; \n");
+
+ source.append(" A2_row_indices[i] = index_start; \n");
+
+ source.append(" for (unsigned int j = index_start; j < index_stop; ++j) { \n");
+ source.append(" A2_col_indices[j] = j; \n");
+ source.append(" A2_elements[j] = 1; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ source.append(" if (get_global_id(0) == 0) \n");
+ source.append(" A2_row_indices[A2_size1] = new_row_buffer[A2_size1]; \n");
+ source.append("} \n");
+}
+
+/** @brief OpenCL kernel for filling G_1 in the decomposition A = A_2 * G_1, with G_1 containing at most 32 nonzeros per row
+ *
+ * Needed for the RMerge split stage.
+ **/
+template<typename StringT>
+void generate_compressed_matrix_compressed_matrix_prod_G1(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void spgemm_G1( \n");
+ source.append(" __global unsigned int *G1_row_indices, \n");
+ source.append(" __global unsigned int *G1_col_indices, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *G1_elements, \n");
+ source.append(" unsigned int G1_size1, \n");
+ source.append(" __global const unsigned int *A_row_indices, \n");
+ source.append(" __global const unsigned int *A_col_indices, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" *A_elements, \n");
+ source.append(" unsigned int A_size1, \n");
+ source.append(" unsigned int A_nnz, \n");
+ source.append(" unsigned int max_per_row, \n");
+ source.append(" __global const unsigned int *new_row_buffer) \n");
+ source.append("{ \n");
+
+ // Part 1: Copy column indices and entries:
+ source.append(" for (unsigned int i = get_global_id(0); i < A_nnz; i += get_global_size(0)) {\n");
+ source.append(" G1_col_indices[i] = A_col_indices[i]; \n");
+ source.append(" G1_elements[i] = A_elements[i]; \n");
+ source.append(" } \n");
+
+ // Part 2: Derive new row indices:
+ source.append(" for (unsigned int i = get_global_id(0); i < A_size1; i += get_global_size(0)) {\n");
+ source.append(" unsigned int old_start = A_row_indices[i]; \n");
+ source.append(" unsigned int new_start = new_row_buffer[i]; \n");
+ source.append(" unsigned int row_chunks = new_row_buffer[i+1] - new_start; \n");
+
+ source.append(" for (unsigned int j=0; j<row_chunks; ++j) \n");
+ source.append(" G1_row_indices[new_start + j] = old_start + j * max_per_row; \n");
+ source.append(" } \n");
+
+ // write last entry in row_buffer with thread 0:
+ source.append(" if (get_global_id(0) == 0) \n");
+ source.append(" G1_row_indices[G1_size1] = A_row_indices[A_size1]; \n");
+ source.append("} \n");
+}
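
Continuing the 70-nonzero example from the decomposition stage above: if that row of A starts at old_start = 100 and is split into three chunks, the corresponding G_1 row pointers become 100, 132 and 164, so the three G_1 rows hold 32, 32 and 6 entries and together reproduce the original row. The companion A_2 matrix generated earlier carries a single 1 for each of those chunk rows, so A_2 * G_1 sums them back into the original row of A.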
+
+
+
+/** @brief OpenCL kernel for the second stage of sparse matrix-matrix multiplication.
+ *
+ * Computes the exact sparsity pattern of A*B.
+ * Result array C_row_indices contains the number of nonzeros in each row.
+ **/
+template<typename StringT>
+void generate_compressed_matrix_compressed_matrix_prod_2(StringT & source)
+{
+ source.append("__attribute__((reqd_work_group_size(32, 1, 1))) \n");
+ source.append("__kernel void spgemm_stage2( \n");
+ source.append(" __global const unsigned int * A_row_indices, \n");
+ source.append(" __global const unsigned int * A_col_indices, \n");
+ source.append(" unsigned int A_size1, \n");
+ source.append(" __global const unsigned int * B_row_indices, \n");
+ source.append(" __global const unsigned int * B_col_indices, \n");
+ source.append(" unsigned int B_size2, \n");
+ source.append(" __global unsigned int * C_row_indices) \n");
+ source.append("{ \n");
+ source.append(" unsigned int work_per_group = max((uint) ((A_size1 - 1) / get_num_groups(0) + 1), (uint) 1); \n");
+ source.append(" unsigned int row_C_start = get_group_id(0) * work_per_group; \n");
+ source.append(" unsigned int row_C_stop = min( (uint) ((get_group_id(0) + 1) * work_per_group), (uint) A_size1); \n");
+ source.append(" __local unsigned int shared_front[32]; \n");
+
+ source.append(" for (unsigned int row_C = row_C_start; row_C < row_C_stop; ++row_C) \n");
+ source.append(" { \n");
+ source.append(" unsigned int row_A_start = A_row_indices[row_C]; \n");
+ source.append(" unsigned int row_A_end = A_row_indices[row_C+1]; \n");
+
+ source.append(" unsigned int my_row_B = row_A_start + get_local_id(0); \n");
+ source.append(" unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0; \n");
+ source.append(" unsigned int row_B_start = (my_row_B < row_A_end) ? B_row_indices[row_B_index] : 0; \n");
+ source.append(" unsigned int row_B_end = (my_row_B < row_A_end) ? B_row_indices[row_B_index + 1] : 0; \n");
+
+ source.append(" unsigned int num_nnz = 0; \n");
+    source.append("    unsigned int num_nnz = 0; \n");
+    source.append("    if (row_A_end - row_A_start > 1) { \n"); // rows of A with zero or one entry are handled faster in the else-branch
+
+ source.append(" unsigned int current_front_index = (row_B_start < row_B_end) ? B_col_indices[row_B_start] : B_size2; \n");
+ source.append(" while (1) { \n");
+
+ // determine minimum index via reduction:
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" shared_front[get_local_id(0)] = current_front_index; \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < 16) shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 16]); \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < 8) shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 8]); \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < 4) shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 4]); \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < 2) shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 2]); \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < 1) shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 1]); \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ source.append(" if (shared_front[0] == B_size2) break; \n");
+
+ // update front:
+ source.append(" if (current_front_index == shared_front[0]) { \n");
+ source.append(" ++row_B_start; \n");
+ source.append(" current_front_index = (row_B_start < row_B_end) ? B_col_indices[row_B_start] : B_size2; \n");
+ source.append(" } \n");
+
+ source.append(" ++num_nnz; \n");
+ source.append(" } \n");
+ source.append(" } else { num_nnz = row_B_end - row_B_start; }\n");
+
+ // write number of entries found:
+ source.append(" if (get_local_id(0) == 0) \n");
+ source.append(" C_row_indices[row_C] = num_nnz; \n");
+
+ source.append(" } \n");
+
+ source.append("} \n");
+
+}
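
The per-row loop above is a 32-wide parallel multi-way merge: each work item tracks a "front" (the next unvisited column index of its assigned row of B, with B_size2 serving as the exhausted sentinel), the group repeatedly reduces to the minimum front, and every distinct minimum contributes one nonzero to the row of C. The 32-wide work group matches the at-most-32-nonzeros-per-row guarantee established by the RMerge split above. A serial C++ sketch of the same counting idea (an illustrative helper, not part of this commit; column indices are assumed sorted ascending within each CSR row):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Counts the nonzeros of row 'row_A' of C = A * B by merging the rows of B
    // selected by the nonzero columns of that row of A.
    unsigned int count_row_nnz(std::vector<unsigned int> const & A_row_indices,
                               std::vector<unsigned int> const & A_col_indices,
                               std::vector<unsigned int> const & B_row_indices,
                               std::vector<unsigned int> const & B_col_indices,
                               unsigned int B_size2,
                               unsigned int row_A)
    {
      std::vector<unsigned int> front, stop;   // per selected B-row: current position and end
      for (unsigned int i = A_row_indices[row_A]; i < A_row_indices[row_A + 1]; ++i)
      {
        front.push_back(B_row_indices[A_col_indices[i]]);
        stop.push_back(B_row_indices[A_col_indices[i] + 1]);
      }

      unsigned int num_nnz = 0;
      while (true)
      {
        unsigned int min_index = B_size2;      // B_size2 acts as the 'exhausted' sentinel
        for (std::size_t k = 0; k < front.size(); ++k)
          if (front[k] < stop[k])
            min_index = std::min(min_index, B_col_indices[front[k]]);

        if (min_index == B_size2)              // all fronts exhausted
          break;

        for (std::size_t k = 0; k < front.size(); ++k)   // advance every front sitting on the minimum
          if (front[k] < stop[k] && B_col_indices[front[k]] == min_index)
            ++front[k];

        ++num_nnz;                             // one distinct column index of C found
      }
      return num_nnz;
    }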
+
+
+/** @brief OpenCL kernel for the third stage of sparse matrix-matrix multiplication.
+ *
+ * Computes A*B into C with known sparsity pattern (obtained from stages 1 and 2).
+ **/
+template<typename StringT>
+void generate_compressed_matrix_compressed_matrix_prod_3(StringT & source, std::string const & numeric_string)
+{
+ source.append("__attribute__((reqd_work_group_size(32, 1, 1))) \n");
+ source.append("__kernel void spgemm_stage3( \n");
+ source.append(" __global const unsigned int * A_row_indices, \n");
+ source.append(" __global const unsigned int * A_col_indices, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * A_elements, \n");
+ source.append(" unsigned int A_size1, \n");
+ source.append(" __global const unsigned int * B_row_indices, \n");
+ source.append(" __global const unsigned int * B_col_indices, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * B_elements, \n");
+ source.append(" unsigned int B_size2, \n");
+ source.append(" __global unsigned int * C_row_indices, \n");
+ source.append(" __global unsigned int * C_col_indices, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * C_elements) \n");
+ source.append("{ \n");
+ source.append(" unsigned int work_per_group = max((uint) ((A_size1 - 1) / get_num_groups(0) + 1), (uint) 1); \n");
+ source.append(" unsigned int row_C_start = get_group_id(0) * work_per_group; \n");
+ source.append(" unsigned int row_C_stop = min( (uint) ((get_group_id(0) + 1) * work_per_group), (uint) A_size1); \n");
+ source.append(" __local unsigned int shared_front[32]; \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" shared_front_values[32]; \n");
+ source.append(" unsigned int local_id = get_local_id(0); \n");
+
+ source.append(" for (unsigned int row_C = row_C_start; row_C < row_C_stop; ++row_C) \n");
+ source.append(" { \n");
+ source.append(" unsigned int row_A_start = A_row_indices[row_C]; \n");
+ source.append(" unsigned int row_A_end = A_row_indices[row_C+1]; \n");
+
+ source.append(" unsigned int my_row_B = row_A_start + ((row_A_end - row_A_start > 1) ? local_id : 0); \n"); // single row is a special case
+ source.append(" unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0; \n");
+ source.append(" unsigned int row_B_start = (my_row_B < row_A_end) ? B_row_indices[row_B_index] : 0; \n");
+ source.append(" unsigned int row_B_end = (my_row_B < row_A_end) ? B_row_indices[row_B_index + 1] : 0; \n");
+
+ source.append(" "); source.append(numeric_string); source.append(" val_A = (my_row_B < row_A_end) ? A_elements[my_row_B] : 0; \n");
+ source.append(" unsigned int index_in_C = C_row_indices[row_C] + local_id; \n");
+
+    source.append("    if (row_A_end - row_A_start > 1) { \n"); // rows of A with zero or one entry are handled faster in the else-branch
+
+ source.append(" unsigned int current_front_index = (row_B_start < row_B_end) ? B_col_indices[row_B_start] : B_size2; \n");
+ source.append(" "); source.append(numeric_string); source.append(" current_front_value = (row_B_start < row_B_end) ? B_elements[row_B_start] : 0; \n");
+
+ source.append(" unsigned int index_buffer = 0; \n");
+ source.append(" "); source.append(numeric_string); source.append(" value_buffer = 0; \n");
+ source.append(" unsigned int buffer_size = 0; \n");
+
+ source.append(" while (1) { \n");
+
+ // determine minimum index via reduction:
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" shared_front[local_id] = current_front_index; \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (local_id < 16) shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 16]); \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (local_id < 8) shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 8]); \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (local_id < 4) shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 4]); \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (local_id < 2) shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 2]); \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (local_id < 1) shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 1]); \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ source.append(" if (shared_front[0] == B_size2) break; \n");
+
+ // compute output value via reduction:
+ source.append(" shared_front_values[local_id] = (current_front_index == shared_front[0]) ? val_A * current_front_value : 0; \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (local_id < 16) shared_front_values[local_id] += shared_front_values[local_id + 16]; \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (local_id < 8) shared_front_values[local_id] += shared_front_values[local_id + 8]; \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (local_id < 4) shared_front_values[local_id] += shared_front_values[local_id + 4]; \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (local_id < 2) shared_front_values[local_id] += shared_front_values[local_id + 2]; \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (local_id < 1) shared_front_values[local_id] += shared_front_values[local_id + 1]; \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ // update front:
+ source.append(" if (current_front_index == shared_front[0]) { \n");
+ source.append(" ++row_B_start; \n");
+ source.append(" current_front_index = (row_B_start < row_B_end) ? B_col_indices[row_B_start] : B_size2; \n");
+ source.append(" current_front_value = (row_B_start < row_B_end) ? B_elements[row_B_start] : 0; \n");
+ source.append(" } \n");
+
+ // write current front to register buffer:
+ source.append(" index_buffer = (local_id == buffer_size) ? shared_front[0] : index_buffer; \n");
+ source.append(" value_buffer = (local_id == buffer_size) ? shared_front_values[0] : value_buffer; \n");
+ source.append(" ++buffer_size; \n");
+
+ // flush register buffer via a coalesced write once full:
+ source.append(" if (buffer_size == get_local_size(0)) { \n");
+ source.append(" C_col_indices[index_in_C] = index_buffer; \n");
+ source.append(" C_elements[index_in_C] = value_buffer; \n");
+ source.append(" } \n");
+
+ // the following should be in the previous if-conditional, but a bug in NVIDIA drivers 34x.yz requires this slight rewrite
+ source.append(" index_in_C += (buffer_size == get_local_size(0)) ? get_local_size(0) : 0; \n");
+ source.append(" buffer_size = (buffer_size == get_local_size(0)) ? 0 : buffer_size; \n");
+
+ source.append(" } \n");
+
+ // write remaining entries in register buffer:
+ source.append(" if (local_id < buffer_size) { \n");
+ source.append(" C_col_indices[index_in_C] = index_buffer; \n");
+ source.append(" C_elements[index_in_C] = value_buffer; \n");
+ source.append(" } \n");
+
+ // copy to C in coalesced manner:
+ source.append(" } else { \n");
+ source.append(" for (unsigned int i = row_B_start + local_id; i < row_B_end; i += get_local_size(0)) { \n");
+ source.append(" C_col_indices[index_in_C] = B_col_indices[i]; \n");
+ source.append(" C_elements[index_in_C] = val_A * B_elements[i]; \n");
+ source.append(" index_in_C += get_local_size(0); \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ source.append(" } \n");
+
+ source.append("} \n");
+
+}
+
+
+template<typename StringT>
+void generate_compressed_matrix_compressed_matrix_prod(StringT & source, std::string const & numeric_string)
+{
+ generate_compressed_matrix_compressed_matrix_prod_1(source);
+ generate_compressed_matrix_compressed_matrix_prod_decompose_1(source);
+ generate_compressed_matrix_compressed_matrix_prod_A2(source, numeric_string);
+ generate_compressed_matrix_compressed_matrix_prod_G1(source, numeric_string);
+ generate_compressed_matrix_compressed_matrix_prod_2(source);
+ generate_compressed_matrix_compressed_matrix_prod_3(source, numeric_string);
+}
+
+template<typename StringT>
+void generate_compressed_matrix_assign_to_dense(StringT & source, std::string const & numeric_string)
+{
+
+ source.append(" __kernel void assign_to_dense( \n");
+ source.append(" __global const unsigned int * row_indices, \n");
+ source.append(" __global const unsigned int * column_indices, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * B, \n");
+ source.append(" unsigned int B_row_start, \n");
+ source.append(" unsigned int B_col_start, \n");
+ source.append(" unsigned int B_row_inc, \n");
+ source.append(" unsigned int B_col_inc, \n");
+ source.append(" unsigned int B_row_size, \n");
+ source.append(" unsigned int B_col_size, \n");
+ source.append(" unsigned int B_internal_rows, \n");
+ source.append(" unsigned int B_internal_cols) { \n");
+
+ source.append(" for (unsigned int i = get_global_id(0); i < B_row_size; i += get_global_size(0)) \n");
+ source.append(" { \n");
+ source.append(" unsigned int row_end = row_indices[i+1]; \n");
+ source.append(" for (unsigned int j = row_indices[i]; j<row_end; j++) \n");
+ source.append(" { \n");
+ source.append(" B[(B_row_start + i * B_row_inc) * B_internal_cols + B_col_start + column_indices[j] * B_col_inc] = elements[j]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+}
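
With B_row_start = B_col_start = 0 and unit increments, the write above reduces to B[i * B_internal_cols + column_indices[j]] = elements[j], i.e. a plain scatter of the CSR entries into a row-major dense matrix. The start/increment parameters merely allow the target to be a strided sub-matrix of a larger buffer; positions of B not covered by a nonzero keep their previous values.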
+
+
+//////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for compressed_matrix (except solvers). */
+template<typename NumericT>
+struct compressed_matrix
+{
+ static std::string program_name()
+ {
+ return viennacl::ocl::type_to_string<NumericT>::apply() + "_compressed_matrix";
+ }
+
+ static void init(viennacl::ocl::context & ctx)
+ {
+ static std::map<cl_context, bool> init_done;
+ if (!init_done[ctx.handle().get()])
+ {
+ viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+ std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+ std::string source;
+ source.reserve(1024);
+
+ viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+ if (numeric_string == "float" || numeric_string == "double")
+ {
+ generate_compressed_matrix_jacobi(source, numeric_string);
+ }
+ generate_compressed_matrix_dense_matrix_multiplication(source, numeric_string);
+ generate_compressed_matrix_row_info_extractor(source, numeric_string);
+ if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
+ {
+ generate_compressed_matrix_vec_mul_nvidia(source, numeric_string, 16, true);
+ generate_compressed_matrix_vec_mul_nvidia(source, numeric_string, 16, false);
+ }
+ gene
<TRUNCATED>
[13/51] [partial] mahout git commit: (nojira) add native-viennaCL
module to codebase. closes apache/mahout#241
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/iterative_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/iterative_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/iterative_operations.hpp
new file mode 100644
index 0000000..b350fe0
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/iterative_operations.hpp
@@ -0,0 +1,945 @@
+#ifndef VIENNACL_LINALG_OPENCL_ITERATIVE_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_ITERATIVE_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/opencl/iterative_operations.hpp
+ @brief Implementations of specialized kernels for fast iterative solvers using OpenCL
+*/
+
+#include <cmath>
+
+#include "viennacl/forwards.h"
+#include "viennacl/detail/vector_def.hpp"
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/opencl/common.hpp"
+#include "viennacl/linalg/opencl/kernels/iterative.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+
+template<typename NumericT>
+void pipelined_cg_vector_update(vector_base<NumericT> & result,
+ NumericT alpha,
+ vector_base<NumericT> & p,
+ vector_base<NumericT> & r,
+ vector_base<NumericT> const & Ap,
+ NumericT beta,
+ vector_base<NumericT> & inner_prod_buffer)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(result).context());
+ viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "cg_vector_update");
+ cl_uint vec_size = cl_uint(viennacl::traits::size(result));
+
+ k.local_work_size(0, 128);
+ k.global_work_size(0, 128*128);
+
+ if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
+ {
+ k.local_work_size(0, 256);
+ k.global_work_size(0, 256*256);
+ }
+
+ viennacl::ocl::enqueue(k(result, alpha, p, r, Ap, beta, inner_prod_buffer, vec_size, viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))));
+}
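
The 'cg_vector_update' kernel fetched here fuses the classical CG vector updates with the first reduction stage of the next inner product; its OpenCL body is defined in viennacl/linalg/opencl/kernels/iterative.hpp and is not shown in this part of the commit. As a sketch of the pipelined-CG scheme (argument order as above; an unfused host-side reference, not a verbatim copy of the kernel):

    #include <cstddef>
    #include <vector>

    // Unfused reference for one pipelined-CG vector update; the GPU kernel additionally
    // writes per-work-group partial sums of <r, r> into inner_prod_buffer.
    template<typename T>
    void cg_vector_update_reference(std::vector<T> & x, T alpha,
                                    std::vector<T> & p,
                                    std::vector<T> & r,
                                    std::vector<T> const & Ap,
                                    T beta,
                                    T & r_dot_r)
    {
      r_dot_r = 0;
      for (std::size_t i = 0; i < x.size(); ++i)
      {
        x[i] += alpha * p[i];          // x_{k+1} = x_k + alpha * p_k
        r[i] -= alpha * Ap[i];         // r_{k+1} = r_k - alpha * A * p_k
        p[i]  = r[i] + beta * p[i];    // p_{k+1} = r_{k+1} + beta * p_k
        r_dot_r += r[i] * r[i];        // reduction the GPU kernel starts on the fly
      }
    }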
+
+template<typename NumericT>
+void pipelined_cg_prod(compressed_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> & inner_prod_buffer)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);
+
+ bool use_nvidia_blocked = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id && (double(A.nnz()) / double(A.size1()) > 12.0));
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), use_nvidia_blocked ? "cg_csr_blocked_prod" : "cg_csr_prod");
+
+ cl_uint vec_size = cl_uint(viennacl::traits::size(p));
+ cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
+
+ k.local_work_size(0, 128);
+ k.global_work_size(0, 128*128);
+
+ if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
+ {
+ k.local_work_size(0, 256);
+ k.global_work_size(0, 256*256);
+ }
+
+ if (use_nvidia_blocked)
+ {
+ viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
+ p,
+ Ap,
+ vec_size,
+ inner_prod_buffer,
+ buffer_size_per_vector,
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
+ ));
+ }
+ else
+ {
+ viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
+ p,
+ Ap,
+ vec_size,
+ inner_prod_buffer,
+ buffer_size_per_vector,
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+ viennacl::ocl::local_mem(1024 * sizeof(NumericT))
+ ));
+ }
+
+}
+
+template<typename NumericT>
+void pipelined_cg_prod(coordinate_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> & inner_prod_buffer)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);
+
+ cl_uint vec_size = cl_uint(viennacl::traits::size(p));
+ cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
+
+ Ap.clear();
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "cg_coo_prod");
+ unsigned int thread_num = 256; //k.local_work_size(0);
+
+ k.local_work_size(0, thread_num);
+
+ k.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases
+
+ viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
+ p,
+ Ap,
+ vec_size,
+ viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
+ viennacl::ocl::local_mem(sizeof(NumericT)*thread_num),
+ inner_prod_buffer,
+ buffer_size_per_vector,
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
+ ));
+}
+
+template<typename NumericT>
+void pipelined_cg_prod(ell_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> & inner_prod_buffer)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);
+
+ cl_uint vec_size = cl_uint(viennacl::traits::size(p));
+ cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "cg_ell_prod");
+
+ unsigned int thread_num = 128;
+ unsigned int group_num = 256;
+
+ k.local_work_size(0, thread_num);
+ k.global_work_size(0, thread_num * group_num);
+
+ if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
+ {
+ k.local_work_size(0, 256);
+ k.global_work_size(0, 256*256);
+ }
+
+ viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
+ A.handle().opencl_handle(),
+ cl_uint(A.internal_size1()),
+ cl_uint(A.maxnnz()),
+ cl_uint(A.internal_maxnnz()),
+ viennacl::traits::opencl_handle(p),
+ viennacl::traits::opencl_handle(Ap),
+ vec_size,
+ inner_prod_buffer,
+ buffer_size_per_vector,
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
+ )
+ );
+}
+
+template<typename NumericT>
+void pipelined_cg_prod(sliced_ell_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> & inner_prod_buffer)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);
+
+ cl_uint vec_size = cl_uint(viennacl::traits::size(p));
+ cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "cg_sliced_ell_prod");
+
+ vcl_size_t thread_num = std::max(A.rows_per_block(), static_cast<vcl_size_t>(128));
+ unsigned int group_num = 256;
+
+ if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
+ thread_num = 256;
+
+ k.local_work_size(0, thread_num);
+ k.global_work_size(0, thread_num * group_num);
+
+ viennacl::ocl::enqueue(k(A.handle1().opencl_handle(),
+ A.handle2().opencl_handle(),
+ A.handle3().opencl_handle(),
+ A.handle().opencl_handle(),
+ viennacl::traits::opencl_handle(p),
+ viennacl::traits::opencl_handle(Ap),
+ vec_size,
+ cl_uint(A.rows_per_block()),
+ inner_prod_buffer,
+ buffer_size_per_vector,
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
+ )
+ );
+}
+
+
+template<typename NumericT>
+void pipelined_cg_prod(hyb_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> & inner_prod_buffer)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);
+
+ cl_uint vec_size = cl_uint(viennacl::traits::size(p));
+ cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "cg_hyb_prod");
+
+ unsigned int thread_num = 128;
+ unsigned int group_num = 128;
+
+ k.local_work_size(0, thread_num);
+ k.global_work_size(0, thread_num * group_num);
+
+ if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
+ {
+ k.local_work_size(0, 256);
+ k.global_work_size(0, 256*256);
+ }
+
+ viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
+ A.handle().opencl_handle(),
+ A.handle3().opencl_handle(),
+ A.handle4().opencl_handle(),
+ A.handle5().opencl_handle(),
+ cl_uint(A.internal_size1()),
+ cl_uint(A.ell_nnz()),
+ cl_uint(A.internal_ellnnz()),
+ viennacl::traits::opencl_handle(p),
+ viennacl::traits::opencl_handle(Ap),
+ vec_size,
+ inner_prod_buffer,
+ buffer_size_per_vector,
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
+ )
+ );
+}
+
+
+//////////////////////////// BiCGStab ////////////////////////
+
+template<typename NumericT>
+void pipelined_bicgstab_update_s(vector_base<NumericT> & s,
+ vector_base<NumericT> & r,
+ vector_base<NumericT> const & Ap,
+ vector_base<NumericT> & inner_prod_buffer,
+ vcl_size_t buffer_chunk_size,
+ vcl_size_t buffer_chunk_offset)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(s).context());
+ viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_update_s");
+ cl_uint vec_size = cl_uint(viennacl::traits::size(s));
+
+ k.local_work_size(0, 128);
+ k.global_work_size(0, 128*128);
+
+ if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
+ {
+ k.local_work_size(0, 256);
+ k.global_work_size(0, 256*256);
+ }
+
+ cl_uint chunk_size = cl_uint(buffer_chunk_size);
+ cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
+ viennacl::ocl::enqueue(k(s, r, Ap,
+ inner_prod_buffer, chunk_size, chunk_offset, vec_size,
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))));
+}
+
+template<typename NumericT>
+void pipelined_bicgstab_vector_update(vector_base<NumericT> & result, NumericT alpha, vector_base<NumericT> & p, NumericT omega, vector_base<NumericT> const & s,
+ vector_base<NumericT> & residual, vector_base<NumericT> const & As,
+ NumericT beta, vector_base<NumericT> const & Ap,
+ vector_base<NumericT> const & r0star,
+ vector_base<NumericT> & inner_prod_buffer, vcl_size_t buffer_chunk_size)
+{
+ (void)buffer_chunk_size;
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(s).context());
+ viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_vector_update");
+ cl_uint vec_size = cl_uint(viennacl::traits::size(result));
+
+ k.local_work_size(0, 128);
+ k.global_work_size(0, 128*128);
+
+ if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
+ {
+ k.local_work_size(0, 256);
+ k.global_work_size(0, 256*256);
+ }
+
+ viennacl::ocl::enqueue(k(result, alpha, p, omega, s,
+ residual, As,
+ beta, Ap,
+ r0star,
+ inner_prod_buffer,
+ vec_size, viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
+ )
+ );
+}
+
+template<typename NumericT>
+void pipelined_bicgstab_prod(compressed_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> const & r0star,
+ vector_base<NumericT> & inner_prod_buffer,
+ vcl_size_t buffer_chunk_size,
+ vcl_size_t buffer_chunk_offset)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);
+
+ bool use_nvidia_blocked = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id && (double(A.nnz()) / double(A.size1()) > 12.0));
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), use_nvidia_blocked ? "bicgstab_csr_blocked_prod" : "bicgstab_csr_prod");
+
+ cl_uint vec_size = cl_uint(viennacl::traits::size(p));
+ cl_uint chunk_size = cl_uint(buffer_chunk_size);
+ cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
+
+ k.local_work_size(0, 128);
+ k.global_work_size(0, 128*128);
+
+ if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
+ {
+ k.local_work_size(0, 256);
+ k.global_work_size(0, 256*256);
+ }
+
+ if (use_nvidia_blocked)
+ {
+ viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
+ p,
+ Ap,
+ r0star,
+ vec_size,
+ inner_prod_buffer, chunk_size, chunk_offset,
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
+ ));
+ }
+ else
+ {
+ viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
+ p,
+ Ap,
+ r0star,
+ vec_size,
+ inner_prod_buffer, chunk_size, chunk_offset,
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
+ ));
+ }
+
+}
+
+
+template<typename NumericT>
+void pipelined_bicgstab_prod(coordinate_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> const & r0star,
+ vector_base<NumericT> & inner_prod_buffer,
+ vcl_size_t buffer_chunk_size,
+ vcl_size_t buffer_chunk_offset)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);
+
+ cl_uint vec_size = cl_uint(viennacl::traits::size(p));
+ cl_uint chunk_size = cl_uint(buffer_chunk_size);
+ cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
+
+ Ap.clear();
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_coo_prod");
+ unsigned int thread_num = 256; //k.local_work_size(0);
+
+ k.local_work_size(0, thread_num);
+
+ k.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases
+
+ viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
+ p,
+ Ap,
+ r0star,
+ vec_size,
+ viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
+ viennacl::ocl::local_mem(sizeof(NumericT)*thread_num),
+ inner_prod_buffer, chunk_size, chunk_offset,
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
+ ));
+}
+
+template<typename NumericT>
+void pipelined_bicgstab_prod(ell_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> const & r0star,
+ vector_base<NumericT> & inner_prod_buffer,
+ vcl_size_t buffer_chunk_size,
+ vcl_size_t buffer_chunk_offset)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);
+
+ cl_uint vec_size = cl_uint(viennacl::traits::size(p));
+ cl_uint chunk_size = cl_uint(buffer_chunk_size);
+ cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_ell_prod");
+
+ unsigned int thread_num = 128;
+ unsigned int group_num = 128;
+
+ k.local_work_size(0, thread_num);
+ k.global_work_size(0, thread_num * group_num);
+
+ if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
+ {
+ k.local_work_size(0, 256);
+ k.global_work_size(0, 256*256);
+ }
+
+ viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
+ A.handle().opencl_handle(),
+ cl_uint(A.internal_size1()),
+ cl_uint(A.maxnnz()),
+ cl_uint(A.internal_maxnnz()),
+ viennacl::traits::opencl_handle(p),
+ viennacl::traits::opencl_handle(Ap),
+ r0star,
+ vec_size,
+ inner_prod_buffer, chunk_size, chunk_offset,
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
+ )
+ );
+}
+
+template<typename NumericT>
+void pipelined_bicgstab_prod(sliced_ell_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> const & r0star,
+ vector_base<NumericT> & inner_prod_buffer,
+ vcl_size_t buffer_chunk_size,
+ vcl_size_t buffer_chunk_offset)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);
+
+ cl_uint vec_size = cl_uint(viennacl::traits::size(p));
+ cl_uint chunk_size = cl_uint(buffer_chunk_size);
+ cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_sliced_ell_prod");
+
+ vcl_size_t thread_num = std::max(A.rows_per_block(), static_cast<vcl_size_t>(128));
+ unsigned int group_num = 256;
+
+ if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
+ thread_num = 256;
+
+ k.local_work_size(0, thread_num);
+ k.global_work_size(0, thread_num * group_num);
+
+ viennacl::ocl::enqueue(k(A.handle1().opencl_handle(),
+ A.handle2().opencl_handle(),
+ A.handle3().opencl_handle(),
+ A.handle().opencl_handle(),
+ viennacl::traits::opencl_handle(p),
+ viennacl::traits::opencl_handle(Ap),
+ r0star,
+ vec_size,
+ cl_uint(A.rows_per_block()),
+ inner_prod_buffer, chunk_size, chunk_offset,
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
+ )
+ );
+}
+
+
+template<typename NumericT>
+void pipelined_bicgstab_prod(hyb_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> const & r0star,
+ vector_base<NumericT> & inner_prod_buffer,
+ vcl_size_t buffer_chunk_size,
+ vcl_size_t buffer_chunk_offset)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx);
+
+ cl_uint vec_size = cl_uint(viennacl::traits::size(p));
+ cl_uint chunk_size = cl_uint(buffer_chunk_size);
+ cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_hyb_prod");
+
+ unsigned int thread_num = 256;
+ unsigned int group_num = 128;
+
+ k.local_work_size(0, thread_num);
+ k.global_work_size(0, thread_num * group_num);
+
+ if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
+ {
+ k.local_work_size(0, 256);
+ k.global_work_size(0, 256*256);
+ }
+
+ viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
+ A.handle().opencl_handle(),
+ A.handle3().opencl_handle(),
+ A.handle4().opencl_handle(),
+ A.handle5().opencl_handle(),
+ cl_uint(A.internal_size1()),
+ cl_uint(A.ell_nnz()),
+ cl_uint(A.internal_ellnnz()),
+ viennacl::traits::opencl_handle(p),
+ viennacl::traits::opencl_handle(Ap),
+ r0star,
+ vec_size,
+ inner_prod_buffer, chunk_size, chunk_offset,
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
+ )
+ );
+}
+
+///////////////////////////////////
+
+/** @brief Performs a vector normalization needed for an efficient pipelined GMRES algorithm.
+ *
+ * This routine computes, for vectors 'r' and 'v_k':
+ * Second reduction step for ||v_k||
+ * v_k /= ||v_k||
+ * First reduction step for <r, v_k>
+ */
+template <typename T>
+void pipelined_gmres_normalize_vk(vector_base<T> & v_k,
+ vector_base<T> const & residual,
+ vector_base<T> & R_buffer,
+ vcl_size_t offset_in_R,
+ vector_base<T> const & inner_prod_buffer,
+ vector_base<T> & r_dot_vk_buffer,
+ vcl_size_t buffer_chunk_size,
+ vcl_size_t buffer_chunk_offset)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(v_k).context());
+ viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_normalize_vk");
+
+ k.local_work_size(0, 128);
+ k.global_work_size(0, 128*128);
+
+ cl_uint size_vk = cl_uint(v_k.size());
+ cl_uint vk_offset = cl_uint(viennacl::traits::start(v_k));
+ cl_uint R_offset = cl_uint(offset_in_R);
+ cl_uint chunk_size = cl_uint(buffer_chunk_size);
+ cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
+ viennacl::ocl::enqueue(k(v_k, vk_offset,
+ residual,
+ R_buffer, R_offset,
+ inner_prod_buffer, chunk_size,
+ r_dot_vk_buffer, chunk_offset,
+ size_vk,
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(T))
+ ));
+}
+
+template <typename T>
+void pipelined_gmres_gram_schmidt_stage1(vector_base<T> const & device_krylov_basis,
+ vcl_size_t v_k_size,
+ vcl_size_t v_k_internal_size,
+ vcl_size_t param_k,
+ vector_base<T> & vi_in_vk_buffer,
+ vcl_size_t buffer_chunk_size)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(device_krylov_basis).context());
+ viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_gram_schmidt_1");
+
+ k.local_work_size(0, 128);
+ k.global_work_size(0, 128*128);
+
+ cl_uint size_vk = cl_uint(v_k_size);
+ cl_uint internal_size_vk = cl_uint(v_k_internal_size);
+ cl_uint ocl_k = cl_uint(param_k);
+ cl_uint chunk_size = cl_uint(buffer_chunk_size);
+ viennacl::ocl::enqueue(k(device_krylov_basis, size_vk, internal_size_vk, ocl_k,
+ vi_in_vk_buffer, chunk_size
+ ));
+}
+
+template <typename T>
+void pipelined_gmres_gram_schmidt_stage2(vector_base<T> & device_krylov_basis,
+ vcl_size_t v_k_size,
+ vcl_size_t v_k_internal_size,
+ vcl_size_t param_k,
+ vector_base<T> const & vi_in_vk_buffer,
+ vector_base<T> & R_buffer,
+ vcl_size_t krylov_dim,
+ vector_base<T> & inner_prod_buffer,
+ vcl_size_t buffer_chunk_size)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(device_krylov_basis).context());
+ viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_gram_schmidt_2");
+
+ k.local_work_size(0, 128);
+ k.global_work_size(0, 128*128);
+
+ cl_uint size_vk = cl_uint(v_k_size);
+ cl_uint internal_size_vk = cl_uint(v_k_internal_size);
+ cl_uint ocl_k = cl_uint(param_k);
+ cl_uint chunk_size = cl_uint(buffer_chunk_size);
+ cl_uint ocl_krylov_dim = cl_uint(krylov_dim);
+ viennacl::ocl::enqueue(k(device_krylov_basis, size_vk, internal_size_vk, ocl_k,
+ vi_in_vk_buffer, chunk_size,
+ R_buffer, ocl_krylov_dim,
+ inner_prod_buffer,
+ viennacl::ocl::local_mem(7 * k.local_work_size() * sizeof(T))
+ ));
+}
+
+template <typename T>
+void pipelined_gmres_update_result(vector_base<T> & result,
+ vector_base<T> const & residual,
+ vector_base<T> const & krylov_basis,
+ vcl_size_t v_k_size,
+ vcl_size_t v_k_internal_size,
+ vector_base<T> const & coefficients,
+ vcl_size_t param_k)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(result).context());
+ viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_update_result");
+
+ k.local_work_size(0, 128);
+ k.global_work_size(0, 128*128);
+
+ cl_uint size_vk = cl_uint(v_k_size);
+ cl_uint internal_size_vk = cl_uint(v_k_internal_size);
+ cl_uint ocl_k = cl_uint(param_k);
+ viennacl::ocl::enqueue(k(result,
+ residual,
+ krylov_basis, size_vk, internal_size_vk,
+ coefficients, ocl_k
+ ));
+}
+
+
+template <typename T>
+void pipelined_gmres_prod(compressed_matrix<T> const & A,
+ vector_base<T> const & p,
+ vector_base<T> & Ap,
+ vector_base<T> & inner_prod_buffer)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);
+
+ bool use_nvidia_blocked = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id && (double(A.nnz()) / double(A.size1()) > 12.0));
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), use_nvidia_blocked ? "gmres_csr_blocked_prod" : "gmres_csr_prod");
+
+ cl_uint vec_size = cl_uint(viennacl::traits::size(p));
+ cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
+ cl_uint start_p = cl_uint(viennacl::traits::start(p));
+ cl_uint start_Ap = cl_uint(viennacl::traits::start(Ap));
+
+ k.local_work_size(0, 128);
+ k.global_work_size(0, 128*128);
+
+ if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
+ {
+ k.local_work_size(0, 256);
+ k.global_work_size(0, 256*128);
+ }
+
+ if (use_nvidia_blocked)
+ {
+ viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
+ p, start_p,
+ Ap, start_Ap,
+ vec_size,
+ inner_prod_buffer,
+ buffer_size_per_vector,
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(T))
+ ));
+ }
+ else
+ {
+ viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
+ p, start_p,
+ Ap, start_Ap,
+ vec_size,
+ inner_prod_buffer,
+ buffer_size_per_vector,
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
+ viennacl::ocl::local_mem(1024 * sizeof(T))
+ ));
+ }
+}
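
A small illustrative calculation (not part of this patch, with made-up sizes) of the vendor heuristic above: the blocked CSR kernel is chosen on NVIDIA devices whenever the average number of nonzeros per row exceeds 12.

#include <iostream>
#include <string>

int main() {
  // Assumed example sizes; any CSR matrix with nnz/rows > 12 takes the blocked path on NVIDIA.
  double nnz  = 150000.0;   // stand-in for A.nnz()
  double rows = 10000.0;    // stand-in for A.size1()
  bool on_nvidia = true;    // stand-in for ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id
  bool use_nvidia_blocked = on_nvidia && (nnz / rows > 12.0);   // 15 > 12 -> true
  std::cout << (use_nvidia_blocked ? "gmres_csr_blocked_prod" : "gmres_csr_prod") << "\n";
  return 0;
}
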
+
+template <typename T>
+void pipelined_gmres_prod(coordinate_matrix<T> const & A,
+ vector_base<T> const & p,
+ vector_base<T> & Ap,
+ vector_base<T> & inner_prod_buffer)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);
+
+ cl_uint vec_size = cl_uint(viennacl::traits::size(p));
+ cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
+ cl_uint start_p = cl_uint(viennacl::traits::start(p));
+ cl_uint start_Ap = cl_uint(viennacl::traits::start(Ap));
+
+ Ap.clear();
+ inner_prod_buffer.clear();
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_coo_prod");
+ unsigned int thread_num = 128; //k.local_work_size(0);
+
+ k.local_work_size(0, thread_num);
+
+ k.global_work_size(0, 64 * thread_num); // 64 work groups are hard-coded for now; this gives reasonable performance in most cases.
+
+ viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
+ p, start_p,
+ Ap, start_Ap,
+ vec_size,
+ viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
+ viennacl::ocl::local_mem(sizeof(T)*thread_num),
+ inner_prod_buffer,
+ buffer_size_per_vector,
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(T))
+ ));
+}
+
+template <typename T>
+void pipelined_gmres_prod(ell_matrix<T> const & A,
+ vector_base<T> const & p,
+ vector_base<T> & Ap,
+ vector_base<T> & inner_prod_buffer)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);
+
+ cl_uint vec_size = cl_uint(viennacl::traits::size(p));
+ cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
+ cl_uint start_p = cl_uint(viennacl::traits::start(p));
+ cl_uint start_Ap = cl_uint(viennacl::traits::start(Ap));
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_ell_prod");
+
+ unsigned int thread_num = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) ? 256 : 128;
+ unsigned int group_num = 128;
+
+ k.local_work_size(0, thread_num);
+ k.global_work_size(0, thread_num * group_num);
+
+ viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
+ A.handle().opencl_handle(),
+ cl_uint(A.internal_size1()),
+ cl_uint(A.maxnnz()),
+ cl_uint(A.internal_maxnnz()),
+ viennacl::traits::opencl_handle(p), start_p,
+ viennacl::traits::opencl_handle(Ap), start_Ap,
+ vec_size,
+ inner_prod_buffer,
+ buffer_size_per_vector,
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(T))
+ )
+ );
+}
+
+template <typename T>
+void pipelined_gmres_prod(sliced_ell_matrix<T> const & A,
+ vector_base<T> const & p,
+ vector_base<T> & Ap,
+ vector_base<T> & inner_prod_buffer)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);
+
+ cl_uint vec_size = cl_uint(viennacl::traits::size(p));
+ cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
+ cl_uint start_p = cl_uint(viennacl::traits::start(p));
+ cl_uint start_Ap = cl_uint(viennacl::traits::start(Ap));
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_sliced_ell_prod");
+
+ vcl_size_t thread_num = std::max(A.rows_per_block(), static_cast<vcl_size_t>(128));
+ unsigned int group_num = 128;
+
+ if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
+ thread_num = 256;
+
+ k.local_work_size(0, thread_num);
+ k.global_work_size(0, thread_num * group_num);
+
+ viennacl::ocl::enqueue(k(A.handle1().opencl_handle(),
+ A.handle2().opencl_handle(),
+ A.handle3().opencl_handle(),
+ A.handle().opencl_handle(),
+ viennacl::traits::opencl_handle(p), start_p,
+ viennacl::traits::opencl_handle(Ap), start_Ap,
+ vec_size,
+ cl_uint(A.rows_per_block()),
+ inner_prod_buffer,
+ buffer_size_per_vector,
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(T))
+ )
+ );
+}
+
+
+template <typename T>
+void pipelined_gmres_prod(hyb_matrix<T> const & A,
+ vector_base<T> const & p,
+ vector_base<T> & Ap,
+ vector_base<T> & inner_prod_buffer)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::iterative<T>::init(ctx);
+
+ cl_uint vec_size = cl_uint(viennacl::traits::size(p));
+ cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
+ cl_uint start_p = cl_uint(viennacl::traits::start(p));
+ cl_uint start_Ap = cl_uint(viennacl::traits::start(Ap));
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_hyb_prod");
+
+ unsigned int thread_num = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) ? 256 : 128;
+ unsigned int group_num = 128;
+
+ k.local_work_size(0, thread_num);
+ k.global_work_size(0, thread_num * group_num);
+
+
+ viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
+ A.handle().opencl_handle(),
+ A.handle3().opencl_handle(),
+ A.handle4().opencl_handle(),
+ A.handle5().opencl_handle(),
+ cl_uint(A.internal_size1()),
+ cl_uint(A.ell_nnz()),
+ cl_uint(A.internal_ellnnz()),
+ viennacl::traits::opencl_handle(p), start_p,
+ viennacl::traits::opencl_handle(Ap), start_Ap,
+ vec_size,
+ inner_prod_buffer,
+ buffer_size_per_vector,
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
+ viennacl::ocl::local_mem(k.local_work_size() * sizeof(T))
+ )
+ );
+}
+
+
+} //namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/amg.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/amg.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/amg.hpp
new file mode 100644
index 0000000..b0252d7
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/amg.hpp
@@ -0,0 +1,393 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_AMG_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_AMG_HPP
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+#include "viennacl/linalg/opencl/common.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/amg.hpp
+ * @brief OpenCL kernel file for operations related to algebraic multigrid */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+
+template<typename StringT>
+void generate_amg_influence_trivial(StringT & source)
+{
+
+ source.append("__kernel void amg_influence_trivial( \n");
+ source.append(" __global const unsigned int * A_row_indices, \n");
+ source.append(" __global const unsigned int * A_col_indices, \n");
+ source.append(" unsigned int A_size1, \n");
+ source.append(" unsigned int A_nnz, \n");
+ source.append(" __global unsigned int * influences_row, \n");
+ source.append(" __global unsigned int * influences_id, \n");
+ source.append(" __global unsigned int * influences_values) { \n");
+
+ source.append(" for (unsigned int i = get_global_id(0); i < A_size1; i += get_global_size(0)) \n");
+ source.append(" { \n");
+ source.append(" unsigned int tmp = A_row_indices[i]; \n");
+ source.append(" influences_row[i] = tmp; \n");
+ source.append(" influences_values[i] = A_row_indices[i+1] - tmp; \n");
+ source.append(" } \n");
+
+ source.append(" for (unsigned int i = get_global_id(0); i < A_nnz; i += get_global_size(0)) \n");
+ source.append(" influences_id[i] = A_col_indices[i]; \n");
+
+ source.append(" if (get_global_id(0) == 0) \n");
+ source.append(" influences_row[A_size1] = A_row_indices[A_size1]; \n");
+ source.append("} \n");
+
+}
+
+
+template<typename StringT>
+void generate_amg_pmis2_init_workdata(StringT & source)
+{
+
+ source.append("__kernel void amg_pmis2_init_workdata( \n");
+ source.append(" __global unsigned int *work_state, \n");
+ source.append(" __global unsigned int *work_random, \n");
+ source.append(" __global unsigned int *work_index, \n");
+ source.append(" __global unsigned int const *point_types, \n");
+ source.append(" __global unsigned int const *random_weights, \n");
+ source.append(" unsigned int size) { \n");
+
+ source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
+ source.append(" switch (point_types[i]) { \n");
+ source.append(" case 0: work_state[i] = 1; break; \n"); //viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED
+ source.append(" case 1: work_state[i] = 2; break; \n"); //viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE
+ source.append(" case 2: work_state[i] = 0; break; \n"); //viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE
+
+ source.append(" default: break; // do nothing \n");
+ source.append(" } \n");
+
+ source.append(" work_random[i] = random_weights[i]; \n");
+ source.append(" work_index[i] = i; \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
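
For reference (a sketch, not part of the patch): the switch above re-encodes the point types into working states so that a plain maximum prefers coarse points. Written out as comments:

// Assumed illustration of the encoding used by amg_pmis2_init_workdata above:
//   point_types[] value           ->  work_state[] value
//   0 (POINT_TYPE_UNDECIDED)      ->  1
//   1 (POINT_TYPE_COARSE)         ->  2
//   2 (POINT_TYPE_FINE)           ->  0
// Higher work_state wins in the triple-max of amg_pmis2_max_neighborhood,
// so coarse neighbors dominate undecided ones, which dominate fine ones.
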
+
+
+
+template<typename StringT>
+void generate_amg_pmis2_max_neighborhood(StringT & source)
+{
+
+ source.append("__kernel void amg_pmis2_max_neighborhood( \n");
+ source.append(" __global unsigned int *work_state, \n");
+ source.append(" __global unsigned int *work_random, \n");
+ source.append(" __global unsigned int *work_index, \n");
+ source.append(" __global unsigned int *work_state2, \n");
+ source.append(" __global unsigned int *work_random2, \n");
+ source.append(" __global unsigned int *work_index2, \n");
+ source.append(" __global unsigned int const *influences_row, \n");
+ source.append(" __global unsigned int const *influences_id, \n");
+ source.append(" unsigned int size) { \n");
+
+ source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
+
+ // load
+ source.append(" unsigned int state = work_state[i]; \n");
+ source.append(" unsigned int random = work_random[i]; \n");
+ source.append(" unsigned int index = work_index[i]; \n");
+
+ // max
+ source.append(" unsigned int j_stop = influences_row[i + 1]; \n");
+ source.append(" for (unsigned int j = influences_row[i]; j < j_stop; ++j) { \n");
+ source.append(" unsigned int influenced_point_id = influences_id[j]; \n");
+
+ // lexicographic triple-max (not particularly pretty, but does the job):
+ source.append(" if (state < work_state[influenced_point_id]) { \n");
+ source.append(" state = work_state[influenced_point_id]; \n");
+ source.append(" random = work_random[influenced_point_id]; \n");
+ source.append(" index = work_index[influenced_point_id]; \n");
+ source.append(" } else if (state == work_state[influenced_point_id]) { \n");
+ source.append(" if (random < work_random[influenced_point_id]) { \n");
+ source.append(" state = work_state[influenced_point_id]; \n");
+ source.append(" random = work_random[influenced_point_id]; \n");
+ source.append(" index = work_index[influenced_point_id]; \n");
+ source.append(" } else if (random == work_random[influenced_point_id]) { \n");
+ source.append(" if (index < work_index[influenced_point_id]) { \n");
+ source.append(" state = work_state[influenced_point_id]; \n");
+ source.append(" random = work_random[influenced_point_id]; \n");
+ source.append(" index = work_index[influenced_point_id]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ source.append(" }\n"); //for
+
+ // store
+ source.append(" work_state2[i] = state; \n");
+ source.append(" work_random2[i] = random; \n");
+ source.append(" work_index2[i] = index; \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+
+
+template<typename StringT>
+void generate_amg_pmis2_mark_mis_nodes(StringT & source)
+{
+
+ source.append("__kernel void amg_pmis2_mark_mis_nodes( \n");
+ source.append(" __global unsigned int const *work_state, \n");
+ source.append(" __global unsigned int const *work_index, \n");
+ source.append(" __global unsigned int *point_types, \n");
+ source.append(" __global unsigned int *undecided_buffer, \n");
+ source.append(" unsigned int size) { \n");
+
+ source.append(" unsigned int num_undecided = 0; \n");
+ source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
+ source.append(" unsigned int max_state = work_state[i]; \n");
+ source.append(" unsigned int max_index = work_index[i]; \n");
+
+ source.append(" if (point_types[i] == 0) { \n"); // viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED
+ source.append(" if (i == max_index) point_types[i] = 1; \n"); // viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE
+ source.append(" else if (max_state == 2) point_types[i] = 2; \n"); // viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE
+ source.append(" else num_undecided += 1; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ // reduction in shared memory:
+ source.append(" __local unsigned int shared_buffer[256]; \n");
+ source.append(" shared_buffer[get_local_id(0)] = num_undecided; \n");
+ source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) < stride) shared_buffer[get_local_id(0)] += shared_buffer[get_local_id(0)+stride]; \n");
+ source.append(" } \n");
+
+ source.append(" if (get_local_id(0) == 0) \n");
+ source.append(" undecided_buffer[get_group_id(0)] = shared_buffer[0]; \n");
+
+ source.append("} \n");
+}
+
+
+template<typename StringT>
+void generate_amg_pmis2_reset_state(StringT & source)
+{
+
+ source.append("__kernel void amg_pmis2_reset_state( \n");
+ source.append(" __global unsigned int *point_types, \n");
+ source.append(" unsigned int size) { \n");
+
+ source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
+ source.append(" if (point_types[i] != 1) point_types[i] = 0;\n"); // mind mapping of POINT_TYPE_COARSE and POINT_TYPE_UNDECIDED
+ source.append(" } \n");
+
+ source.append("} \n");
+}
+
+
+
+//////////////
+
+
+
+template<typename StringT>
+void generate_amg_agg_propagate_coarse_indices(StringT & source)
+{
+
+ source.append(" __kernel void amg_agg_propagate_coarse_indices( \n");
+ source.append(" __global unsigned int *point_types, \n");
+ source.append(" __global unsigned int *coarse_ids, \n");
+ source.append(" __global unsigned int const *influences_row, \n");
+ source.append(" __global unsigned int const *influences_id, \n");
+ source.append(" unsigned int size) { \n");
+
+ source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n");
+ source.append(" { \n");
+ source.append(" if (point_types[i] == 1) { \n"); //viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE
+ source.append(" unsigned int coarse_index = coarse_ids[i]; \n");
+
+ source.append(" unsigned int j_stop = influences_row[i + 1]; \n");
+ source.append(" for (unsigned int j = influences_row[i]; j < j_stop; ++j) { \n");
+ source.append(" unsigned int influenced_point_id = influences_id[j]; \n");
+ source.append(" coarse_ids[influenced_point_id] = coarse_index; \n");
+ source.append(" if (influenced_point_id != i) point_types[influenced_point_id] = 2; \n"); //viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+}
+
+
+
+template<typename StringT>
+void generate_amg_agg_merge_undecided(StringT & source)
+{
+
+ source.append(" __kernel void amg_agg_merge_undecided( \n");
+ source.append(" __global unsigned int *point_types, \n");
+ source.append(" __global unsigned int *coarse_ids, \n");
+ source.append(" __global unsigned int const *influences_row, \n");
+ source.append(" __global unsigned int const *influences_id, \n");
+ source.append(" unsigned int size) { \n");
+
+ source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n");
+ source.append(" { \n");
+ source.append(" if (point_types[i] == 0) { \n"); //viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED
+
+ source.append(" unsigned int j_stop = influences_row[i + 1]; \n");
+ source.append(" for (unsigned int j = influences_row[i]; j < j_stop; ++j) { \n");
+ source.append(" unsigned int influenced_point_id = influences_id[j]; \n");
+ source.append(" if (point_types[influenced_point_id] != 0) { \n"); // viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED
+ source.append(" coarse_ids[i] = coarse_ids[influenced_point_id]; \n");
+ source.append(" break; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+}
+
+
+template<typename StringT>
+void generate_amg_agg_merge_undecided_2(StringT & source)
+{
+
+ source.append(" __kernel void amg_agg_merge_undecided_2( \n");
+ source.append(" __global unsigned int *point_types, \n");
+ source.append(" unsigned int size) { \n");
+
+ source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n");
+ source.append(" if (point_types[i] == 0) point_types[i] = 2; \n"); // POINT_TYPE_UNDECIDED to POINT_TYPE_FINE
+
+ source.append("} \n");
+}
+
+//////////////////////
+
+template<typename StringT>
+void generate_amg_interpol_ag(StringT & source, std::string const & numeric_string)
+{
+
+ source.append(" __kernel void amg_interpol_ag( \n");
+ source.append(" __global unsigned int * P_row_indices, \n");
+ source.append(" __global unsigned int * P_column_indices, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * P_elements, \n");
+ source.append(" __global const unsigned int * coarse_agg_ids, \n");
+ source.append(" unsigned int size) { \n");
+
+ source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n");
+ source.append(" { \n");
+ source.append(" P_row_indices[i] = i; \n");
+ source.append(" P_column_indices[i] = coarse_agg_ids[i]; \n");
+ source.append(" P_elements[i] = 1; \n");
+ source.append(" } \n");
+ source.append(" if (get_global_id(0) == 0) P_row_indices[size] = size; \n");
+ source.append(" } \n");
+
+}
+
+template<typename StringT>
+void generate_amg_interpol_sa(StringT & source, std::string const & numeric_string)
+{
+
+ source.append("__kernel void amg_interpol_sa( \n");
+ source.append(" __global unsigned int const *A_row_indices, \n");
+ source.append(" __global unsigned int const *A_col_indices, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" const *A_elements, \n");
+ source.append(" unsigned int A_size1, \n");
+ source.append(" unsigned int A_nnz, \n");
+ source.append(" __global unsigned int *Jacobi_row_indices, \n");
+ source.append(" __global unsigned int *Jacobi_col_indices, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *Jacobi_elements, \n");
+ source.append(" "); source.append(numeric_string); source.append(" omega) { \n");
+
+ source.append(" for (unsigned int row = get_global_id(0); row < A_size1; row += get_global_size(0)) \n");
+ source.append(" { \n");
+ source.append(" unsigned int row_begin = A_row_indices[row]; \n");
+ source.append(" unsigned int row_end = A_row_indices[row+1]; \n");
+
+ source.append(" Jacobi_row_indices[row] = row_begin; \n");
+
+ // Step 1: Extract diagonal:
+ source.append(" "); source.append(numeric_string); source.append(" diag = 0; \n");
+ source.append(" for (unsigned int j = row_begin; j < row_end; ++j) { \n");
+ source.append(" if (A_col_indices[j] == row) { \n");
+ source.append(" diag = A_elements[j]; \n");
+ source.append(" break; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ // Step 2: Write entries:
+ source.append(" for (unsigned int j = row_begin; j < row_end; ++j) { \n");
+ source.append(" unsigned int col_index = A_col_indices[j]; \n");
+ source.append(" Jacobi_col_indices[j] = col_index; \n");
+ source.append(" Jacobi_elements[j] = (col_index == row) ? (1 - omega) : (-omega * A_elements[j] / diag); \n");
+ source.append(" } \n");
+
+ source.append(" } \n");
+ source.append(" if (get_global_id(0) == 0) Jacobi_row_indices[A_size1] = A_nnz; \n");
+ source.append("} \n");
+
+}
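
In plain math (a restatement of what the loop above writes, not an addition to the patch, with a_ii the diagonal extracted in Step 1), the assembled operator is the damped Jacobi matrix used here for smoothed-aggregation interpolation:

S = I - \omega D^{-1} A, \qquad
S_{ij} = \begin{cases} 1 - \omega, & j = i,\\ -\,\omega\, a_{ij}/a_{ii}, & j \neq i. \end{cases}
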
+//////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for algebraic multigrid (AMG) operations. */
+template<typename NumericT>
+struct amg
+{
+ static std::string program_name()
+ {
+ return viennacl::ocl::type_to_string<NumericT>::apply() + "_amg";
+ }
+
+ static void init(viennacl::ocl::context & ctx)
+ {
+ static std::map<cl_context, bool> init_done;
+ if (!init_done[ctx.handle().get()])
+ {
+ viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+ std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+ std::string source;
+ source.reserve(2048);
+
+ viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+ generate_amg_influence_trivial(source);
+ generate_amg_pmis2_init_workdata(source);
+ generate_amg_pmis2_max_neighborhood(source);
+ generate_amg_pmis2_mark_mis_nodes(source);
+ generate_amg_pmis2_reset_state(source);
+ generate_amg_agg_propagate_coarse_indices(source);
+ generate_amg_agg_merge_undecided(source);
+ generate_amg_agg_merge_undecided_2(source);
+
+ generate_amg_interpol_ag(source, numeric_string);
+ generate_amg_interpol_sa(source, numeric_string);
+
+ std::string prog_name = program_name();
+ #ifdef VIENNACL_BUILD_INFO
+ std::cout << "Creating program " << prog_name << std::endl;
+ #endif
+ ctx.add_program(source, prog_name);
+ init_done[ctx.handle().get()] = true;
+ } //if
+ } //init
+};
+
+} // namespace kernels
+} // namespace opencl
+} // namespace linalg
+} // namespace viennacl
+#endif
+
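A minimal usage sketch (not part of this patch) for the kernel class above, following the same init/get_kernel pattern used by the pipelined GMRES helpers earlier in this commit; the context ctx and the actual kernel arguments are assumed to be set up by the caller:

// ctx obtained elsewhere, e.g. from viennacl::traits::opencl_handle(...).context()
viennacl::linalg::opencl::kernels::amg<float>::init(ctx);   // builds the program once per context

viennacl::ocl::kernel & k =
    ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<float>::program_name(),
                   "amg_influence_trivial");

k.local_work_size(0, 128);
k.global_work_size(0, 128 * 128);
// viennacl::ocl::enqueue(k(...));  // arguments per the kernel signature generated above
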
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas1.cpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas1.cpp b/native-viennaCL/src/main/cpp/libviennacl/src/blas1.cpp
new file mode 100644
index 0000000..a7319d5
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas1.cpp
@@ -0,0 +1,420 @@
+/* =========================================================================
+ Copyright (c) 2010-2014, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "init_vector.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+//include the generic inner product functions of ViennaCL
+#include "viennacl/linalg/inner_prod.hpp"
+
+//include the generic norm functions of ViennaCL
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+
+// IxAMAX
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLiamax(ViennaCLInt *index, ViennaCLVector x)
+{
+ viennacl::backend::mem_handle v1_handle;
+
+ if (init_vector(v1_handle, x) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ switch (x->precision)
+ {
+ case ViennaCLFloat:
+ {
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+ *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+ return ViennaCLSuccess;
+ }
+
+ case ViennaCLDouble:
+ {
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+ *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+ return ViennaCLSuccess;
+ }
+
+ default:
+ return ViennaCLGenericFailure;
+ }
+}
+
+
+
+
+// xASUM
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLasum(ViennaCLHostScalar *alpha, ViennaCLVector x)
+{
+ if ((*alpha)->precision != x->precision)
+ return ViennaCLGenericFailure;
+
+ viennacl::backend::mem_handle v1_handle;
+
+ if (init_vector(v1_handle, x) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ switch (x->precision)
+ {
+ case ViennaCLFloat:
+ {
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+ (*alpha)->value_float = viennacl::linalg::norm_1(v1);
+ return ViennaCLSuccess;
+ }
+
+ case ViennaCLDouble:
+ {
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+ (*alpha)->value_double = viennacl::linalg::norm_1(v1);
+ return ViennaCLSuccess;
+ }
+
+ default:
+ return ViennaCLGenericFailure;
+ }
+}
+
+
+
+// xAXPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLaxpy(ViennaCLHostScalar alpha, ViennaCLVector x, ViennaCLVector y)
+{
+ if (alpha->precision != x->precision)
+ return ViennaCLGenericFailure;
+
+ if (x->precision != y->precision)
+ return ViennaCLGenericFailure;
+
+ viennacl::backend::mem_handle v1_handle;
+ viennacl::backend::mem_handle v2_handle;
+
+ if (init_vector(v1_handle, x) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ if (init_vector(v2_handle, y) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ switch (x->precision)
+ {
+ case ViennaCLFloat:
+ {
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+ viennacl::vector_base<float> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+ v2 += alpha->value_float * v1;
+ return ViennaCLSuccess;
+ }
+
+ case ViennaCLDouble:
+ {
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+ viennacl::vector_base<double> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+ v2 += alpha->value_double * v1;
+ return ViennaCLSuccess;
+ }
+
+ default:
+ return ViennaCLGenericFailure;
+ }
+}
+
+
+// xCOPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLcopy(ViennaCLVector x, ViennaCLVector y)
+{
+ if (x->precision != y->precision)
+ return ViennaCLGenericFailure;
+
+ viennacl::backend::mem_handle v1_handle;
+ viennacl::backend::mem_handle v2_handle;
+
+ if (init_vector(v1_handle, x) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ if (init_vector(v2_handle, y) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ switch (x->precision)
+ {
+ case ViennaCLFloat:
+ {
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+ viennacl::vector_base<float> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+ v2 = v1;
+ return ViennaCLSuccess;
+ }
+
+ case ViennaCLDouble:
+ {
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+ viennacl::vector_base<double> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+ v2 = v1;
+ return ViennaCLSuccess;
+ }
+
+ default:
+ return ViennaCLGenericFailure;
+ }
+}
+
+// xDOT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLdot(ViennaCLHostScalar *alpha, ViennaCLVector x, ViennaCLVector y)
+{
+ if ((*alpha)->precision != x->precision)
+ return ViennaCLGenericFailure;
+
+ if (x->precision != y->precision)
+ return ViennaCLGenericFailure;
+
+ viennacl::backend::mem_handle v1_handle;
+ viennacl::backend::mem_handle v2_handle;
+
+ if (init_vector(v1_handle, x) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ if (init_vector(v2_handle, y) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ switch (x->precision)
+ {
+ case ViennaCLFloat:
+ {
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+ viennacl::vector_base<float> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+ (*alpha)->value_float = viennacl::linalg::inner_prod(v1, v2);
+ return ViennaCLSuccess;
+ }
+
+ case ViennaCLDouble:
+ {
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+ viennacl::vector_base<double> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+ (*alpha)->value_double = viennacl::linalg::inner_prod(v1, v2);
+ return ViennaCLSuccess;
+ }
+
+ default:
+ return ViennaCLGenericFailure;
+ }
+}
+
+// xNRM2
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLnrm2(ViennaCLHostScalar *alpha, ViennaCLVector x)
+{
+ if ((*alpha)->precision != x->precision)
+ return ViennaCLGenericFailure;
+
+ viennacl::backend::mem_handle v1_handle;
+
+ if (init_vector(v1_handle, x) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ switch (x->precision)
+ {
+ case ViennaCLFloat:
+ {
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+ (*alpha)->value_float = viennacl::linalg::norm_2(v1);
+ return ViennaCLSuccess;
+ }
+
+ case ViennaCLDouble:
+ {
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+ (*alpha)->value_double = viennacl::linalg::norm_2(v1);
+ return ViennaCLSuccess;
+ }
+
+ default:
+ return ViennaCLGenericFailure;
+ }
+}
+
+
+
+// xROT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLrot(ViennaCLVector x, ViennaCLVector y,
+ ViennaCLHostScalar c, ViennaCLHostScalar s)
+{
+ if (c->precision != x->precision)
+ return ViennaCLGenericFailure;
+
+ if (s->precision != x->precision)
+ return ViennaCLGenericFailure;
+
+ if (x->precision != y->precision)
+ return ViennaCLGenericFailure;
+
+ viennacl::backend::mem_handle v1_handle;
+ viennacl::backend::mem_handle v2_handle;
+
+ if (init_vector(v1_handle, x) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ if (init_vector(v2_handle, y) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ switch (x->precision)
+ {
+ case ViennaCLFloat:
+ {
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+ viennacl::vector_base<float> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+ viennacl::linalg::plane_rotation(v1, v2, c->value_float, s->value_float);
+ return ViennaCLSuccess;
+ }
+
+ case ViennaCLDouble:
+ {
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+ viennacl::vector_base<double> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+ viennacl::linalg::plane_rotation(v1, v2, c->value_double, s->value_double);
+ return ViennaCLSuccess;
+ }
+
+ default:
+ return ViennaCLGenericFailure;
+ }
+}
+
+// xSCAL
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLscal(ViennaCLHostScalar alpha, ViennaCLVector x)
+{
+ if (alpha->precision != x->precision)
+ return ViennaCLGenericFailure;
+
+ viennacl::backend::mem_handle v1_handle;
+
+ if (init_vector(v1_handle, x) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ switch (x->precision)
+ {
+ case ViennaCLFloat:
+ {
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+ v1 *= alpha->value_float;
+ return ViennaCLSuccess;
+ }
+
+ case ViennaCLDouble:
+ {
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+ v1 *= alpha->value_double;
+ return ViennaCLSuccess;
+ }
+
+ default:
+ return ViennaCLGenericFailure;
+ }
+}
+
+
+// xSWAP
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLswap(ViennaCLVector x, ViennaCLVector y)
+{
+ if (x->precision != y->precision)
+ return ViennaCLGenericFailure;
+
+ viennacl::backend::mem_handle v1_handle;
+ viennacl::backend::mem_handle v2_handle;
+
+ if (init_vector(v1_handle, x) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ if (init_vector(v2_handle, y) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ switch (x->precision)
+ {
+ case ViennaCLFloat:
+ {
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+ viennacl::vector_base<float> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+ viennacl::swap(v1, v2);
+ return ViennaCLSuccess;
+ }
+
+ case ViennaCLDouble:
+ {
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+ viennacl::vector_base<double> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+ viennacl::swap(v1, v2);
+ return ViennaCLSuccess;
+ }
+
+ default:
+ return ViennaCLGenericFailure;
+ }
+}
+
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas1.cu
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas1.cu b/native-viennaCL/src/main/cpp/libviennacl/src/blas1.cu
new file mode 100644
index 0000000..a7319d5
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas1.cu
@@ -0,0 +1,420 @@
+/* =========================================================================
+ Copyright (c) 2010-2014, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+#include "init_vector.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+//include the generic inner product functions of ViennaCL
+#include "viennacl/linalg/inner_prod.hpp"
+
+//include the generic norm functions of ViennaCL
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+
+// IxAMAX
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLiamax(ViennaCLInt *index, ViennaCLVector x)
+{
+ viennacl::backend::mem_handle v1_handle;
+
+ if (init_vector(v1_handle, x) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ switch (x->precision)
+ {
+ case ViennaCLFloat:
+ {
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+ *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+ return ViennaCLSuccess;
+ }
+
+ case ViennaCLDouble:
+ {
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+ *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+ return ViennaCLSuccess;
+ }
+
+ default:
+ return ViennaCLGenericFailure;
+ }
+}
+
+
+
+
+// xASUM
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLasum(ViennaCLHostScalar *alpha, ViennaCLVector x)
+{
+ if ((*alpha)->precision != x->precision)
+ return ViennaCLGenericFailure;
+
+ viennacl::backend::mem_handle v1_handle;
+
+ if (init_vector(v1_handle, x) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ switch (x->precision)
+ {
+ case ViennaCLFloat:
+ {
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+ (*alpha)->value_float = viennacl::linalg::norm_1(v1);
+ return ViennaCLSuccess;
+ }
+
+ case ViennaCLDouble:
+ {
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+ (*alpha)->value_double = viennacl::linalg::norm_1(v1);
+ return ViennaCLSuccess;
+ }
+
+ default:
+ return ViennaCLGenericFailure;
+ }
+}
+
+
+
+// xAXPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLaxpy(ViennaCLHostScalar alpha, ViennaCLVector x, ViennaCLVector y)
+{
+ if (alpha->precision != x->precision)
+ return ViennaCLGenericFailure;
+
+ if (x->precision != y->precision)
+ return ViennaCLGenericFailure;
+
+ viennacl::backend::mem_handle v1_handle;
+ viennacl::backend::mem_handle v2_handle;
+
+ if (init_vector(v1_handle, x) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ if (init_vector(v2_handle, y) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ switch (x->precision)
+ {
+ case ViennaCLFloat:
+ {
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+ viennacl::vector_base<float> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+ v2 += alpha->value_float * v1;
+ return ViennaCLSuccess;
+ }
+
+ case ViennaCLDouble:
+ {
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+ viennacl::vector_base<double> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+ v2 += alpha->value_double * v1;
+ return ViennaCLSuccess;
+ }
+
+ default:
+ return ViennaCLGenericFailure;
+ }
+}
+
+
+// xCOPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLcopy(ViennaCLVector x, ViennaCLVector y)
+{
+ if (x->precision != y->precision)
+ return ViennaCLGenericFailure;
+
+ viennacl::backend::mem_handle v1_handle;
+ viennacl::backend::mem_handle v2_handle;
+
+ if (init_vector(v1_handle, x) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ if (init_vector(v2_handle, y) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ switch (x->precision)
+ {
+ case ViennaCLFloat:
+ {
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+ viennacl::vector_base<float> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+ v2 = v1;
+ return ViennaCLSuccess;
+ }
+
+ case ViennaCLDouble:
+ {
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+ viennacl::vector_base<double> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+ v2 = v1;
+ return ViennaCLSuccess;
+ }
+
+ default:
+ return ViennaCLGenericFailure;
+ }
+}
+
+// xDOT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLdot(ViennaCLHostScalar *alpha, ViennaCLVector x, ViennaCLVector y)
+{
+ if ((*alpha)->precision != x->precision)
+ return ViennaCLGenericFailure;
+
+ if (x->precision != y->precision)
+ return ViennaCLGenericFailure;
+
+ viennacl::backend::mem_handle v1_handle;
+ viennacl::backend::mem_handle v2_handle;
+
+ if (init_vector(v1_handle, x) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ if (init_vector(v2_handle, y) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ switch (x->precision)
+ {
+ case ViennaCLFloat:
+ {
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+ viennacl::vector_base<float> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+ (*alpha)->value_float = viennacl::linalg::inner_prod(v1, v2);
+ return ViennaCLSuccess;
+ }
+
+ case ViennaCLDouble:
+ {
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+ viennacl::vector_base<double> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+ (*alpha)->value_double = viennacl::linalg::inner_prod(v1, v2);
+ return ViennaCLSuccess;
+ }
+
+ default:
+ return ViennaCLGenericFailure;
+ }
+}
+
+// xNRM2
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLnrm2(ViennaCLHostScalar *alpha, ViennaCLVector x)
+{
+ if ((*alpha)->precision != x->precision)
+ return ViennaCLGenericFailure;
+
+ viennacl::backend::mem_handle v1_handle;
+
+ if (init_vector(v1_handle, x) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ switch (x->precision)
+ {
+ case ViennaCLFloat:
+ {
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+ (*alpha)->value_float = viennacl::linalg::norm_2(v1);
+ return ViennaCLSuccess;
+ }
+
+ case ViennaCLDouble:
+ {
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+ (*alpha)->value_double = viennacl::linalg::norm_2(v1);
+ return ViennaCLSuccess;
+ }
+
+ default:
+ return ViennaCLGenericFailure;
+ }
+}
+
+
+
+// xROT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLrot(ViennaCLVector x, ViennaCLVector y,
+ ViennaCLHostScalar c, ViennaCLHostScalar s)
+{
+ if (c->precision != x->precision)
+ return ViennaCLGenericFailure;
+
+ if (s->precision != x->precision)
+ return ViennaCLGenericFailure;
+
+ if (x->precision != y->precision)
+ return ViennaCLGenericFailure;
+
+ viennacl::backend::mem_handle v1_handle;
+ viennacl::backend::mem_handle v2_handle;
+
+ if (init_vector(v1_handle, x) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ if (init_vector(v2_handle, y) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ switch (x->precision)
+ {
+ case ViennaCLFloat:
+ {
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+ viennacl::vector_base<float> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+ viennacl::linalg::plane_rotation(v1, v2, c->value_float, s->value_float);
+ return ViennaCLSuccess;
+ }
+
+ case ViennaCLDouble:
+ {
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+ viennacl::vector_base<double> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+ viennacl::linalg::plane_rotation(v1, v2, c->value_double, s->value_double);
+ return ViennaCLSuccess;
+ }
+
+ default:
+ return ViennaCLGenericFailure;
+ }
+}
+
+// xSCAL
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLscal(ViennaCLHostScalar alpha, ViennaCLVector x)
+{
+ if (alpha->precision != x->precision)
+ return ViennaCLGenericFailure;
+
+ viennacl::backend::mem_handle v1_handle;
+
+ if (init_vector(v1_handle, x) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ switch (x->precision)
+ {
+ case ViennaCLFloat:
+ {
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+ v1 *= alpha->value_float;
+ return ViennaCLSuccess;
+ }
+
+ case ViennaCLDouble:
+ {
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+
+ v1 *= alpha->value_double;
+ return ViennaCLSuccess;
+ }
+
+ default:
+ return ViennaCLGenericFailure;
+ }
+}
+
+
+// xSWAP
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLswap(ViennaCLVector x, ViennaCLVector y)
+{
+ if (x->precision != y->precision)
+ return ViennaCLGenericFailure;
+
+ viennacl::backend::mem_handle v1_handle;
+ viennacl::backend::mem_handle v2_handle;
+
+ if (init_vector(v1_handle, x) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ if (init_vector(v2_handle, y) != ViennaCLSuccess)
+ return ViennaCLGenericFailure;
+
+ switch (x->precision)
+ {
+ case ViennaCLFloat:
+ {
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+ viennacl::vector_base<float> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+ viennacl::swap(v1, v2);
+ return ViennaCLSuccess;
+ }
+
+ case ViennaCLDouble:
+ {
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(v1_handle, static_cast<viennacl::vcl_size_t>(x->size), static_cast<viennacl::vcl_size_t>(x->offset), static_cast<difference_type>(x->inc));
+ viennacl::vector_base<double> v2(v2_handle, static_cast<viennacl::vcl_size_t>(y->size), static_cast<viennacl::vcl_size_t>(y->offset), static_cast<difference_type>(y->inc));
+
+ viennacl::swap(v1, v2);
+ return ViennaCLSuccess;
+ }
+
+ default:
+ return ViennaCLGenericFailure;
+ }
+}
+
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas1_cuda.cu
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas1_cuda.cu b/native-viennaCL/src/main/cpp/libviennacl/src/blas1_cuda.cu
new file mode 100644
index 0000000..e6dddbb
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas1_cuda.cu
@@ -0,0 +1,264 @@
+/* =========================================================================
+ Copyright (c) 2010-2014, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+//include the generic inner product functions of ViennaCL
+#include "viennacl/linalg/inner_prod.hpp"
+
+//include the generic norm functions of ViennaCL
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+
+
+#ifdef VIENNACL_WITH_CUDA
+
+
+// IxAMAX
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDAiSamax(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ ViennaCLInt *index,
+ float *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+
+ *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDAiDamax(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ ViennaCLInt *index,
+ double *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+
+ *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+ return ViennaCLSuccess;
+}
+
+
+
+// xASUM
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASasum(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ float *alpha,
+ float *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+
+ *alpha = viennacl::linalg::norm_1(v1);
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADasum(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ double *alpha,
+ double *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+
+ *alpha = viennacl::linalg::norm_1(v1);
+ return ViennaCLSuccess;
+}
+
+
+// xAXPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASaxpy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ float alpha,
+ float *x, ViennaCLInt offx, ViennaCLInt incx,
+ float *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+ viennacl::vector_base<float> v2(y, viennacl::CUDA_MEMORY, n, offy, incy);
+
+ v2 += alpha * v1;
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADaxpy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ double alpha,
+ double *x, ViennaCLInt offx, ViennaCLInt incx,
+ double *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+ viennacl::vector_base<double> v2(y, viennacl::CUDA_MEMORY, n, offy, incy);
+
+ v2 += alpha * v1;
+ return ViennaCLSuccess;
+}
+
+
+// xCOPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDAScopy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ float *x, ViennaCLInt offx, ViennaCLInt incx,
+ float *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+ viennacl::vector_base<float> v2(y, viennacl::CUDA_MEMORY, n, offy, incy);
+
+ v2 = v1;
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADcopy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ double *x, ViennaCLInt offx, ViennaCLInt incx,
+ double *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+ viennacl::vector_base<double> v2(y, viennacl::CUDA_MEMORY, n, offy, incy);
+
+ v2 = v1;
+ return ViennaCLSuccess;
+}
+
+// xDOT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASdot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ float *alpha,
+ float *x, ViennaCLInt offx, ViennaCLInt incx,
+ float *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+ viennacl::vector_base<float> v2(y, viennacl::CUDA_MEMORY, n, offy, incy);
+
+ *alpha = viennacl::linalg::inner_prod(v1, v2);
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADdot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ double *alpha,
+ double *x, ViennaCLInt offx, ViennaCLInt incx,
+ double *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+ viennacl::vector_base<double> v2(y, viennacl::CUDA_MEMORY, n, offy, incy);
+
+ *alpha = viennacl::linalg::inner_prod(v1, v2);
+ return ViennaCLSuccess;
+}
+
+// xNRM2
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASnrm2(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ float *alpha,
+ float *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+
+ *alpha = viennacl::linalg::norm_2(v1);
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADnrm2(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ double *alpha,
+ double *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+
+ *alpha = viennacl::linalg::norm_2(v1);
+ return ViennaCLSuccess;
+}
+
+
+
+// xROT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASrot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ float *x, ViennaCLInt offx, ViennaCLInt incx,
+ float *y, ViennaCLInt offy, ViennaCLInt incy,
+ float c, float s)
+{
+ viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+ viennacl::vector_base<float> v2(y, viennacl::CUDA_MEMORY, n, offy, incy);
+
+ viennacl::linalg::plane_rotation(v1, v2, c, s);
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADrot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ double *x, ViennaCLInt offx, ViennaCLInt incx,
+ double *y, ViennaCLInt offy, ViennaCLInt incy,
+ double c, double s)
+{
+ viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+ viennacl::vector_base<double> v2(y, viennacl::CUDA_MEMORY, n, offy, incy);
+
+ viennacl::linalg::plane_rotation(v1, v2, c, s);
+ return ViennaCLSuccess;
+}
+
+
+
+// xSCAL
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASscal(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ float alpha,
+ float *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+
+ v1 *= alpha;
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADscal(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ double alpha,
+ double *x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+
+ v1 *= alpha;
+ return ViennaCLSuccess;
+}
+
+
+// xSWAP
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDASswap(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ float *x, ViennaCLInt offx, ViennaCLInt incx,
+ float *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ viennacl::vector_base<float> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+ viennacl::vector_base<float> v2(y, viennacl::CUDA_MEMORY, n, offy, incy);
+
+ viennacl::swap(v1, v2);
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLCUDADswap(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ double *x, ViennaCLInt offx, ViennaCLInt incx,
+ double *y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ viennacl::vector_base<double> v1(x, viennacl::CUDA_MEMORY, n, offx, incx);
+ viennacl::vector_base<double> v2(y, viennacl::CUDA_MEMORY, n, offy, incy);
+
+ viennacl::swap(v1, v2);
+ return ViennaCLSuccess;
+}
+#endif
+
+
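Note on the CUDA entry points above: each function wraps the caller's raw device pointers in a lightweight viennacl::vector_base proxy (no data is copied) and forwards to the corresponding ViennaCL operation, so the file amounts to a flat, BLAS-1-style C API over CUDA memory. A minimal usage sketch follows; it is illustrative only (not part of this diff) and assumes that passing a null ViennaCLBackend handle is acceptable, since the CUDA entry points ignore the backend argument.

// Illustrative sketch only -- not part of the commit. Assumes CUDA runtime + libviennacl headers.
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>
#include "viennacl.hpp"

int main()
{
  const ViennaCLInt n = 1024;
  std::vector<double> hx(n, 1.0), hy(n, 2.0);

  double *dx = 0, *dy = 0;
  cudaMalloc(&dx, n * sizeof(double));
  cudaMalloc(&dy, n * sizeof(double));
  cudaMemcpy(dx, hx.data(), n * sizeof(double), cudaMemcpyHostToDevice);
  cudaMemcpy(dy, hy.data(), n * sizeof(double), cudaMemcpyHostToDevice);

  ViennaCLCUDADaxpy(NULL, n, 3.0, dx, 0, 1, dy, 0, 1);   // y <- 3*x + y, in place on the device buffers

  double nrm = 0.0;
  ViennaCLCUDADnrm2(NULL, n, &nrm, dy, 0, 1);            // nrm = ||y||_2, returned through a host pointer
  std::printf("norm = %g\n", nrm);

  cudaFree(dx);
  cudaFree(dy);
  return 0;
}
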
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas1_host.cpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas1_host.cpp b/native-viennaCL/src/main/cpp/libviennacl/src/blas1_host.cpp
new file mode 100644
index 0000000..07a5097
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas1_host.cpp
@@ -0,0 +1,293 @@
+/* =========================================================================
+ Copyright (c) 2010-2014, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+//include the generic inner product functions of ViennaCL
+#include "viennacl/linalg/inner_prod.hpp"
+
+//include the generic norm functions of ViennaCL
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+
+
+// IxAMAX
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostiSamax(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ ViennaCLInt *index,
+ float *x, ViennaCLInt offx, int incx)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+ *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostiDamax(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ ViennaCLInt *index,
+ double *x, ViennaCLInt offx, int incx)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+ *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+ return ViennaCLSuccess;
+}
+
+
+
+// xASUM
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSasum(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ float *alpha,
+ float *x, ViennaCLInt offx, int incx)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+ *alpha = viennacl::linalg::norm_1(v1);
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDasum(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ double *alpha,
+ double *x, ViennaCLInt offx, int incx)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+ *alpha = viennacl::linalg::norm_1(v1);
+ return ViennaCLSuccess;
+}
+
+
+
+// xAXPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSaxpy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ float alpha,
+ float *x, ViennaCLInt offx, int incx,
+ float *y, ViennaCLInt offy, int incy)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+ v2 += alpha * v1;
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDaxpy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ double alpha,
+ double *x, ViennaCLInt offx, int incx,
+ double *y, ViennaCLInt offy, int incy)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+ v2 += alpha * v1;
+ return ViennaCLSuccess;
+}
+
+
+// xCOPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostScopy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ float *x, ViennaCLInt offx, int incx,
+ float *y, ViennaCLInt offy, int incy)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+ v2 = v1;
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDcopy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ double *x, ViennaCLInt offx, int incx,
+ double *y, ViennaCLInt offy, int incy)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+ v2 = v1;
+ return ViennaCLSuccess;
+}
+
+// xDOT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSdot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ float *alpha,
+ float *x, ViennaCLInt offx, int incx,
+ float *y, ViennaCLInt offy, int incy)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+ *alpha = viennacl::linalg::inner_prod(v1, v2);
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDdot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ double *alpha,
+ double *x, ViennaCLInt offx, int incx,
+ double *y, ViennaCLInt offy, int incy)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+ *alpha = viennacl::linalg::inner_prod(v1, v2);
+ return ViennaCLSuccess;
+}
+
+// xNRM2
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSnrm2(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ float *alpha,
+ float *x, ViennaCLInt offx, int incx)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+ *alpha = viennacl::linalg::norm_2(v1);
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDnrm2(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ double *alpha,
+ double *x, ViennaCLInt offx, int incx)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+ *alpha = viennacl::linalg::norm_2(v1);
+ return ViennaCLSuccess;
+}
+
+
+// xROT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSrot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ float *x, ViennaCLInt offx, int incx,
+ float *y, ViennaCLInt offy, int incy,
+ float c, float s)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+ viennacl::linalg::plane_rotation(v1, v2, c, s);
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDrot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ double *x, ViennaCLInt offx, int incx,
+ double *y, ViennaCLInt offy, int incy,
+ double c, double s)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+ viennacl::linalg::plane_rotation(v1, v2, c, s);
+ return ViennaCLSuccess;
+}
+
+
+
+// xSCAL
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSscal(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ float alpha,
+ float *x, ViennaCLInt offx, int incx)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+ v1 *= alpha;
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDscal(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ double alpha,
+ double *x, ViennaCLInt offx, int incx)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+ v1 *= alpha;
+ return ViennaCLSuccess;
+}
+
+// xSWAP
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSswap(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ float *x, ViennaCLInt offx, int incx,
+ float *y, ViennaCLInt offy, int incy)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+ viennacl::swap(v1, v2);
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDswap(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ double *x, ViennaCLInt offx, int incx,
+ double *y, ViennaCLInt offy, int incy)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+ viennacl::swap(v1, v2);
+ return ViennaCLSuccess;
+}
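
The host variants above expose the same API over plain CPU arrays (viennacl::MAIN_MEMORY); the offx/incx pairs select a starting element and stride inside the caller's buffer, following BLAS conventions. A short illustrative sketch (not part of this diff; it again assumes a null backend handle is acceptable because the host entry points ignore it):

// Illustrative sketch only -- not part of the commit.
#include <cstdio>
#include <vector>
#include "viennacl.hpp"

int main()
{
  std::vector<float> x = {1, 2, 3, 4, 5, 6};
  std::vector<float> y = {1, 1, 1};

  float result = 0.0f;
  // Dot product of every second element of x (offx = 0, incx = 2 -> x[0], x[2], x[4]) with y.
  ViennaCLHostSdot(NULL, 3, &result,
                   x.data(), 0, 2,
                   y.data(), 0, 1);
  std::printf("dot = %g\n", result);   // 1*1 + 3*1 + 5*1 = 9
  return 0;
}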
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas1_host.cu
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas1_host.cu b/native-viennaCL/src/main/cpp/libviennacl/src/blas1_host.cu
new file mode 100644
index 0000000..07a5097
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas1_host.cu
@@ -0,0 +1,293 @@
+/* =========================================================================
+ Copyright (c) 2010-2014, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+//include the generic inner product functions of ViennaCL
+#include "viennacl/linalg/inner_prod.hpp"
+
+//include the generic norm functions of ViennaCL
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+
+
+// IxAMAX
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostiSamax(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ ViennaCLInt *index,
+ float *x, ViennaCLInt offx, int incx)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+ *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostiDamax(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ ViennaCLInt *index,
+ double *x, ViennaCLInt offx, int incx)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+ *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+ return ViennaCLSuccess;
+}
+
+
+
+// xASUM
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSasum(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ float *alpha,
+ float *x, ViennaCLInt offx, int incx)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+ *alpha = viennacl::linalg::norm_1(v1);
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDasum(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ double *alpha,
+ double *x, ViennaCLInt offx, int incx)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+ *alpha = viennacl::linalg::norm_1(v1);
+ return ViennaCLSuccess;
+}
+
+
+
+// xAXPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSaxpy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ float alpha,
+ float *x, ViennaCLInt offx, int incx,
+ float *y, ViennaCLInt offy, int incy)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+ v2 += alpha * v1;
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDaxpy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ double alpha,
+ double *x, ViennaCLInt offx, int incx,
+ double *y, ViennaCLInt offy, int incy)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+ v2 += alpha * v1;
+ return ViennaCLSuccess;
+}
+
+
+// xCOPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostScopy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ float *x, ViennaCLInt offx, int incx,
+ float *y, ViennaCLInt offy, int incy)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+ v2 = v1;
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDcopy(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ double *x, ViennaCLInt offx, int incx,
+ double *y, ViennaCLInt offy, int incy)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+ v2 = v1;
+ return ViennaCLSuccess;
+}
+
+// xDOT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSdot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ float *alpha,
+ float *x, ViennaCLInt offx, int incx,
+ float *y, ViennaCLInt offy, int incy)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+ *alpha = viennacl::linalg::inner_prod(v1, v2);
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDdot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ double *alpha,
+ double *x, ViennaCLInt offx, int incx,
+ double *y, ViennaCLInt offy, int incy)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+ *alpha = viennacl::linalg::inner_prod(v1, v2);
+ return ViennaCLSuccess;
+}
+
+// xNRM2
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSnrm2(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ float *alpha,
+ float *x, ViennaCLInt offx, int incx)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+ *alpha = viennacl::linalg::norm_2(v1);
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDnrm2(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ double *alpha,
+ double *x, ViennaCLInt offx, int incx)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+ *alpha = viennacl::linalg::norm_2(v1);
+ return ViennaCLSuccess;
+}
+
+
+// xROT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSrot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ float *x, ViennaCLInt offx, int incx,
+ float *y, ViennaCLInt offy, int incy,
+ float c, float s)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+ viennacl::linalg::plane_rotation(v1, v2, c, s);
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDrot(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ double *x, ViennaCLInt offx, int incx,
+ double *y, ViennaCLInt offy, int incy,
+ double c, double s)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+ viennacl::linalg::plane_rotation(v1, v2, c, s);
+ return ViennaCLSuccess;
+}
+
+
+
+// xSCAL
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSscal(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ float alpha,
+ float *x, ViennaCLInt offx, int incx)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+ v1 *= alpha;
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDscal(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ double alpha,
+ double *x, ViennaCLInt offx, int incx)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+
+ v1 *= alpha;
+ return ViennaCLSuccess;
+}
+
+// xSWAP
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostSswap(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ float *x, ViennaCLInt offx, int incx,
+ float *y, ViennaCLInt offy, int incy)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::vector_base<float> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+ viennacl::swap(v1, v2);
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLHostDswap(ViennaCLBackend /*backend*/, ViennaCLInt n,
+ double *x, ViennaCLInt offx, int incx,
+ double *y, ViennaCLInt offy, int incy)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, viennacl::MAIN_MEMORY, size_type(n), size_type(offx), difference_type(incx));
+ viennacl::vector_base<double> v2(y, viennacl::MAIN_MEMORY, size_type(n), size_type(offy), difference_type(incy));
+
+ viennacl::swap(v1, v2);
+ return ViennaCLSuccess;
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/blas1_opencl.cpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/blas1_opencl.cpp b/native-viennaCL/src/main/cpp/libviennacl/src/blas1_opencl.cpp
new file mode 100644
index 0000000..617b128
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/blas1_opencl.cpp
@@ -0,0 +1,297 @@
+/* =========================================================================
+ Copyright (c) 2010-2014, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+// include necessary system headers
+#include <iostream>
+
+#include "viennacl.hpp"
+#include "viennacl_private.hpp"
+
+//include basic scalar and vector types of ViennaCL
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+
+//include the generic inner product functions of ViennaCL
+#include "viennacl/linalg/inner_prod.hpp"
+
+//include the generic norm functions of ViennaCL
+#include "viennacl/linalg/norm_1.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_inf.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+
+// IxAMAX
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLiSamax(ViennaCLBackend backend, ViennaCLInt n,
+ ViennaCLInt *index,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLiDamax(ViennaCLBackend backend, ViennaCLInt n,
+ ViennaCLInt *index,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ *index = static_cast<ViennaCLInt>(viennacl::linalg::index_norm_inf(v1));
+ return ViennaCLSuccess;
+}
+
+
+
+
+// xASUM
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSasum(ViennaCLBackend backend, ViennaCLInt n,
+ float *alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ *alpha = viennacl::linalg::norm_1(v1);
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDasum(ViennaCLBackend backend, ViennaCLInt n,
+ double *alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ *alpha = viennacl::linalg::norm_1(v1);
+ return ViennaCLSuccess;
+}
+
+
+
+// xAXPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSaxpy(ViennaCLBackend backend, ViennaCLInt n,
+ float alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::vector_base<float> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ v2 += alpha * v1;
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDaxpy(ViennaCLBackend backend, ViennaCLInt n,
+ double alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::vector_base<double> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ v2 += alpha * v1;
+ return ViennaCLSuccess;
+}
+
+
+// xCOPY
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLScopy(ViennaCLBackend backend, ViennaCLInt n,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::vector_base<float> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ v2 = v1;
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDcopy(ViennaCLBackend backend, ViennaCLInt n,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::vector_base<double> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ v2 = v1;
+ return ViennaCLSuccess;
+}
+
+// xDOT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSdot(ViennaCLBackend backend, ViennaCLInt n,
+ float *alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::vector_base<float> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ *alpha = viennacl::linalg::inner_prod(v1, v2);
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDdot(ViennaCLBackend backend, ViennaCLInt n,
+ double *alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::vector_base<double> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ *alpha = viennacl::linalg::inner_prod(v1, v2);
+ return ViennaCLSuccess;
+}
+
+
+// xNRM2
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSnrm2(ViennaCLBackend backend, ViennaCLInt n,
+ float *alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ *alpha = viennacl::linalg::norm_2(v1);
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDnrm2(ViennaCLBackend backend, ViennaCLInt n,
+ double *alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ *alpha = viennacl::linalg::norm_2(v1);
+ return ViennaCLSuccess;
+}
+
+
+// xROT
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSrot(ViennaCLBackend backend, ViennaCLInt n,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+ float c, float s)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::vector_base<float> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ viennacl::linalg::plane_rotation(v1, v2, c, s);
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDrot(ViennaCLBackend backend, ViennaCLInt n,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy,
+ double c, double s)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::vector_base<double> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ viennacl::linalg::plane_rotation(v1, v2, c, s);
+ return ViennaCLSuccess;
+}
+
+
+
+// xSCAL
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSscal(ViennaCLBackend backend, ViennaCLInt n,
+ float alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ v1 *= alpha;
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDscal(ViennaCLBackend backend, ViennaCLInt n,
+ double alpha,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ v1 *= alpha;
+ return ViennaCLSuccess;
+}
+
+// xSWAP
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLSswap(ViennaCLBackend backend, ViennaCLInt n,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ typedef viennacl::vector_base<float>::size_type size_type;
+ typedef viennacl::vector_base<float>::size_type difference_type;
+ viennacl::vector_base<float> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::vector_base<float> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ viennacl::swap(v1, v2);
+ return ViennaCLSuccess;
+}
+
+VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLOpenCLDswap(ViennaCLBackend backend, ViennaCLInt n,
+ cl_mem x, ViennaCLInt offx, ViennaCLInt incx,
+ cl_mem y, ViennaCLInt offy, ViennaCLInt incy)
+{
+ typedef viennacl::vector_base<double>::size_type size_type;
+ typedef viennacl::vector_base<double>::size_type difference_type;
+ viennacl::vector_base<double> v1(x, size_type(n), size_type(offx), difference_type(incx), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+ viennacl::vector_base<double> v2(y, size_type(n), size_type(offy), difference_type(incy), viennacl::ocl::get_context(backend->opencl_backend.context_id));
+
+ viennacl::swap(v1, v2);
+ return ViennaCLSuccess;
+}
+#endif
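
The OpenCL variants take cl_mem handles instead of raw pointers and pick the target ViennaCL OpenCL context from backend->opencl_backend.context_id. An illustrative sketch (not part of this diff) follows; it assumes the backend management helpers ViennaCLBackendCreate / ViennaCLBackendSetOpenCLContextID / ViennaCLBackendDestroy declared elsewhere in viennacl.hpp, and ViennaCL's default OpenCL context with id 0.

// Illustrative sketch only -- not part of the commit.
#include <cstdio>
#include <vector>
#include "viennacl.hpp"
#include "viennacl/ocl/backend.hpp"

int main()
{
  const ViennaCLInt n = 16;
  std::vector<float> hx(n, 2.0f);

  // Create a cl_mem buffer inside ViennaCL's OpenCL context 0 and fill it with host data.
  viennacl::ocl::context & vcl_ctx = viennacl::ocl::get_context(0);
  cl_int err = CL_SUCCESS;
  cl_mem x = clCreateBuffer(vcl_ctx.handle().get(),
                            CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                            n * sizeof(float), hx.data(), &err);

  // Point the shim at context 0 and compute the 1-norm of x.
  ViennaCLBackend backend;
  ViennaCLBackendCreate(&backend);
  ViennaCLBackendSetOpenCLContextID(backend, 0);

  float asum = 0.0f;
  ViennaCLOpenCLSasum(backend, n, &asum, x, 0, 1);
  std::printf("asum = %g\n", asum);   // 16 * |2.0| = 32

  ViennaCLBackendDestroy(&backend);
  clReleaseMemObject(x);
  return 0;
}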
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/scalar.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/scalar.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/scalar.hpp
new file mode 100644
index 0000000..dcd39ad
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/scalar.hpp
@@ -0,0 +1,283 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_SCALAR_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_SCALAR_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/scalar.hpp
+ * @brief OpenCL kernel file for scalar operations */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+//////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+/** @brief Enumeration for the scalar type in asbs-like operations (the scalar analogue of the avbv vector kernels) */
+enum asbs_scalar_type
+{
+ VIENNACL_ASBS_NONE = 0, // scalar does not exist/contribute
+ VIENNACL_ASBS_CPU,
+ VIENNACL_ASBS_GPU
+};
+
+/** @brief Configuration struct for generating OpenCL kernels for linear combinations of viennacl::scalar<> objects */
+struct asbs_config
+{
+ asbs_config() : with_stride_and_range(true), a(VIENNACL_ASBS_CPU), b(VIENNACL_ASBS_NONE) {}
+
+ bool with_stride_and_range;
+ std::string assign_op;
+ asbs_scalar_type a;
+ asbs_scalar_type b;
+};
+
+// just returns the assignment string
+template<typename StringT>
+void generate_asbs_impl3(StringT & source, char sign_a, char sign_b, asbs_config const & cfg, bool mult_alpha, bool mult_beta)
+{
+ source.append(" *s1 "); source.append(cfg.assign_op); source.append(1, sign_a); source.append(" *s2 ");
+ if (mult_alpha)
+ source.append("* alpha ");
+ else
+ source.append("/ alpha ");
+ if (cfg.b != VIENNACL_ASBS_NONE)
+ {
+ source.append(1, sign_b); source.append(" *s3 ");
+ if (mult_beta)
+ source.append("* beta");
+ else
+ source.append("/ beta");
+ }
+ source.append("; \n");
+}
+
+template<typename StringT>
+void generate_asbs_impl2(StringT & source, char sign_a, char sign_b, asbs_config const & cfg)
+{
+ source.append(" if (options2 & (1 << 1)) { \n");
+ if (cfg.b != VIENNACL_ASBS_NONE)
+ {
+ source.append(" if (options3 & (1 << 1)) \n");
+ generate_asbs_impl3(source, sign_a, sign_b, cfg, false, false);
+ source.append(" else \n");
+ generate_asbs_impl3(source, sign_a, sign_b, cfg, false, true);
+ }
+ else
+ generate_asbs_impl3(source, sign_a, sign_b, cfg, false, true);
+ source.append(" } else { \n");
+ if (cfg.b != VIENNACL_ASBS_NONE)
+ {
+ source.append(" if (options3 & (1 << 1)) \n");
+ generate_asbs_impl3(source, sign_a, sign_b, cfg, true, false);
+ source.append(" else \n");
+ generate_asbs_impl3(source, sign_a, sign_b, cfg, true, true);
+ }
+ else
+ generate_asbs_impl3(source, sign_a, sign_b, cfg, true, true);
+ source.append(" } \n");
+
+}
+
+template<typename StringT>
+void generate_asbs_impl(StringT & source, std::string const & numeric_string, asbs_config const & cfg)
+{
+ source.append("__kernel void as");
+ if (cfg.b != VIENNACL_ASBS_NONE)
+ source.append("bs");
+ if (cfg.assign_op != "=")
+ source.append("_s");
+
+ if (cfg.a == VIENNACL_ASBS_CPU)
+ source.append("_cpu");
+ else if (cfg.a == VIENNACL_ASBS_GPU)
+ source.append("_gpu");
+
+ if (cfg.b == VIENNACL_ASBS_CPU)
+ source.append("_cpu");
+ else if (cfg.b == VIENNACL_ASBS_GPU)
+ source.append("_gpu");
+ source.append("( \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * s1, \n");
+ source.append(" \n");
+ if (cfg.a == VIENNACL_ASBS_CPU)
+ {
+ source.append(" "); source.append(numeric_string); source.append(" fac2, \n");
+ }
+ else if (cfg.a == VIENNACL_ASBS_GPU)
+ {
+ source.append(" __global "); source.append(numeric_string); source.append(" * fac2, \n");
+ }
+ source.append(" unsigned int options2, \n"); // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
+ source.append(" __global const "); source.append(numeric_string); source.append(" * s2");
+
+ if (cfg.b != VIENNACL_ASBS_NONE)
+ {
+ source.append(", \n\n");
+ if (cfg.b == VIENNACL_ASBS_CPU)
+ {
+ source.append(" "); source.append(numeric_string); source.append(" fac3, \n");
+ }
+ else if (cfg.b == VIENNACL_ASBS_GPU)
+ {
+ source.append(" __global "); source.append(numeric_string); source.append(" * fac3, \n");
+ }
+ source.append(" unsigned int options3, \n"); // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
+ source.append(" __global const "); source.append(numeric_string); source.append(" * s3");
+ }
+ source.append(") \n{ \n");
+
+ if (cfg.a == VIENNACL_ASBS_CPU)
+ {
+ source.append(" "); source.append(numeric_string); source.append(" alpha = fac2; \n");
+ }
+ else if (cfg.a == VIENNACL_ASBS_GPU)
+ {
+ source.append(" "); source.append(numeric_string); source.append(" alpha = fac2[0]; \n");
+ }
+ source.append(" \n");
+
+ if (cfg.b == VIENNACL_ASBS_CPU)
+ {
+ source.append(" "); source.append(numeric_string); source.append(" beta = fac3; \n");
+ }
+ else if (cfg.b == VIENNACL_ASBS_GPU)
+ {
+ source.append(" "); source.append(numeric_string); source.append(" beta = fac3[0]; \n");
+ }
+
+ source.append(" if (options2 & (1 << 0)) { \n");
+ if (cfg.b != VIENNACL_ASBS_NONE)
+ {
+ source.append(" if (options3 & (1 << 0)) { \n");
+ generate_asbs_impl2(source, '-', '-', cfg);
+ source.append(" } else { \n");
+ generate_asbs_impl2(source, '-', '+', cfg);
+ source.append(" } \n");
+ }
+ else
+ generate_asbs_impl2(source, '-', '+', cfg);
+ source.append(" } else { \n");
+ if (cfg.b != VIENNACL_ASBS_NONE)
+ {
+ source.append(" if (options3 & (1 << 0)) { \n");
+ generate_asbs_impl2(source, '+', '-', cfg);
+ source.append(" } else { \n");
+ generate_asbs_impl2(source, '+', '+', cfg);
+ source.append(" } \n");
+ }
+ else
+ generate_asbs_impl2(source, '+', '+', cfg);
+
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+template<typename StringT>
+void generate_asbs(StringT & source, std::string const & numeric_string)
+{
+ asbs_config cfg;
+ cfg.assign_op = "=";
+ cfg.with_stride_and_range = true;
+
+ // as
+ cfg.b = VIENNACL_ASBS_NONE; cfg.a = VIENNACL_ASBS_CPU; generate_asbs_impl(source, numeric_string, cfg);
+ cfg.b = VIENNACL_ASBS_NONE; cfg.a = VIENNACL_ASBS_GPU; generate_asbs_impl(source, numeric_string, cfg);
+
+ // asbs
+ cfg.a = VIENNACL_ASBS_CPU; cfg.b = VIENNACL_ASBS_CPU; generate_asbs_impl(source, numeric_string, cfg);
+ cfg.a = VIENNACL_ASBS_CPU; cfg.b = VIENNACL_ASBS_GPU; generate_asbs_impl(source, numeric_string, cfg);
+ cfg.a = VIENNACL_ASBS_GPU; cfg.b = VIENNACL_ASBS_CPU; generate_asbs_impl(source, numeric_string, cfg);
+ cfg.a = VIENNACL_ASBS_GPU; cfg.b = VIENNACL_ASBS_GPU; generate_asbs_impl(source, numeric_string, cfg);
+
+  // asbs_s: same scalar combinations, but with in-place accumulation (assign_op "+=")
+ cfg.assign_op = "+=";
+
+ cfg.a = VIENNACL_ASBS_CPU; cfg.b = VIENNACL_ASBS_CPU; generate_asbs_impl(source, numeric_string, cfg);
+ cfg.a = VIENNACL_ASBS_CPU; cfg.b = VIENNACL_ASBS_GPU; generate_asbs_impl(source, numeric_string, cfg);
+ cfg.a = VIENNACL_ASBS_GPU; cfg.b = VIENNACL_ASBS_CPU; generate_asbs_impl(source, numeric_string, cfg);
+ cfg.a = VIENNACL_ASBS_GPU; cfg.b = VIENNACL_ASBS_GPU; generate_asbs_impl(source, numeric_string, cfg);
+}
+
+template<typename StringT>
+void generate_scalar_swap(StringT & source, std::string const & numeric_string)
+{
+ source.append("__kernel void swap( \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * s1, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * s2) \n");
+ source.append("{ \n");
+ source.append(" "); source.append(numeric_string); source.append(" tmp = *s2; \n");
+ source.append(" *s2 = *s1; \n");
+ source.append(" *s1 = tmp; \n");
+ source.append("} \n");
+}
+
+//////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for operations involving viennacl::scalar<>, but not viennacl::vector<> or viennacl::matrix<>. */
+template<typename NumericT>
+struct scalar
+{
+ static std::string program_name()
+ {
+ return viennacl::ocl::type_to_string<NumericT>::apply() + "_scalar";
+ }
+
+ static void init(viennacl::ocl::context & ctx)
+ {
+ static std::map<cl_context, bool> init_done;
+ if (!init_done[ctx.handle().get()])
+ {
+ viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+ std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+ std::string source;
+ source.reserve(8192);
+
+ viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+ // fully parametrized kernels:
+ generate_asbs(source, numeric_string);
+ generate_scalar_swap(source, numeric_string);
+
+
+ std::string prog_name = program_name();
+ #ifdef VIENNACL_BUILD_INFO
+ std::cout << "Creating program " << prog_name << std::endl;
+ #endif
+ ctx.add_program(source, prog_name);
+ init_done[ctx.handle().get()] = true;
+ } //if
+ } //init
+};
+
+} // namespace kernels
+} // namespace opencl
+} // namespace linalg
+} // namespace viennacl
+#endif
+
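The kernels above are emitted as plain OpenCL C source strings and compiled once per context the first time init() runs; callers then fetch individual kernels by the names assembled in Part 1 (e.g. as_cpu, asbs_cpu_gpu, swap). A hedged lookup sketch (not part of this diff), assuming the context::get_kernel(program, kernel) accessor used elsewhere in ViennaCL's OpenCL backends:

// Illustrative sketch only -- not part of the commit.
#include "viennacl/ocl/backend.hpp"
#include "viennacl/linalg/opencl/kernels/scalar.hpp"

void scalar_swap_kernel_demo()
{
  viennacl::ocl::context & ctx = viennacl::ocl::current_context();

  // Builds and caches the "<type>_scalar" program for this context on the first call only.
  viennacl::linalg::opencl::kernels::scalar<float>::init(ctx);

  // Fetch the kernel produced by generate_scalar_swap(); in real code it would be
  // launched via viennacl::ocl::enqueue(...) with two scalar handles as arguments.
  viennacl::ocl::kernel & swap_kernel =
      ctx.get_kernel(viennacl::linalg::opencl::kernels::scalar<float>::program_name(), "swap");
  (void)swap_kernel;
}
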
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/scan.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/scan.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/scan.hpp
new file mode 100644
index 0000000..9626d2d
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/scan.hpp
@@ -0,0 +1,194 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_SCAN_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_SCAN_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/scan.hpp
+ * @brief OpenCL kernel file for scan operations. To be merged back to vector operations. */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+
+template <typename StringType>
+void generate_scan_kernel_1(StringType & source, std::string const & numeric_string)
+{
+ source.append("__kernel void scan_1(__global "); source.append(numeric_string); source.append("* X, \n");
+ source.append(" unsigned int startX, \n");
+ source.append(" unsigned int incX, \n");
+ source.append(" unsigned int sizeX, \n");
+
+ source.append(" __global "); source.append(numeric_string); source.append("* Y, \n");
+ source.append(" unsigned int startY, \n");
+ source.append(" unsigned int incY, \n");
+
+ source.append(" unsigned int scan_offset, \n"); // 0 for inclusive scan, 1 for exclusive scan
+ source.append(" __global "); source.append(numeric_string); source.append("* carries) { \n");
+
+ source.append(" __local "); source.append(numeric_string); source.append(" shared_buffer[256]; \n");
+ source.append(" "); source.append(numeric_string); source.append(" my_value; \n");
+
+ source.append(" unsigned int work_per_thread = (sizeX - 1) / get_global_size(0) + 1; \n");
+ source.append(" unsigned int block_start = work_per_thread * get_local_size(0) * get_group_id(0); \n");
+ source.append(" unsigned int block_stop = work_per_thread * get_local_size(0) * (get_group_id(0) + 1); \n");
+ source.append(" unsigned int block_offset = 0; \n");
+
+ // run scan on each section:
+ source.append(" for (unsigned int i = block_start + get_local_id(0); i < block_stop; i += get_local_size(0)) { \n");
+
+ // load data
+ source.append(" my_value = (i < sizeX) ? X[i * incX + startX] : 0; \n");
+
+ // inclusive scan in shared buffer:
+ source.append(" for(unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" shared_buffer[get_local_id(0)] = my_value; \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) >= stride) \n");
+ source.append(" my_value += shared_buffer[get_local_id(0) - stride]; \n");
+ source.append(" } \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" shared_buffer[get_local_id(0)] = my_value; \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ // write to output array:
+ source.append(" if (scan_offset > 0) \n");
+ source.append(" my_value = (get_local_id(0) > 0) ? shared_buffer[get_local_id(0) - 1] : 0; \n");
+
+ source.append(" if (i < sizeX) \n");
+ source.append(" Y[i * incY + startY] = block_offset + my_value; \n");
+
+ source.append(" block_offset += shared_buffer[get_local_size(0)-1]; \n");
+ source.append(" } \n");
+
+ // write carry:
+ source.append(" if (get_local_id(0) == 0) carries[get_group_id(0)] = block_offset; \n");
+
+ source.append("} \n");
+}
+
+template <typename StringType>
+void generate_scan_kernel_2(StringType & source, std::string const & numeric_string)
+{
+ source.append("__kernel void scan_2(__global "); source.append(numeric_string); source.append("* carries) { \n");
+
+ source.append(" __local "); source.append(numeric_string); source.append(" shared_buffer[256]; \n"); //section size
+
+ // load data
+ source.append(" "); source.append(numeric_string); source.append(" my_carry = carries[get_local_id(0)]; \n");
+
+ // scan in shared buffer:
+ source.append(" for(unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" shared_buffer[get_local_id(0)] = my_carry; \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) >= stride) \n");
+ source.append(" my_carry += shared_buffer[get_local_id(0) - stride]; \n");
+ source.append(" } \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" shared_buffer[get_local_id(0)] = my_carry; \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ // write to output array:
+ source.append(" carries[get_local_id(0)] = (get_local_id(0) > 0) ? shared_buffer[get_local_id(0) - 1] : 0; \n");
+
+ source.append("} \n");
+}
+
+template <typename StringType>
+void generate_scan_kernel_3(StringType & source, std::string const & numeric_string)
+{
+ source.append("__kernel void scan_3(__global "); source.append(numeric_string); source.append(" * Y, \n");
+ source.append(" unsigned int startY, \n");
+ source.append(" unsigned int incY, \n");
+ source.append(" unsigned int sizeY, \n");
+
+ source.append(" __global "); source.append(numeric_string); source.append("* carries) { \n");
+
+ source.append(" unsigned int work_per_thread = (sizeY - 1) / get_global_size(0) + 1; \n");
+ source.append(" unsigned int block_start = work_per_thread * get_local_size(0) * get_group_id(0); \n");
+ source.append(" unsigned int block_stop = work_per_thread * get_local_size(0) * (get_group_id(0) + 1); \n");
+
+ source.append(" __local "); source.append(numeric_string); source.append(" shared_offset; \n");
+
+ source.append(" if (get_local_id(0) == 0) shared_offset = carries[get_group_id(0)]; \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ source.append(" for (unsigned int i = block_start + get_local_id(0); i < block_stop; i += get_local_size(0)) \n");
+ source.append(" if (i < sizeY) \n");
+ source.append(" Y[i * incY + startY] += shared_offset; \n");
+
+ source.append("} \n");
+}
+
+
+
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for inclusive and exclusive scan operations. */
+template<typename NumericT>
+struct scan
+{
+ static std::string program_name()
+ {
+ return viennacl::ocl::type_to_string<NumericT>::apply() + "_scan";
+ }
+
+ static void init(viennacl::ocl::context & ctx)
+ {
+ static std::map<cl_context, bool> init_done;
+ if (!init_done[ctx.handle().get()])
+ {
+ viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+ std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+ std::string source;
+ source.reserve(1024);
+
+ viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+ generate_scan_kernel_1(source, numeric_string);
+ generate_scan_kernel_2(source, numeric_string);
+ generate_scan_kernel_3(source, numeric_string);
+
+ std::string prog_name = program_name();
+ #ifdef VIENNACL_BUILD_INFO
+ std::cout << "Creating program " << prog_name << std::endl;
+ #endif
+ ctx.add_program(source, prog_name);
+ init_done[ctx.handle().get()] = true;
+ } //if
+ } //init
+};
+
+} // namespace kernels
+} // namespace opencl
+} // namespace linalg
+} // namespace viennacl
+#endif
+
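The three kernels above implement the usual three-phase parallel prefix sum: scan_1 scans each workgroup's portion of X locally and records the workgroup total in carries, scan_2 turns carries into exclusive offsets, and scan_3 adds each workgroup's offset back onto its partial results. A sequential C++ sketch of the same decomposition, assuming the exclusive-scan case and using illustrative names only (not part of the ViennaCL sources), looks like this:

#include <algorithm>
#include <cstddef>
#include <vector>

// Sequential sketch of the three-phase scan performed by scan_1/scan_2/scan_3.
// "Blocks" play the role of OpenCL workgroups; exclusive-scan case only.
std::vector<double> exclusive_scan_sketch(std::vector<double> const & x, std::size_t block_size)
{
  std::size_t n = x.size();
  std::size_t num_blocks = (n + block_size - 1) / block_size;
  std::vector<double> y(n), carries(num_blocks, 0.0);

  // Phase 1 (scan_1): exclusive scan within each block, remember block totals.
  for (std::size_t b = 0; b < num_blocks; ++b)
  {
    double running = 0.0;
    for (std::size_t i = b * block_size; i < std::min(n, (b + 1) * block_size); ++i)
    {
      y[i] = running;      // value of the scan before adding x[i]
      running += x[i];
    }
    carries[b] = running;  // total of this block
  }

  // Phase 2 (scan_2): exclusive scan over the block totals.
  double offset = 0.0;
  for (std::size_t b = 0; b < num_blocks; ++b)
  {
    double total = carries[b];
    carries[b] = offset;
    offset += total;
  }

  // Phase 3 (scan_3): add each block's offset to its entries.
  for (std::size_t b = 0; b < num_blocks; ++b)
    for (std::size_t i = b * block_size; i < std::min(n, (b + 1) * block_size); ++i)
      y[i] += carries[b];

  return y;
}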
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/sliced_ell_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/sliced_ell_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/sliced_ell_matrix.hpp
new file mode 100644
index 0000000..562cb52
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/sliced_ell_matrix.hpp
@@ -0,0 +1,135 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_SLICED_ELL_MATRIX_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_SLICED_ELL_MATRIX_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+#include "viennacl/linalg/opencl/common.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/sliced_ell_matrix.hpp
+ * @brief OpenCL kernel file for sliced_ell_matrix operations */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+//////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+template<typename StringT>
+void generate_sliced_ell_vec_mul(StringT & source, std::string const & numeric_string, bool with_alpha_beta)
+{
+ if (with_alpha_beta)
+ source.append("__kernel void vec_mul_alpha_beta( \n");
+ else
+ source.append("__kernel void vec_mul( \n");
+ source.append(" __global const unsigned int * columns_per_block, \n");
+ source.append(" __global const unsigned int * column_indices, \n");
+ source.append(" __global const unsigned int * block_start, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * x, \n");
+ source.append(" uint4 layout_x, \n");
+ if (with_alpha_beta) { source.append(" "); source.append(numeric_string); source.append(" alpha, \n"); }
+ source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
+ source.append(" uint4 layout_result, \n");
+ if (with_alpha_beta) { source.append(" "); source.append(numeric_string); source.append(" beta, \n"); }
+ source.append(" unsigned int block_size) \n");
+ source.append("{ \n");
+ source.append(" uint blocks_per_workgroup = get_local_size(0) / block_size; \n");
+ source.append(" uint id_in_block = get_local_id(0) % block_size; \n");
+ source.append(" uint num_blocks = (layout_result.z - 1) / block_size + 1; \n");
+ source.append(" uint global_warp_count = blocks_per_workgroup * get_num_groups(0); \n");
+ source.append(" uint global_warp_id = blocks_per_workgroup * get_group_id(0) + get_local_id(0) / block_size; \n");
+
+ source.append(" for (uint block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count) { \n");
+ source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n");
+
+ source.append(" uint row = block_idx * block_size + id_in_block; \n");
+ source.append(" uint offset = block_start[block_idx]; \n");
+ source.append(" uint num_columns = columns_per_block[block_idx]; \n");
+ source.append(" for (uint item_id = 0; item_id < num_columns; item_id++) { \n");
+ source.append(" uint index = offset + item_id * block_size + id_in_block; \n");
+ source.append(" "); source.append(numeric_string); source.append(" val = elements[index]; \n");
+ source.append(" sum += (val != 0) ? (x[column_indices[index] * layout_x.y + layout_x.x] * val) : 0; \n");
+ source.append(" } \n");
+
+ source.append(" if (row < layout_result.z) \n");
+ if (with_alpha_beta)
+ source.append(" result[row * layout_result.y + layout_result.x] = alpha * sum + ((beta != 0) ? beta * result[row * layout_result.y + layout_result.x] : 0); \n");
+ else
+ source.append(" result[row * layout_result.y + layout_result.x] = sum; \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+
+//////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for sliced_ell_matrix. */
+template<typename NumericT, typename IndexT>
+struct sliced_ell_matrix;
+
+template<typename NumericT>
+struct sliced_ell_matrix<NumericT, unsigned int>
+{
+ static std::string program_name()
+ {
+ return viennacl::ocl::type_to_string<NumericT>::apply() + viennacl::ocl::type_to_string<unsigned int>::apply() + "_sliced_ell_matrix";
+ }
+
+ static void init(viennacl::ocl::context & ctx)
+ {
+ static std::map<cl_context, bool> init_done;
+ if (!init_done[ctx.handle().get()])
+ {
+ viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+ std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+ std::string source;
+ source.reserve(1024);
+
+ viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+ // fully parametrized kernels:
+ generate_sliced_ell_vec_mul(source, numeric_string, true);
+ generate_sliced_ell_vec_mul(source, numeric_string, false);
+
+ std::string prog_name = program_name();
+ #ifdef VIENNACL_BUILD_INFO
+ std::cout << "Creating program " << prog_name << std::endl;
+ #endif
+ ctx.add_program(source, prog_name);
+ init_done[ctx.handle().get()] = true;
+ } //if
+ } //init
+};
+
+} // namespace kernels
+} // namespace opencl
+} // namespace linalg
+} // namespace viennacl
+#endif
+
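The vec_mul kernel above reads the sliced ELLPACK layout: rows are grouped into slices of block_size rows, columns_per_block[b] holds the padded column count of slice b, block_start[b] its offset into elements/column_indices, and entries within a slice are stored column-major so that consecutive threads access consecutive addresses. A sequential C++ reference of the same product, assuming unit strides for x and the result and using illustrative names only, is:

#include <cstddef>
#include <vector>

// Sequential reference for y = A*x with A in sliced ELLPACK format,
// mirroring the index arithmetic of the generated vec_mul kernel.
// Padding entries are stored as explicit zeros.
void sliced_ell_spmv(std::vector<unsigned int> const & columns_per_block,
                     std::vector<unsigned int> const & column_indices,
                     std::vector<unsigned int> const & block_start,
                     std::vector<double> const &       elements,
                     std::vector<double> const &       x,
                     std::vector<double> &             y,
                     unsigned int                      block_size)
{
  unsigned int num_rows = static_cast<unsigned int>(y.size());
  if (num_rows == 0)
    return;
  unsigned int num_blocks = (num_rows - 1) / block_size + 1;

  for (unsigned int block_idx = 0; block_idx < num_blocks; ++block_idx)
    for (unsigned int id_in_block = 0; id_in_block < block_size; ++id_in_block)
    {
      unsigned int row = block_idx * block_size + id_in_block;
      if (row >= num_rows)       // only the last slice can be partial
        break;

      double sum = 0;
      unsigned int offset      = block_start[block_idx];
      unsigned int num_columns = columns_per_block[block_idx];
      for (unsigned int item_id = 0; item_id < num_columns; ++item_id)
      {
        unsigned int index = offset + item_id * block_size + id_in_block;
        double val = elements[index];
        sum += (val != 0) ? val * x[column_indices[index]] : 0;
      }
      y[row] = sum;
    }
}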
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/spai.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/spai.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/spai.hpp
new file mode 100644
index 0000000..19ac991
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/spai.hpp
@@ -0,0 +1,631 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_SPAI_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_SPAI_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/spai.hpp
+ * @brief OpenCL kernel file for sparse approximate inverse operations */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+//////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
+template<typename StringT>
+void generate_spai_assemble_blocks(StringT & source, std::string const & numeric_string)
+{
+ source.append("float get_element(__global const unsigned int * row_indices, \n");
+ source.append(" __global const unsigned int * column_indices, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" unsigned int row, \n");
+ source.append(" unsigned int col) \n");
+ source.append("{ \n");
+ source.append(" unsigned int row_end = row_indices[row+1]; \n");
+ source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i){ \n");
+ source.append(" if (column_indices[i] == col) \n");
+ source.append(" return elements[i]; \n");
+ source.append(" if (column_indices[i] > col) \n");
+ source.append(" return 0; \n");
+ source.append(" } \n");
+ source.append(" return 0; \n");
+ source.append("} \n");
+
+ source.append("void block_assembly(__global const unsigned int * row_indices, \n");
+ source.append(" __global const unsigned int * column_indices, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" __global const unsigned int * matrix_dimensions, \n");
+ source.append(" __global const unsigned int * set_I, \n");
+ source.append(" __global const unsigned int * set_J, \n");
+ source.append(" unsigned int matrix_ind, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * com_A_I_J) \n");
+ source.append("{ \n");
+ source.append(" unsigned int row_n = matrix_dimensions[2*matrix_ind]; \n");
+ source.append(" unsigned int col_n = matrix_dimensions[2*matrix_ind + 1]; \n");
+
+ source.append(" for (unsigned int i = 0; i < col_n; ++i){ \n");
+ //start row index
+ source.append(" for (unsigned int j = 0; j < row_n; j++){ \n");
+ source.append(" com_A_I_J[ i*row_n + j] = get_element(row_indices, column_indices, elements, set_I[j], set_J[i]); \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+ source.append("__kernel void assemble_blocks( \n");
+ source.append(" __global const unsigned int * row_indices, \n");
+ source.append(" __global const unsigned int * column_indices, \n");
+ source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append(" __global const unsigned int * set_I, \n");
+ source.append(" __global const unsigned int * set_J, \n");
+ source.append(" __global const unsigned int * i_ind, \n");
+ source.append(" __global const unsigned int * j_ind, \n");
+ source.append(" __global const unsigned int * block_ind, \n");
+ source.append(" __global const unsigned int * matrix_dimensions, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * com_A_I_J, \n");
+ source.append(" __global unsigned int * g_is_update, \n");
+ source.append(" unsigned int block_elems_num) \n");
+ source.append("{ \n");
+ source.append(" for (unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
+ source.append(" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
+ source.append(" block_assembly(row_indices, column_indices, elements, matrix_dimensions, set_I + i_ind[i], set_J + j_ind[i], i, com_A_I_J + block_ind[i]); \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append(" } \n");
+}
+
+template<typename StringT>
+void generate_spai_block_bv_assembly(StringT & source, std::string const & numeric_string)
+{
+ source.append(" void assemble_bv(__global "); source.append(numeric_string); source.append(" * g_bv_r, __global "); source.append(numeric_string); source.append(" * g_bv, unsigned int col_n){ \n");
+ source.append(" for (unsigned int i = 0; i < col_n; ++i){ \n");
+ source.append(" g_bv_r[i] = g_bv[ i]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ source.append(" void assemble_bv_block(__global "); source.append(numeric_string); source.append(" * g_bv_r, __global "); source.append(numeric_string); source.append(" * g_bv, unsigned int col_n, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * g_bv_u, unsigned int col_n_u) \n");
+ source.append(" { \n");
+ source.append(" assemble_bv(g_bv_r, g_bv, col_n); \n");
+ source.append(" assemble_bv(g_bv_r + col_n, g_bv_u, col_n_u); \n");
+ source.append(" } \n");
+
+ source.append(" __kernel void block_bv_assembly(__global "); source.append(numeric_string); source.append(" * g_bv, \n");
+ source.append(" __global unsigned int * start_bv_ind, \n");
+ source.append(" __global unsigned int * matrix_dimensions, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * g_bv_u, \n");
+ source.append(" __global unsigned int * start_bv_u_ind, \n");
+ source.append(" __global unsigned int * matrix_dimensions_u, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * g_bv_r, \n");
+ source.append(" __global unsigned int * start_bv_r_ind, \n");
+ source.append(" __global unsigned int * matrix_dimensions_r, \n");
+ source.append(" __global unsigned int * g_is_update, \n");
+ source.append(" //__local "); source.append(numeric_string); source.append(" * local_gb, \n");
+ source.append(" unsigned int block_elems_num) \n");
+ source.append(" { \n");
+ source.append(" for (unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
+ source.append(" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
+ source.append(" assemble_bv_block(g_bv_r + start_bv_r_ind[i], g_bv + start_bv_ind[i], matrix_dimensions[2*i + 1], g_bv_u + start_bv_u_ind[i], matrix_dimensions_u[2*i + 1]); \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append(" } \n");
+}
+
+template<typename StringT>
+void generate_spai_block_least_squares(StringT & source, std::string const & numeric_string)
+{
+ source.append("void custom_dot_prod_ls(__global "); source.append(numeric_string); source.append(" * A, unsigned int row_n, __global "); source.append(numeric_string); source.append(" * v, unsigned int ind, "); source.append(numeric_string); source.append(" *res){ \n");
+ source.append(" *res = 0.0; \n");
+ source.append(" for (unsigned int j = ind; j < row_n; ++j){ \n");
+ source.append(" if (j == ind){ \n");
+ source.append(" *res += v[ j]; \n");
+ source.append(" }else{ \n");
+ source.append(" *res += A[ j + ind*row_n]*v[ j]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+ source.append("void backwardSolve(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(" * y, __global "); source.append(numeric_string); source.append(" * x){ \n");
+ source.append(" for (int i = col_n-1; i >= 0; i--) { \n");
+ source.append(" x[ i] = y[ i]; \n");
+ source.append(" for (int j = i+1; j < col_n; ++j) { \n");
+ source.append(" x[ i] -= R[ i + j*row_n]*x[ j]; \n");
+ source.append(" } \n");
+ source.append(" x[i] /= R[ i + i*row_n]; \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+
+ source.append("void apply_q_trans_vec_ls(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global const "); source.append(numeric_string); source.append(" * b_v, __global "); source.append(numeric_string); source.append(" * y){ \n");
+ source.append(" "); source.append(numeric_string); source.append(" inn_prod = 0; \n");
+ source.append(" for (unsigned int i = 0; i < col_n; ++i){ \n");
+ source.append(" custom_dot_prod_ls(R, row_n, y, i, &inn_prod); \n");
+ source.append(" for (unsigned int j = i; j < row_n; ++j){ \n");
+ source.append(" if (i == j){ \n");
+ source.append(" y[ j] -= b_v[ i]*inn_prod; \n");
+ source.append(" } \n");
+ source.append(" else{ \n");
+ source.append(" y[j] -= b_v[ i]*inn_prod*R[ j +i*row_n]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ source.append("void ls(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(" * b_v, __global "); source.append(numeric_string); source.append(" * m_v, __global "); source.append(numeric_string); source.append(" * y_v){ \n");
+ source.append(" apply_q_trans_vec_ls(R, row_n, col_n, b_v, y_v); \n");
+ source.append(" //m_new - is m_v now \n");
+ source.append(" backwardSolve(R, row_n, col_n, y_v, m_v); \n");
+ source.append("} \n");
+
+ source.append("__kernel void block_least_squares( \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * global_R, \n");
+ source.append(" __global unsigned int * block_ind, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * b_v, \n");
+ source.append(" __global unsigned int * start_bv_inds, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * m_v, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * y_v, \n");
+ source.append(" __global unsigned int * start_y_inds, \n");
+ source.append(" __global unsigned int * matrix_dimensions, \n");
+ source.append(" __global unsigned int * g_is_update, \n");
+ source.append(" unsigned int block_elems_num) \n");
+ source.append("{ \n");
+ source.append(" for (unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
+ source.append(" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
+ source.append(" ls(global_R + block_ind[i], matrix_dimensions[2*i], matrix_dimensions[2*i + 1], b_v +start_bv_inds[i], m_v + start_bv_inds[i], y_v + start_y_inds[i] ); \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+template<typename StringT>
+void generate_spai_block_q_mult(StringT & source, std::string const & numeric_string)
+{
+ source.append("void custom_dot_prod(__global "); source.append(numeric_string); source.append(" * A, unsigned int row_n, __local "); source.append(numeric_string); source.append(" * v, unsigned int ind, "); source.append(numeric_string); source.append(" *res){ \n");
+ source.append(" *res = 0.0; \n");
+ source.append(" for (unsigned int j = ind; j < row_n; ++j){ \n");
+ source.append(" if (j == ind){ \n");
+ source.append(" *res += v[j]; \n");
+ source.append(" }else{ \n");
+ source.append(" *res += A[j + ind*row_n]*v[j]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+ source.append("void apply_q_trans_vec(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(" * b_v, __local "); source.append(numeric_string); source.append(" * y){ \n");
+ source.append(" "); source.append(numeric_string); source.append(" inn_prod = 0; \n");
+ source.append(" for (unsigned int i = 0; i < col_n; ++i){ \n");
+ source.append(" custom_dot_prod(R, row_n, y, i, &inn_prod); \n");
+ source.append(" for (unsigned int j = i; j < row_n; ++j){ \n");
+ source.append(" if (i == j){ \n");
+ source.append(" y[j] -= b_v[ i]*inn_prod; \n");
+ source.append(" } \n");
+ source.append(" else{ \n");
+ source.append(" y[j] -= b_v[ i]*inn_prod*R[ j + i*row_n]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+ source.append("void q_mult(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(" * b_v, __local "); source.append(numeric_string); source.append(" * R_u, unsigned int col_n_u){ \n");
+ source.append(" for (unsigned int i = get_local_id(0); i < col_n_u; i+= get_local_size(0)){ \n");
+ source.append(" apply_q_trans_vec(R, row_n, col_n, b_v, R_u + row_n*i); \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+ source.append("void matrix_from_global_to_local(__global "); source.append(numeric_string); source.append("* g_M, __local "); source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
+ source.append(" for (unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
+ source.append(" for (unsigned int j = 0; j < row_n; ++j){ \n");
+ source.append(" l_M[i*row_n + j] = g_M[mat_start_ind + i*row_n + j]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+ source.append("void matrix_from_local_to_global(__global "); source.append(numeric_string); source.append("* g_M, __local "); source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
+ source.append(" for (unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
+ source.append(" for (unsigned int j = 0; j < row_n; ++j){ \n");
+ source.append(" g_M[mat_start_ind + i*row_n + j] = l_M[i*row_n + j]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+ source.append("__kernel void block_q_mult(__global "); source.append(numeric_string); source.append(" * global_R, \n");
+ source.append(" __global unsigned int * block_ind, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * global_R_u, \n");
+ source.append(" __global unsigned int *block_ind_u, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * b_v, \n");
+ source.append(" __global unsigned int * start_bv_inds, \n");
+ source.append(" __global unsigned int * matrix_dimensions, \n");
+ source.append(" __global unsigned int * matrix_dimensions_u, \n");
+ source.append(" __global unsigned int * g_is_update, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * local_R_u, \n");
+ source.append(" unsigned int block_elems_num){ \n");
+ source.append(" for (unsigned int i = get_group_id(0); i < block_elems_num; i += get_num_groups(0)){ \n");
+ source.append(" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && (g_is_update[i] > 0)){ \n");
+ //matrix_from_global_to_local(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]); \n");
+ source.append(" matrix_from_global_to_local(global_R_u, local_R_u, matrix_dimensions_u[2*i], matrix_dimensions_u[2*i+ 1], block_ind_u[i]); \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" q_mult(global_R + block_ind[i], matrix_dimensions[2*i], matrix_dimensions[2*i + 1], b_v + start_bv_inds[i], local_R_u, \n");
+ source.append(" matrix_dimensions_u[2*i + 1]); \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" matrix_from_local_to_global(global_R_u, local_R_u, matrix_dimensions_u[2*i], matrix_dimensions_u[2*i + 1], block_ind_u[i]); \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+template<typename StringT>
+void generate_spai_block_qr(StringT & source, std::string const & numeric_string)
+{
+ source.append("void dot_prod(__local const "); source.append(numeric_string); source.append("* A, unsigned int n, unsigned int beg_ind, "); source.append(numeric_string); source.append("* res){ \n");
+ source.append(" *res = 0; \n");
+ source.append(" for (unsigned int i = beg_ind; i < n; ++i){ \n");
+ source.append(" *res += A[(beg_ind-1)*n + i]*A[(beg_ind-1)*n + i]; \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+ source.append("void vector_div(__global "); source.append(numeric_string); source.append("* v, unsigned int beg_ind, "); source.append(numeric_string); source.append(" b, unsigned int n){ \n");
+ source.append(" for (unsigned int i = beg_ind; i < n; ++i){ \n");
+ source.append(" v[i] /= b; \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+ source.append("void copy_vector(__local const "); source.append(numeric_string); source.append("* A, __global "); source.append(numeric_string); source.append("* v, const unsigned int beg_ind, const unsigned int n){ \n");
+ source.append(" for (unsigned int i = beg_ind; i < n; ++i){ \n");
+ source.append(" v[i] = A[(beg_ind-1)*n + i]; \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+
+ source.append("void householder_vector(__local const "); source.append(numeric_string); source.append("* A, unsigned int j, unsigned int n, __global "); source.append(numeric_string); source.append("* v, __global "); source.append(numeric_string); source.append("* b){ \n");
+ source.append(" "); source.append(numeric_string); source.append(" sg; \n");
+ source.append(" dot_prod(A, n, j+1, &sg); \n");
+ source.append(" copy_vector(A, v, j+1, n); \n");
+ source.append(" "); source.append(numeric_string); source.append(" mu; \n");
+ source.append(" v[j] = 1.0; \n");
+ //print_contigious_vector(v, v_start_ind, n);
+ source.append(" if (sg == 0){ \n");
+ source.append(" *b = 0; \n");
+ source.append(" } \n");
+ source.append(" else{ \n");
+ source.append(" mu = sqrt(A[j*n + j]*A[ j*n + j] + sg); \n");
+ source.append(" if (A[ j*n + j] <= 0){ \n");
+ source.append(" v[j] = A[ j*n + j] - mu; \n");
+ source.append(" }else{ \n");
+ source.append(" v[j] = -sg/(A[ j*n + j] + mu); \n");
+ source.append(" } \n");
+ source.append(" *b = 2*(v[j]*v[j])/(sg + v[j]*v[j]); \n");
+ //*b = (2*v[j]*v[j])/(sg + (v[j])*(v[j]));
+ source.append(" vector_div(v, j, v[j], n); \n");
+ //print_contigious_vector(v, v_start_ind, n);
+ source.append(" } \n");
+ source.append("} \n");
+
+ source.append("void custom_inner_prod(__local const "); source.append(numeric_string); source.append("* A, __global "); source.append(numeric_string); source.append("* v, unsigned int col_ind, unsigned int row_num, unsigned int start_ind, "); source.append(numeric_string); source.append("* res){ \n");
+ source.append(" for (unsigned int i = start_ind; i < row_num; ++i){ \n");
+ source.append(" *res += A[col_ind*row_num + i]*v[i]; \n");
+ source.append(" } \n");
+ source.append("} \n");
+ //
+ source.append("void apply_householder_reflection(__local "); source.append(numeric_string); source.append("* A, unsigned int row_n, unsigned int col_n, unsigned int iter_cnt, __global "); source.append(numeric_string); source.append("* v, "); source.append(numeric_string); source.append(" b){ \n");
+ source.append(" "); source.append(numeric_string); source.append(" in_prod_res; \n");
+ source.append(" for (unsigned int i= iter_cnt + get_local_id(0); i < col_n; i+=get_local_size(0)){ \n");
+ source.append(" in_prod_res = 0.0; \n");
+ source.append(" custom_inner_prod(A, v, i, row_n, iter_cnt, &in_prod_res); \n");
+ source.append(" for (unsigned int j = iter_cnt; j < row_n; ++j){ \n");
+ source.append(" A[ i*row_n + j] -= b*in_prod_res* v[j]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+ source.append("void store_householder_vector(__local "); source.append(numeric_string); source.append("* A, unsigned int ind, unsigned int n, __global "); source.append(numeric_string); source.append("* v){ \n");
+ source.append(" for (unsigned int i = ind; i < n; ++i){ \n");
+ source.append(" A[ (ind-1)*n + i] = v[i]; \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+ source.append("void single_qr( __local "); source.append(numeric_string); source.append("* R, __global unsigned int* matrix_dimensions, __global "); source.append(numeric_string); source.append("* b_v, __global "); source.append(numeric_string); source.append("* v, unsigned int matrix_ind){ \n");
+ //matrix_dimensions[0] - number of rows
+ //matrix_dimensions[1] - number of columns
+ source.append(" unsigned int col_n = matrix_dimensions[2*matrix_ind + 1]; \n");
+ source.append(" unsigned int row_n = matrix_dimensions[2*matrix_ind]; \n");
+
+ source.append(" if ((col_n == row_n)&&(row_n == 1)){ \n");
+ source.append(" b_v[0] = 0.0; \n");
+ source.append(" return; \n");
+ source.append(" } \n");
+ source.append(" for (unsigned int i = 0; i < col_n; ++i){ \n");
+ source.append(" if (get_local_id(0) == 0){ \n");
+ source.append(" householder_vector(R, i, row_n, v, b_v + i); \n");
+ source.append(" } \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" apply_householder_reflection(R, row_n, col_n, i, v, b_v[i]); \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" if (get_local_id(0) == 0){ \n");
+ source.append(" if (i < matrix_dimensions[2*matrix_ind]){ \n");
+ source.append(" store_householder_vector(R, i+1, row_n, v); \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+ source.append("void matrix_from_global_to_local_qr(__global "); source.append(numeric_string); source.append("* g_M, __local "); source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
+ source.append(" for (unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
+ source.append(" for (unsigned int j = 0; j < row_n; ++j){ \n");
+ source.append(" l_M[i*row_n + j] = g_M[mat_start_ind + i*row_n + j]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append("} \n");
+ source.append("void matrix_from_local_to_global_qr(__global "); source.append(numeric_string); source.append("* g_M, __local "); source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
+ source.append(" for (unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
+ source.append(" for (unsigned int j = 0; j < row_n; ++j){ \n");
+ source.append(" g_M[mat_start_ind + i*row_n + j] = l_M[i*row_n + j]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+
+ source.append("__kernel void block_qr( \n");
+ source.append(" __global "); source.append(numeric_string); source.append("* R, \n");
+ source.append(" __global unsigned int* matrix_dimensions, \n");
+ source.append(" __global "); source.append(numeric_string); source.append("* b_v, \n");
+ source.append(" __global "); source.append(numeric_string); source.append("* v, \n");
+ source.append(" __global unsigned int* start_matrix_inds, \n");
+ source.append(" __global unsigned int* start_bv_inds, \n");
+ source.append(" __global unsigned int* start_v_inds, \n");
+ source.append(" __global unsigned int * g_is_update, \n");
+ source.append(" __local "); source.append(numeric_string); source.append("* local_buff_R, \n");
+ source.append(" unsigned int block_elems_num){ \n");
+ source.append(" for (unsigned int i = get_group_id(0); i < block_elems_num; i += get_num_groups(0)){ \n");
+ source.append(" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
+ source.append(" matrix_from_global_to_local_qr(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]); \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" single_qr(local_buff_R, matrix_dimensions, b_v + start_bv_inds[i], v + start_v_inds[i], i); \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+ source.append(" matrix_from_local_to_global_qr(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]); \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+template<typename StringT>
+void generate_spai_block_qr_assembly(StringT & source, std::string const & numeric_string)
+{
+ source.append("void assemble_upper_part(__global "); source.append(numeric_string); source.append(" * R_q, \n");
+ source.append(" unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(" * R_u, \n");
+ source.append(" unsigned int row_n_u, unsigned int col_n_u, \n");
+ source.append(" unsigned int col_n, unsigned int diff){ \n");
+ source.append(" for (unsigned int i = 0; i < col_n_q; ++i){ \n");
+ source.append(" for (unsigned int j = 0; j < diff; ++j){ \n");
+ source.append(" R_q[ i*row_n_q + j] = R_u[ i*row_n_u + j + col_n ]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ source.append("void assemble_lower_part(__global "); source.append(numeric_string); source.append(" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(" * R_u_u, \n");
+ source.append(" unsigned int row_n_u_u, unsigned int col_n_u_u, \n");
+ source.append(" unsigned int diff){ \n");
+ source.append(" for (unsigned int i = 0; i < col_n_u_u; ++i){ \n");
+ source.append(" for (unsigned int j = 0; j < row_n_u_u; ++j){ \n");
+ source.append(" R_q[i*row_n_q + j + diff] = R_u_u[i*row_n_u_u + j]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+ source.append("void assemble_qr_block(__global "); source.append(numeric_string); source.append(" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(" * R_u, unsigned int row_n_u, \n");
+ source.append(" unsigned int col_n_u, __global "); source.append(numeric_string); source.append(" * R_u_u, unsigned int row_n_u_u, unsigned int col_n_u_u, unsigned int col_n){ \n");
+ source.append(" unsigned int diff = row_n_u - col_n; \n");
+ source.append(" assemble_upper_part(R_q, row_n_q, col_n_q, R_u, row_n_u, col_n_u, col_n, diff); \n");
+ source.append(" if (diff > 0){ \n");
+ source.append(" assemble_lower_part(R_q, row_n_q, col_n_q, R_u_u, row_n_u_u, col_n_u_u, diff); \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+ source.append("__kernel void block_qr_assembly( \n");
+ source.append(" __global unsigned int * matrix_dimensions, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * R_u, \n");
+ source.append(" __global unsigned int * block_ind_u, \n");
+ source.append(" __global unsigned int * matrix_dimensions_u, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * R_u_u, \n");
+ source.append(" __global unsigned int * block_ind_u_u, \n");
+ source.append(" __global unsigned int * matrix_dimensions_u_u, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * R_q, \n");
+ source.append(" __global unsigned int * block_ind_q, \n");
+ source.append(" __global unsigned int * matrix_dimensions_q, \n");
+ source.append(" __global unsigned int * g_is_update, \n");
+ source.append(" //__local "); source.append(numeric_string); source.append(" * local_R_q, \n");
+ source.append(" unsigned int block_elems_num) \n");
+ source.append("{ \n");
+ source.append(" for (unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
+ source.append(" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
+ source.append(" assemble_qr_block(R_q + block_ind_q[i], matrix_dimensions_q[2*i], matrix_dimensions_q[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], \n");
+ source.append(" matrix_dimensions_u[2*i + 1], R_u_u + block_ind_u_u[i], matrix_dimensions_u_u[2*i], matrix_dimensions_u_u[2*i + 1], matrix_dimensions[2*i + 1]); \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+template<typename StringT>
+void generate_spai_block_qr_assembly_1(StringT & source, std::string const & numeric_string)
+{
+ source.append("void assemble_upper_part_1(__global "); source.append(numeric_string); source.append(" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(" * R_u, \n");
+ source.append(" unsigned int row_n_u, unsigned int col_n_u, \n");
+ source.append(" unsigned int col_n, unsigned int diff){ \n");
+ source.append(" for (unsigned int i = 0; i < col_n_q; ++i){ \n");
+ source.append(" for (unsigned int j = 0; j < diff; ++j){ \n");
+ source.append(" R_q[ i*row_n_q + j] = R_u[i*row_n_u + j + col_n ]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+
+ source.append("void assemble_qr_block_1(__global "); source.append(numeric_string); source.append(" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(" * R_u, unsigned int row_n_u, \n");
+ source.append(" unsigned int col_n_u, unsigned int col_n){ \n");
+ source.append(" unsigned int diff = row_n_u - col_n; \n");
+ source.append(" assemble_upper_part_1(R_q, row_n_q, col_n_q, R_u, row_n_u, col_n_u, col_n, diff); \n");
+ source.append("} \n");
+
+ source.append("__kernel void block_qr_assembly_1( \n");
+ source.append(" __global unsigned int * matrix_dimensions, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * R_u, \n");
+ source.append(" __global unsigned int * block_ind_u, \n");
+ source.append(" __global unsigned int * matrix_dimensions_u, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * R_q, \n");
+ source.append(" __global unsigned int * block_ind_q, \n");
+ source.append(" __global unsigned int * matrix_dimensions_q, \n");
+ source.append(" __global unsigned int * g_is_update, \n");
+ source.append(" unsigned int block_elems_num) \n");
+ source.append("{ \n");
+ source.append(" for (unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
+ source.append(" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
+ source.append(" assemble_qr_block_1(R_q + block_ind_q[i], matrix_dimensions_q[2*i], matrix_dimensions_q[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], \n");
+ source.append(" matrix_dimensions_u[2*i + 1], matrix_dimensions[2*i + 1]); \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+template<typename StringT>
+void generate_spai_block_r_assembly(StringT & source, std::string const & numeric_string)
+{
+ source.append("void assemble_r(__global "); source.append(numeric_string); source.append(" * gR, unsigned int row_n_r, unsigned int col_n_r, __global "); source.append(numeric_string); source.append(" * R, \n");
+ source.append(" unsigned int row_n, unsigned int col_n) \n");
+ source.append("{ \n");
+ source.append(" for (unsigned int i = 0; i < col_n; ++i){ \n");
+ source.append(" for (unsigned int j = 0; j < row_n; ++j){ \n");
+ source.append(" gR[i*row_n_r + j] = R[i*row_n + j ]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+ source.append("void assemble_r_u(__global "); source.append(numeric_string); source.append(" * gR, \n");
+ source.append(" unsigned int row_n_r, unsigned int col_n_r, __global "); source.append(numeric_string); source.append(" * R_u, unsigned int row_n_u, unsigned int col_n_u, \n");
+ source.append(" unsigned int col_n) \n");
+ source.append("{ \n");
+ source.append(" for (unsigned int i = 0; i < col_n_u; ++i){ \n");
+ source.append(" for (unsigned int j = 0; j < col_n; ++j){ \n");
+ source.append(" gR[ (i+col_n)*row_n_r + j] = R_u[ i*row_n_u + j]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+
+ source.append("void assemble_r_u_u(__global "); source.append(numeric_string); source.append(" * gR, unsigned int row_n_r, unsigned int col_n_r, __global "); source.append(numeric_string); source.append(" * R_u_u, unsigned int row_n_u_u, \n");
+ source.append(" unsigned int col_n_u_u, unsigned int col_n) \n");
+ source.append("{ \n");
+ source.append(" for (unsigned int i = 0; i < col_n_u_u; ++i){ \n");
+ source.append(" for (unsigned int j = 0; j < row_n_u_u; ++j){ \n");
+ source.append(" gR[(col_n+i)*row_n_r + j + col_n] = R_u_u[i*row_n_u_u + j]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append("} \n");
+
+ source.append("void assemble_r_block(__global "); source.append(numeric_string); source.append(" * gR, unsigned int row_n_r, unsigned int col_n_r, __global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, \n");
+ source.append(" unsigned int col_n, __global "); source.append(numeric_string); source.append(" * R_u, unsigned int row_n_u, unsigned int col_n_u, __global "); source.append(numeric_string); source.append(" * R_u_u, \n");
+ source.append(" unsigned int row_n_u_u, unsigned int col_n_u_u){ \n");
+ source.append(" assemble_r(gR, row_n_r, col_n_r, R, row_n, col_n); \n");
+ source.append(" assemble_r_u(gR, row_n_r, col_n_r, R_u, row_n_u, col_n_u, col_n); \n");
+ source.append(" assemble_r_u_u(gR, row_n_r, col_n_r, R_u_u, row_n_u_u, col_n_u_u, col_n); \n");
+ source.append("} \n");
+
+
+ source.append("__kernel void block_r_assembly( \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * R, \n");
+ source.append(" __global unsigned int * block_ind, \n");
+ source.append(" __global unsigned int * matrix_dimensions, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * R_u, \n");
+ source.append(" __global unsigned int * block_ind_u, \n");
+ source.append(" __global unsigned int * matrix_dimensions_u, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * R_u_u, \n");
+ source.append(" __global unsigned int * block_ind_u_u, \n");
+ source.append(" __global unsigned int * matrix_dimensions_u_u, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" * g_R, \n");
+ source.append(" __global unsigned int * block_ind_r, \n");
+ source.append(" __global unsigned int * matrix_dimensions_r, \n");
+ source.append(" __global unsigned int * g_is_update, \n");
+ source.append(" unsigned int block_elems_num) \n");
+ source.append("{ \n");
+ source.append(" for (unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
+ source.append(" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
+
+ source.append(" assemble_r_block(g_R + block_ind_r[i], matrix_dimensions_r[2*i], matrix_dimensions_r[2*i + 1], R + block_ind[i], matrix_dimensions[2*i], \n");
+ source.append(" matrix_dimensions[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], matrix_dimensions_u[2*i + 1], \n");
+ source.append(" R_u_u + block_ind_u_u[i], matrix_dimensions_u_u[2*i], matrix_dimensions_u_u[2*i + 1]); \n");
+
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append("} \n");
+}
+
+//////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+// main kernel class
+/** @brief Main kernel class for generating OpenCL kernels for sparse approximate inverse (SPAI) preconditioners. */
+template<typename NumericT>
+struct spai
+{
+ static std::string program_name()
+ {
+ return viennacl::ocl::type_to_string<NumericT>::apply() + "_spai";
+ }
+
+ static void init(viennacl::ocl::context & ctx)
+ {
+ static std::map<cl_context, bool> init_done;
+ if (!init_done[ctx.handle().get()])
+ {
+ viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+ std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+ std::string source;
+ source.reserve(1024);
+
+ viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+ generate_spai_assemble_blocks(source, numeric_string);
+ generate_spai_block_bv_assembly(source, numeric_string);
+ generate_spai_block_least_squares(source, numeric_string);
+ generate_spai_block_q_mult(source, numeric_string);
+ generate_spai_block_qr(source, numeric_string);
+ generate_spai_block_qr_assembly(source, numeric_string);
+ generate_spai_block_qr_assembly_1(source, numeric_string);
+ generate_spai_block_r_assembly(source, numeric_string);
+
+ std::string prog_name = program_name();
+ #ifdef VIENNACL_BUILD_INFO
+ std::cout << "Creating program " << prog_name << std::endl;
+ #endif
+ ctx.add_program(source, prog_name);
+ init_done[ctx.handle().get()] = true;
+ } //if
+ } //init
+};
+
+} // namespace kernels
+} // namespace opencl
+} // namespace linalg
+} // namespace viennacl
+#endif
+
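Per block, the QR and least-squares kernels above solve the dense problem min_m || A(I,J) m - e_k ||_2 arising in SPAI: single_qr factors the block with Householder reflections, and block_least_squares applies Q^T to the right-hand side and back-substitutes against R. The following sequential sketch solves the same per-block problem with textbook Householder QR rather than the kernels' in-place storage scheme; all names are illustrative and not part of the ViennaCL sources:

#include <cmath>
#include <cstddef>
#include <vector>

// Dense least-squares solve min ||A m - b||_2 for a tall column-major block
// (row_n x col_n), sketching what single_qr + block_least_squares compute.
std::vector<double> solve_ls(std::vector<double> A, std::vector<double> b,
                             std::size_t row_n, std::size_t col_n)
{
  for (std::size_t j = 0; j < col_n; ++j)
  {
    // Householder vector eliminating A(j+1..row_n-1, j)
    double norm = 0;
    for (std::size_t i = j; i < row_n; ++i) norm += A[j*row_n + i] * A[j*row_n + i];
    norm = std::sqrt(norm);
    double alpha = (A[j*row_n + j] > 0) ? -norm : norm;
    std::vector<double> v(row_n, 0.0);
    for (std::size_t i = j; i < row_n; ++i) v[i] = A[j*row_n + i];
    v[j] -= alpha;
    double vtv = 0;
    for (std::size_t i = j; i < row_n; ++i) vtv += v[i] * v[i];
    if (vtv == 0) continue;

    // Apply H = I - 2 v v^T / (v^T v) to the remaining columns and to b.
    for (std::size_t c = j; c < col_n; ++c)
    {
      double dot = 0;
      for (std::size_t i = j; i < row_n; ++i) dot += v[i] * A[c*row_n + i];
      for (std::size_t i = j; i < row_n; ++i) A[c*row_n + i] -= 2 * dot / vtv * v[i];
    }
    double dot_b = 0;
    for (std::size_t i = j; i < row_n; ++i) dot_b += v[i] * b[i];
    for (std::size_t i = j; i < row_n; ++i) b[i] -= 2 * dot_b / vtv * v[i];
  }

  // Back substitution against the upper-triangular R (now stored in A),
  // using the first col_n entries of Q^T b (now stored in b).
  std::vector<double> m(col_n, 0.0);
  for (std::size_t j = col_n; j-- > 0; )
  {
    double s = b[j];
    for (std::size_t k = j + 1; k < col_n; ++k) s -= A[k*row_n + j] * m[k];
    m[j] = s / A[j*row_n + j];
  }
  return m;
}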
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/iterative_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/iterative_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/iterative_operations.hpp
new file mode 100644
index 0000000..4330431
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/iterative_operations.hpp
@@ -0,0 +1,2049 @@
+#ifndef VIENNACL_LINALG_CUDA_ITERATIVE_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_ITERATIVE_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/iterative_operations.hpp
+ @brief Implementations of sparse-matrix operations for pipelined iterative solvers using CUDA
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/cuda/common.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
+//
+// CG vector update:
+//
+
+// cpu scalar
+template<typename NumericT>
+__global__ void pipelined_cg_vector_kernel(NumericT * result,
+ NumericT alpha,
+ NumericT * p,
+ NumericT * r,
+ NumericT const * Ap,
+ NumericT beta,
+ NumericT * inner_prod_buffer,
+ unsigned int size)
+{
+ NumericT inner_prod_contrib = 0;
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+ {
+ NumericT value_p = p[i];
+ NumericT value_r = r[i];
+
+ result[i] += alpha * value_p;
+ value_r -= alpha * Ap[i];
+ value_p = value_r + beta * value_p;
+
+ p[i] = value_p;
+ r[i] = value_r;
+ inner_prod_contrib += value_r * value_r;
+ }
+
+ // parallel reduction in work group
+ __shared__ NumericT shared_array[256];
+ shared_array[threadIdx.x] = inner_prod_contrib;
+ for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
+ }
+
+ // write results to result array
+ if (threadIdx.x == 0)
+ inner_prod_buffer[blockIdx.x] = shared_array[0];
+}
+
+
+template<typename NumericT>
+void pipelined_cg_vector_update(vector_base<NumericT> & result,
+ NumericT alpha,
+ vector_base<NumericT> & p,
+ vector_base<NumericT> & r,
+ vector_base<NumericT> const & Ap,
+ NumericT beta,
+ vector_base<NumericT> & inner_prod_buffer)
+{
+ unsigned int size = result.size();
+ pipelined_cg_vector_kernel<<<128, 128>>>(viennacl::cuda_arg(result),
+ alpha,
+ viennacl::cuda_arg(p),
+ viennacl::cuda_arg(r),
+ viennacl::cuda_arg(Ap),
+ beta,
+ viennacl::cuda_arg(inner_prod_buffer),
+ size);
+ VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_vector_kernel");
+}
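The kernel above fuses the standard CG recurrences x += alpha*p, r -= alpha*A*p, p = r + beta*p with the recomputation of the inner product <r,r>, so no extra pass over the vectors is needed. A sequential C++ equivalent (illustrative names; it returns the inner product instead of writing per-block partial sums to a buffer) is:

#include <cstddef>
#include <vector>

// Sequential reference for the fused CG vector update performed by
// pipelined_cg_vector_kernel.
double cg_vector_update_reference(std::vector<double> & result,
                                  double alpha,
                                  std::vector<double> & p,
                                  std::vector<double> & r,
                                  std::vector<double> const & Ap,
                                  double beta)
{
  double inner_prod_rr = 0;
  for (std::size_t i = 0; i < result.size(); ++i)
  {
    result[i] += alpha * p[i];
    r[i]      -= alpha * Ap[i];
    p[i]       = r[i] + beta * p[i];
    inner_prod_rr += r[i] * r[i];
  }
  return inner_prod_rr;
}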
+
+
+
+
+//
+// Compressed matrix
+//
+
+
+template<unsigned int SubWarpSizeV, typename NumericT>
+__global__ void pipelined_cg_csr_vec_mul_blocked_kernel(
+ const unsigned int * row_indices,
+ const unsigned int * column_indices,
+ const NumericT * elements,
+ const NumericT * p,
+ NumericT * Ap,
+ unsigned int size,
+ NumericT * inner_prod_buffer,
+ unsigned int buffer_size)
+{
+ __shared__ NumericT shared_elements[256];
+ NumericT inner_prod_ApAp = 0;
+ NumericT inner_prod_pAp = 0;
+
+ const unsigned int id_in_row = threadIdx.x % SubWarpSizeV;
+ const unsigned int block_increment = blockDim.x * ((size - 1) / (gridDim.x * blockDim.x) + 1);
+ const unsigned int block_start = blockIdx.x * block_increment;
+ const unsigned int block_stop = min(block_start + block_increment, size);
+
+ for (unsigned int row = block_start + threadIdx.x / SubWarpSizeV;
+ row < block_stop;
+ row += blockDim.x / SubWarpSizeV)
+ {
+ NumericT dot_prod = NumericT(0);
+ unsigned int row_end = row_indices[row+1];
+ for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += SubWarpSizeV)
+ dot_prod += elements[i] * p[column_indices[i]];
+
+ shared_elements[threadIdx.x] = dot_prod;
+ if (1 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 1];
+ if (2 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 2];
+ if (4 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 4];
+ if (8 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 8];
+ if (16 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 16];
+
+ if (id_in_row == 0)
+ {
+ Ap[row] = shared_elements[threadIdx.x];
+ inner_prod_ApAp += shared_elements[threadIdx.x] * shared_elements[threadIdx.x];
+ inner_prod_pAp += p[row] * shared_elements[threadIdx.x];
+ }
+ }
+
+ ////////// parallel reduction in work group
+ __shared__ NumericT shared_array_ApAp[256];
+ __shared__ NumericT shared_array_pAp[256];
+ shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
+ shared_array_pAp[threadIdx.x] = inner_prod_pAp;
+ for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ {
+ shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
+ shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
+ }
+ }
+
+ // write results to result array
+ if (threadIdx.x == 0) {
+ inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
+ inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
+ }
+
+}
+
+template<typename NumericT>
+__global__ void pipelined_cg_csr_vec_mul_adaptive_kernel(
+ const unsigned int * row_indices,
+ const unsigned int * column_indices,
+ const unsigned int * row_blocks,
+ const NumericT * elements,
+ unsigned int num_blocks,
+ const NumericT * p,
+ NumericT * Ap,
+ unsigned int size,
+ NumericT * inner_prod_buffer,
+ unsigned int buffer_size)
+{
+ NumericT inner_prod_ApAp = 0;
+ NumericT inner_prod_pAp = 0;
+
+ __shared__ NumericT shared_elements[1024];
+
+ for (unsigned int block_id = blockIdx.x; block_id < num_blocks; block_id += gridDim.x)
+ {
+ unsigned int row_start = row_blocks[block_id];
+ unsigned int row_stop = row_blocks[block_id + 1];
+ unsigned int element_start = row_indices[row_start];
+ unsigned int element_stop = row_indices[row_stop];
+ unsigned int rows_to_process = row_stop - row_start;
+
+ if (rows_to_process > 1) // CSR stream with one thread per row
+ {
+ // load to shared buffer:
+ for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
+ shared_elements[i - element_start] = elements[i] * p[column_indices[i]];
+
+ __syncthreads();
+
+ // use one thread per row to sum:
+ for (unsigned int row = row_start + threadIdx.x; row < row_stop; row += blockDim.x)
+ {
+ NumericT dot_prod = 0;
+ unsigned int thread_row_start = row_indices[row] - element_start;
+ unsigned int thread_row_stop = row_indices[row + 1] - element_start;
+ for (unsigned int i = thread_row_start; i < thread_row_stop; ++i)
+ dot_prod += shared_elements[i];
+ Ap[row] = dot_prod;
+ inner_prod_ApAp += dot_prod * dot_prod;
+ inner_prod_pAp += p[row] * dot_prod;
+ }
+ }
+ // TODO here: Consider CSR vector for two to four rows (cf. OpenCL implementation. Experience on Fermi suggests that this may not be necessary)
+ else // CSR vector for a single row
+ {
+ // load and sum to shared buffer:
+ shared_elements[threadIdx.x] = 0;
+ for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
+ shared_elements[threadIdx.x] += elements[i] * p[column_indices[i]];
+
+ // reduction to obtain final result
+ for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ shared_elements[threadIdx.x] += shared_elements[threadIdx.x+stride];
+ }
+
+ if (threadIdx.x == 0)
+ {
+ Ap[row_start] = shared_elements[0];
+ inner_prod_ApAp += shared_elements[0] * shared_elements[0];
+ inner_prod_pAp += p[row_start] * shared_elements[0];
+ }
+ }
+
+ __syncthreads(); // avoid race conditions
+ }
+
+ ////////// parallel reduction in work group
+ __shared__ NumericT shared_array_ApAp[256];
+ __shared__ NumericT shared_array_pAp[256];
+ shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
+ shared_array_pAp[threadIdx.x] = inner_prod_pAp;
+ for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ {
+ shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
+ shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
+ }
+ }
+
+ // write results to result array
+ if (threadIdx.x == 0) {
+ inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
+ inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
+ }
+}
+
+
+
+
+template<typename NumericT>
+void pipelined_cg_prod(compressed_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> & inner_prod_buffer)
+{
+ unsigned int size = p.size();
+ unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 500
+ if (double(A.nnz()) / double(A.size1()) > 6.4) // less than 10% of threads expected to idle
+ {
+ pipelined_cg_csr_vec_mul_blocked_kernel<8, NumericT><<<256, 256>>>( // experience on a GTX 750 Ti suggests that 8 is a substantially better choice here
+#else
+ if (double(A.nnz()) / double(A.size1()) > 12.0) // less than 25% of threads expected to idle
+ {
+ pipelined_cg_csr_vec_mul_blocked_kernel<16, NumericT><<<256, 256>>>( // Fermi and Kepler prefer 16 threads per row (half-warp)
+#endif
+ viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ viennacl::cuda_arg<NumericT>(A.handle()),
+ viennacl::cuda_arg(p),
+ viennacl::cuda_arg(Ap),
+ size,
+ viennacl::cuda_arg(inner_prod_buffer),
+ buffer_size_per_vector
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_blocked_kernel");
+ }
+ else
+ {
+ pipelined_cg_csr_vec_mul_adaptive_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ viennacl::cuda_arg<unsigned int>(A.handle3()),
+ viennacl::cuda_arg<NumericT>(A.handle()),
+ static_cast<unsigned int>(A.blocks1()),
+ viennacl::cuda_arg(p),
+ viennacl::cuda_arg(Ap),
+ size,
+ viennacl::cuda_arg(inner_prod_buffer),
+ buffer_size_per_vector);
+    VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_adaptive_kernel");
+ }
+}
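Both CSR kernels above compute the same three quantities, Ap = A*p together with the partial sums of <Ap,Ap> and <p,Ap> needed by the pipelined CG update; they differ only in how rows are mapped to threads (fixed sub-warps per row versus the adaptive row-blocks scheme). A sequential CSR reference, with illustrative names and scalar inner products in place of the per-block partial sums, is:

#include <cstddef>
#include <vector>

// Sequential CSR reference for the fused product in pipelined CG.
void csr_prod_reference(std::vector<unsigned int> const & row_indices,   // size rows+1
                        std::vector<unsigned int> const & column_indices,
                        std::vector<double> const &       elements,
                        std::vector<double> const &       p,
                        std::vector<double> &             Ap,
                        double & inner_prod_ApAp,
                        double & inner_prod_pAp)
{
  inner_prod_ApAp = 0;
  inner_prod_pAp  = 0;
  for (std::size_t row = 0; row + 1 < row_indices.size(); ++row)
  {
    double dot = 0;
    for (unsigned int i = row_indices[row]; i < row_indices[row + 1]; ++i)
      dot += elements[i] * p[column_indices[i]];
    Ap[row] = dot;
    inner_prod_ApAp += dot * dot;
    inner_prod_pAp  += p[row] * dot;
  }
}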
+
+
+//
+// Coordinate Matrix
+//
+
+
+template<typename NumericT>
+__global__ void pipelined_cg_coo_vec_mul_kernel(const unsigned int * coords, //(row_index, column_index)
+ const NumericT * elements,
+ const unsigned int * group_boundaries,
+ const NumericT * p,
+ NumericT * Ap,
+ unsigned int size,
+ NumericT * inner_prod_buffer,
+ unsigned int buffer_size)
+{
+ NumericT inner_prod_ApAp = 0;
+ NumericT inner_prod_pAp = 0;
+ __shared__ unsigned int shared_rows[128];
+ __shared__ NumericT inter_results[128];
+
+ uint2 tmp;
+ NumericT val;
+ unsigned int group_start = group_boundaries[blockIdx.x];
+ unsigned int group_end = group_boundaries[blockIdx.x + 1];
+ unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0; // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
+
+ unsigned int local_index = 0;
+
+ for (unsigned int k = 0; k < k_end; ++k)
+ {
+ local_index = group_start + k * blockDim.x + threadIdx.x;
+
+ tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
+ val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0;
+
+ //check for carry from previous loop run:
+ if (threadIdx.x == 0 && k > 0)
+ {
+ if (tmp.x == shared_rows[blockDim.x-1])
+ val += inter_results[blockDim.x-1];
+ else
+ {
+ NumericT Ap_entry = inter_results[blockDim.x-1];
+ Ap[shared_rows[blockDim.x-1]] = Ap_entry;
+ inner_prod_ApAp += Ap_entry * Ap_entry;
+ inner_prod_pAp += Ap_entry * p[shared_rows[blockDim.x-1]];
+ }
+ }
+
+ //segmented parallel reduction begin
+ __syncthreads();
+ shared_rows[threadIdx.x] = tmp.x;
+ inter_results[threadIdx.x] = val;
+ NumericT left = 0;
+ __syncthreads();
+
+ for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
+ {
+ left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
+ __syncthreads();
+ inter_results[threadIdx.x] += left;
+ __syncthreads();
+ }
+ //segmented parallel reduction end
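+      // After the scan, each thread holds the sum of all contributions of its row up to and including
+      // its own entry, so the last thread of every run of equal row indices carries the complete row
+      // result; it is written out below as soon as the row index changes between neighboring threads.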
+
+ if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
+ shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
+ {
+ NumericT Ap_entry = inter_results[threadIdx.x];
+ Ap[tmp.x] = Ap_entry;
+ inner_prod_ApAp += Ap_entry * Ap_entry;
+ inner_prod_pAp += Ap_entry * p[tmp.x];
+ }
+
+ __syncthreads();
+ } //for k
+
+ if (local_index + 1 == group_end)
+ {
+ NumericT Ap_entry = inter_results[threadIdx.x];
+ Ap[tmp.x] = Ap_entry;
+ inner_prod_ApAp += Ap_entry * Ap_entry;
+ inner_prod_pAp += Ap_entry * p[tmp.x];
+ }
+
+ ////////// parallel reduction in work group
+ __shared__ NumericT shared_array_ApAp[256];
+ __shared__ NumericT shared_array_pAp[256];
+ shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
+ shared_array_pAp[threadIdx.x] = inner_prod_pAp;
+ for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ {
+ shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
+ shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
+ }
+ }
+
+ // write results to result array
+ if (threadIdx.x == 0) {
+ inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
+ inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
+ }
+
+}
+
+
+template<typename NumericT>
+void pipelined_cg_prod(coordinate_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> & inner_prod_buffer)
+{
+ unsigned int size = p.size();
+ unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
+
+ Ap.clear();
+
+ pipelined_cg_coo_vec_mul_kernel<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle12()),
+ viennacl::cuda_arg<NumericT>(A.handle()),
+ viennacl::cuda_arg<unsigned int>(A.handle3()),
+ viennacl::cuda_arg(p),
+ viennacl::cuda_arg(Ap),
+ size,
+ viennacl::cuda_arg(inner_prod_buffer),
+ buffer_size_per_vector);
+ VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_coo_vec_mul_kernel");
+}
+
+
+
+//
+// ELL Matrix
+//
+
+template<typename NumericT>
+__global__ void pipelined_cg_ell_vec_mul_kernel(const unsigned int * coords,
+ const NumericT * elements,
+ unsigned int internal_row_num,
+ unsigned int items_per_row,
+ const NumericT * p,
+ NumericT * Ap,
+ unsigned int size,
+ NumericT * inner_prod_buffer,
+ unsigned int buffer_size)
+{
+ NumericT inner_prod_ApAp = 0;
+ NumericT inner_prod_pAp = 0;
+ unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
+ unsigned int glb_sz = gridDim.x * blockDim.x;
+
+ for (unsigned int row = glb_id; row < size; row += glb_sz)
+ {
+ NumericT sum = 0;
+
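+    // ELL layout: entries are stored column-major with every row padded to items_per_row entries, so
+    // the item_id-th entry of this row sits at offset row + item_id * internal_row_num; padded entries
+    // are zero and are skipped by the 'val ?' check below.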
+ unsigned int offset = row;
+ for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
+ {
+ NumericT val = elements[offset];
+ sum += val ? p[coords[offset]] * val : NumericT(0);
+ }
+
+ Ap[row] = sum;
+ inner_prod_ApAp += sum * sum;
+ inner_prod_pAp += sum * p[row];
+ }
+
+ ////////// parallel reduction in work group
+ __shared__ NumericT shared_array_ApAp[256];
+ __shared__ NumericT shared_array_pAp[256];
+ shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
+ shared_array_pAp[threadIdx.x] = inner_prod_pAp;
+ for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ {
+ shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
+ shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
+ }
+ }
+
+ // write results to result array
+ if (threadIdx.x == 0) {
+ inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
+ inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
+ }
+}
+
+
+template<typename NumericT>
+void pipelined_cg_prod(ell_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> & inner_prod_buffer)
+{
+ unsigned int size = p.size();
+ unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
+
+ pipelined_cg_ell_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()),
+ viennacl::cuda_arg<NumericT>(A.handle()),
+ static_cast<unsigned int>(A.internal_size1()),
+ static_cast<unsigned int>(A.maxnnz()),
+ viennacl::cuda_arg(p),
+ viennacl::cuda_arg(Ap),
+ size,
+ viennacl::cuda_arg(inner_prod_buffer),
+ buffer_size_per_vector);
+ VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_ell_vec_mul_kernel");
+}
+
+
+//
+// SELL-C-\sigma Matrix
+//
+
+template<typename NumericT>
+__global__ void pipelined_cg_sliced_ell_vec_mul_kernel(const unsigned int * columns_per_block,
+ const unsigned int * column_indices,
+ const unsigned int * block_start,
+ const NumericT * elements,
+ const NumericT * p,
+ NumericT * Ap,
+ unsigned int size,
+ unsigned int block_size,
+ NumericT * inner_prod_buffer,
+ unsigned int buffer_size)
+{
+ NumericT inner_prod_ApAp = 0;
+ NumericT inner_prod_pAp = 0;
+
+ unsigned int blocks_per_threadblock = blockDim.x / block_size;
+ unsigned int id_in_block = threadIdx.x % block_size;
+ unsigned int num_blocks = (size - 1) / block_size + 1;
+ unsigned int global_warp_count = blocks_per_threadblock * gridDim.x;
+ unsigned int global_warp_id = blocks_per_threadblock * blockIdx.x + threadIdx.x / block_size;
+
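+  // SELL-C-sigma layout: rows are grouped into slices of block_size rows; each slice is stored
+  // column-major starting at block_start[block_idx] and padded to columns_per_block[block_idx]
+  // entries per row, so the threads working on a slice access consecutive memory locations.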
+ for (unsigned int block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count)
+ {
+ unsigned int row = block_idx * block_size + id_in_block;
+ unsigned int offset = block_start[block_idx];
+ unsigned int num_columns = columns_per_block[block_idx];
+
+ NumericT sum = 0;
+ for (unsigned int item_id = 0; item_id < num_columns; item_id++)
+ {
+ unsigned int index = offset + item_id * block_size + id_in_block;
+ NumericT val = elements[index];
+
+ sum += val ? (p[column_indices[index]] * val) : 0;
+ }
+
+ if (row < size)
+ {
+ Ap[row] = sum;
+ inner_prod_ApAp += sum * sum;
+ inner_prod_pAp += sum * p[row];
+ }
+ }
+
+ ////////// parallel reduction in work group
+ __shared__ NumericT shared_array_ApAp[256];
+ __shared__ NumericT shared_array_pAp[256];
+ shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
+ shared_array_pAp[threadIdx.x] = inner_prod_pAp;
+ for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ {
+ shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
+ shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
+ }
+ }
+
+ // write results to result array
+ if (threadIdx.x == 0) {
+ inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
+ inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
+ }
+}
+
+template<typename NumericT>
+void pipelined_cg_prod(sliced_ell_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> & inner_prod_buffer)
+{
+ unsigned int size = p.size();
+ unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
+
+ pipelined_cg_sliced_ell_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ viennacl::cuda_arg<unsigned int>(A.handle3()),
+ viennacl::cuda_arg<NumericT>(A.handle()),
+ viennacl::cuda_arg(p),
+ viennacl::cuda_arg(Ap),
+ size,
+ static_cast<unsigned int>(A.rows_per_block()),
+ viennacl::cuda_arg(inner_prod_buffer),
+ buffer_size_per_vector);
+ VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_sliced_ell_vec_mul_kernel");
+}
+
+
+//
+// Hybrid Matrix
+//
+
+
+template<typename NumericT>
+__global__ void pipelined_cg_hyb_vec_mul_kernel(const unsigned int * ell_coords,
+ const NumericT * ell_elements,
+ const unsigned int * csr_rows,
+ const unsigned int * csr_cols,
+ const NumericT * csr_elements,
+ unsigned int internal_row_num,
+ unsigned int items_per_row,
+ const NumericT * p,
+ NumericT * Ap,
+ unsigned int size,
+ NumericT * inner_prod_buffer,
+ unsigned int buffer_size)
+{
+ NumericT inner_prod_ApAp = 0;
+ NumericT inner_prod_pAp = 0;
+ unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
+ unsigned int glb_sz = gridDim.x * blockDim.x;
+
+ for (unsigned int row = glb_id; row < size; row += glb_sz)
+ {
+ NumericT sum = 0;
+
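+    // Hybrid layout: the regular part of each row (up to items_per_row entries) is stored in ELL
+    // format; any remaining entries of longer rows are kept in a CSR part (csr_rows, csr_cols,
+    // csr_elements) and are accumulated in the second loop below.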
+ unsigned int offset = row;
+ for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
+ {
+ NumericT val = ell_elements[offset];
+
+ sum += val ? p[ell_coords[offset]] * val : NumericT(0);
+ }
+
+ unsigned int col_begin = csr_rows[row];
+ unsigned int col_end = csr_rows[row + 1];
+
+ for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
+ {
+ sum += p[csr_cols[item_id]] * csr_elements[item_id];
+ }
+
+ Ap[row] = sum;
+ inner_prod_ApAp += sum * sum;
+ inner_prod_pAp += sum * p[row];
+ }
+
+ ////////// parallel reduction in work group
+ __shared__ NumericT shared_array_ApAp[256];
+ __shared__ NumericT shared_array_pAp[256];
+ shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
+ shared_array_pAp[threadIdx.x] = inner_prod_pAp;
+ for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ {
+ shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
+ shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
+ }
+ }
+
+ // write results to result array
+ if (threadIdx.x == 0) {
+ inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
+ inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
+ }
+}
+
+
+
+template<typename NumericT>
+void pipelined_cg_prod(hyb_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> & inner_prod_buffer)
+{
+ unsigned int size = p.size();
+ unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
+
+ pipelined_cg_hyb_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()),
+ viennacl::cuda_arg<NumericT>(A.handle()),
+ viennacl::cuda_arg<unsigned int>(A.handle3()),
+ viennacl::cuda_arg<unsigned int>(A.handle4()),
+ viennacl::cuda_arg<NumericT>(A.handle5()),
+ static_cast<unsigned int>(A.internal_size1()),
+ static_cast<unsigned int>(A.ell_nnz()),
+ viennacl::cuda_arg(p),
+ viennacl::cuda_arg(Ap),
+ size,
+ viennacl::cuda_arg(inner_prod_buffer),
+ buffer_size_per_vector);
+ VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_hyb_vec_mul_kernel");
+}
+
+
+
+/////////////////////////////////////
+
+template<typename NumericT>
+__global__ void pipelined_bicgstab_update_s_kernel(NumericT * s,
+ NumericT const * residual,
+ NumericT const * Ap,
+ unsigned int size,
+ NumericT * inner_prod_buffer,
+ unsigned int chunk_size,
+ unsigned int chunk_offset)
+{
+ NumericT alpha = 0;
+
+ // parallel reduction in work group to compute <r, r0> / <Ap, r0>
+ __shared__ NumericT shared_array[256];
+ __shared__ NumericT shared_array_Ap_in_r0[256];
+
+ shared_array[threadIdx.x] = inner_prod_buffer[threadIdx.x];
+ shared_array_Ap_in_r0[threadIdx.x] = inner_prod_buffer[threadIdx.x + 3 * chunk_size];
+ for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride) {
+ shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
+ shared_array_Ap_in_r0[threadIdx.x] += shared_array_Ap_in_r0[threadIdx.x + stride];
+ }
+ }
+
+ // compute alpha from reduced values:
+ __syncthreads();
+ alpha = shared_array[0] / shared_array_Ap_in_r0[0];
+
+ // run vector update and compute first stage of <s, s>
+ NumericT inner_prod_contrib = 0;
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+ {
+ NumericT value_s = s[i];
+
+ value_s = residual[i] - alpha * Ap[i];
+ inner_prod_contrib += value_s * value_s;
+
+ s[i] = value_s;
+ }
+ __syncthreads();
+
+ // parallel reduction in work group
+ shared_array[threadIdx.x] = inner_prod_contrib;
+ for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
+ }
+
+ // write results to inner_prod_buffer
+ if (threadIdx.x == 0)
+ inner_prod_buffer[blockIdx.x + chunk_offset] = shared_array[0];
+}
+
+template<typename NumericT>
+void pipelined_bicgstab_update_s(vector_base<NumericT> & s,
+ vector_base<NumericT> & r,
+ vector_base<NumericT> const & Ap,
+ vector_base<NumericT> & inner_prod_buffer,
+ vcl_size_t buffer_chunk_size,
+ vcl_size_t buffer_chunk_offset)
+{
+ unsigned int size = static_cast<unsigned int>(s.size());
+ unsigned int chunk_size = static_cast<unsigned int>(buffer_chunk_size);
+ unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
+
+ pipelined_bicgstab_update_s_kernel<<<256, 256>>>(viennacl::cuda_arg(s),
+ viennacl::cuda_arg(r),
+ viennacl::cuda_arg(Ap),
+ size,
+ viennacl::cuda_arg(inner_prod_buffer),
+ chunk_size,
+ chunk_offset);
+ VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_update_s_kernel");
+}
+
+template<typename NumericT>
+__global__ void pipelined_bicgstab_vector_kernel(NumericT * result,
+ NumericT alpha,
+ NumericT * p,
+ NumericT omega,
+ NumericT const * s,
+ NumericT * residual,
+ NumericT const * As,
+ NumericT beta,
+ NumericT const * Ap,
+ NumericT const * r0star,
+ NumericT * inner_prod_buffer,
+ unsigned int size)
+{
+ NumericT inner_prod_r_r0star = 0;
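+  // Fused BiCGStab vector updates: x <- x + alpha * p + omega * s,  r <- s - omega * As,
+  // p <- r + beta * (p - omega * Ap); the loop also accumulates the first reduction stage of <r, r0*>.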
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+ {
+ NumericT value_result = result[i];
+ NumericT value_p = p[i];
+ NumericT value_s = s[i];
+ NumericT value_residual = residual[i];
+ NumericT value_As = As[i];
+ NumericT value_Ap = Ap[i];
+ NumericT value_r0star = r0star[i];
+
+ value_result += alpha * value_p + omega * value_s;
+ value_residual = value_s - omega * value_As;
+ value_p = value_residual + beta * (value_p - omega * value_Ap);
+
+ result[i] = value_result;
+ residual[i] = value_residual;
+ p[i] = value_p;
+ inner_prod_r_r0star += value_residual * value_r0star;
+ }
+
+ // parallel reduction in work group
+ __shared__ NumericT shared_array[256];
+ shared_array[threadIdx.x] = inner_prod_r_r0star;
+ for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
+ }
+
+ // write results to result array
+ if (threadIdx.x == 0)
+ inner_prod_buffer[blockIdx.x] = shared_array[0];
+}
+
+
+template<typename NumericT>
+void pipelined_bicgstab_vector_update(vector_base<NumericT> & result, NumericT alpha, vector_base<NumericT> & p, NumericT omega, vector_base<NumericT> const & s,
+ vector_base<NumericT> & residual, vector_base<NumericT> const & As,
+ NumericT beta, vector_base<NumericT> const & Ap,
+ vector_base<NumericT> const & r0star,
+ vector_base<NumericT> & inner_prod_buffer, vcl_size_t buffer_chunk_size)
+{
+ (void)buffer_chunk_size;
+ unsigned int size = static_cast<unsigned int>(result.size());
+
+ pipelined_bicgstab_vector_kernel<<<256, 256>>>(viennacl::cuda_arg(result),
+ alpha,
+ viennacl::cuda_arg(p),
+ omega,
+ viennacl::cuda_arg(s),
+ viennacl::cuda_arg(residual),
+ viennacl::cuda_arg(As),
+ beta,
+ viennacl::cuda_arg(Ap),
+ viennacl::cuda_arg(r0star),
+ viennacl::cuda_arg(inner_prod_buffer),
+ size);
+ VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_vector_kernel");
+}
+
+
+
+//
+// Compressed matrix
+//
+
+
+template<unsigned int SubWarpSizeV, typename NumericT>
+__global__ void pipelined_bicgstab_csr_vec_mul_blocked_kernel(
+ const unsigned int * row_indices,
+ const unsigned int * column_indices,
+ const NumericT * elements,
+ const NumericT * p,
+ NumericT * Ap,
+ const NumericT * r0star,
+ unsigned int size,
+ NumericT * inner_prod_buffer,
+ unsigned int buffer_size,
+ unsigned int buffer_offset)
+{
+ __shared__ NumericT shared_elements[256];
+ NumericT inner_prod_ApAp = 0;
+ NumericT inner_prod_pAp = 0;
+ NumericT inner_prod_r0Ap = 0;
+
+ const unsigned int id_in_row = threadIdx.x % SubWarpSizeV;
+ const unsigned int block_increment = blockDim.x * ((size - 1) / (gridDim.x * blockDim.x) + 1);
+ const unsigned int block_start = blockIdx.x * block_increment;
+ const unsigned int block_stop = min(block_start + block_increment, size);
+
+ for (unsigned int row = block_start + threadIdx.x / SubWarpSizeV;
+ row < block_stop;
+ row += blockDim.x / SubWarpSizeV)
+ {
+ NumericT dot_prod = NumericT(0);
+ unsigned int row_end = row_indices[row+1];
+ for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += SubWarpSizeV)
+ dot_prod += elements[i] * p[column_indices[i]];
+
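+    // XOR butterfly reduction within each sub-warp: after log2(SubWarpSizeV) steps every thread of the
+    // sub-warp holds the full row sum. This relies on the lock-step execution of threads within a warp;
+    // architectures with independent thread scheduling would additionally require __syncwarp().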
+ shared_elements[threadIdx.x] = dot_prod;
+ if (1 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 1];
+ if (2 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 2];
+ if (4 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 4];
+ if (8 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 8];
+ if (16 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 16];
+
+ if (id_in_row == 0)
+ {
+ Ap[row] = shared_elements[threadIdx.x];
+ inner_prod_ApAp += shared_elements[threadIdx.x] * shared_elements[threadIdx.x];
+ inner_prod_pAp += p[row] * shared_elements[threadIdx.x];
+ inner_prod_r0Ap += r0star[row] * shared_elements[threadIdx.x];
+ }
+ }
+
+ ////////// parallel reduction in work group
+ __shared__ NumericT shared_array_ApAp[256];
+ __shared__ NumericT shared_array_pAp[256];
+ __shared__ NumericT shared_array_r0Ap[256];
+ shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
+ shared_array_pAp[threadIdx.x] = inner_prod_pAp;
+ shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
+ for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ {
+ shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
+ shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
+ shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
+ }
+ }
+
+ // write results to result array
+ if (threadIdx.x == 0) {
+ inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
+ inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
+ inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
+ }
+
+}
+
+
+template<typename NumericT>
+__global__ void pipelined_bicgstab_csr_vec_mul_adaptive_kernel(
+ const unsigned int * row_indices,
+ const unsigned int * column_indices,
+ const unsigned int * row_blocks,
+ const NumericT * elements,
+ unsigned int num_blocks,
+ const NumericT * p,
+ NumericT * Ap,
+ const NumericT * r0star,
+ unsigned int size,
+ NumericT * inner_prod_buffer,
+ unsigned int buffer_size,
+ unsigned int buffer_offset)
+{
+ NumericT inner_prod_ApAp = 0;
+ NumericT inner_prod_pAp = 0;
+ NumericT inner_prod_r0Ap = 0;
+
+ __shared__ NumericT shared_elements[1024];
+
+ for (unsigned int block_id = blockIdx.x; block_id < num_blocks; block_id += gridDim.x)
+ {
+ unsigned int row_start = row_blocks[block_id];
+ unsigned int row_stop = row_blocks[block_id + 1];
+ unsigned int element_start = row_indices[row_start];
+ unsigned int element_stop = row_indices[row_stop];
+ unsigned int rows_to_process = row_stop - row_start;
+
+ if (rows_to_process > 1) // CSR stream with one thread per row
+ {
+ // load to shared buffer:
+ for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
+ shared_elements[i - element_start] = elements[i] * p[column_indices[i]];
+
+ __syncthreads();
+
+ // use one thread per row to sum:
+ for (unsigned int row = row_start + threadIdx.x; row < row_stop; row += blockDim.x)
+ {
+ NumericT dot_prod = 0;
+ unsigned int thread_row_start = row_indices[row] - element_start;
+ unsigned int thread_row_stop = row_indices[row + 1] - element_start;
+ for (unsigned int i = thread_row_start; i < thread_row_stop; ++i)
+ dot_prod += shared_elements[i];
+ Ap[row] = dot_prod;
+ inner_prod_ApAp += dot_prod * dot_prod;
+ inner_prod_pAp += p[row] * dot_prod;
+ inner_prod_r0Ap += r0star[row] * dot_prod;
+ }
+ }
+    // TODO: Consider a CSR-vector approach for two to four rows as well (cf. the OpenCL implementation; experience on Fermi suggests that this may not be necessary)
+ else // CSR vector for a single row
+ {
+ // load and sum to shared buffer:
+ shared_elements[threadIdx.x] = 0;
+ for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
+ shared_elements[threadIdx.x] += elements[i] * p[column_indices[i]];
+
+ // reduction to obtain final result
+ for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ shared_elements[threadIdx.x] += shared_elements[threadIdx.x+stride];
+ }
+
+ if (threadIdx.x == 0)
+ {
+ Ap[row_start] = shared_elements[0];
+ inner_prod_ApAp += shared_elements[0] * shared_elements[0];
+ inner_prod_pAp += p[row_start] * shared_elements[0];
+ inner_prod_r0Ap += r0star[row_start] * shared_elements[0];
+ }
+ }
+
+ __syncthreads(); // avoid race conditions
+ }
+
+ ////////// parallel reduction in work group
+ __shared__ NumericT shared_array_ApAp[256];
+ __shared__ NumericT shared_array_pAp[256];
+ __shared__ NumericT shared_array_r0Ap[256];
+ shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
+ shared_array_pAp[threadIdx.x] = inner_prod_pAp;
+ shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
+ for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ {
+ shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
+ shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
+ shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
+ }
+ }
+
+ // write results to result array
+ if (threadIdx.x == 0) {
+ inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
+ inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
+ inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
+ }
+}
+
+
+
+
+template<typename NumericT>
+void pipelined_bicgstab_prod(compressed_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> const & r0star,
+ vector_base<NumericT> & inner_prod_buffer,
+ vcl_size_t buffer_chunk_size,
+ vcl_size_t buffer_chunk_offset)
+{
+ unsigned int vec_size = static_cast<unsigned int>(viennacl::traits::size(p));
+ unsigned int chunk_size = static_cast<unsigned int>(buffer_chunk_size);
+ unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 500
+ if (double(A.nnz()) / double(A.size1()) > 6.4) // less than 10% of threads expected to idle
+ {
+ pipelined_bicgstab_csr_vec_mul_blocked_kernel<8, NumericT><<<256, 256>>>( // experience on a GTX 750 Ti suggests that 8 is a substantially better choice here
+#else
+ if (double(A.nnz()) / double(A.size1()) > 12.0) // less than 25% of threads expected to idle
+ {
+ pipelined_bicgstab_csr_vec_mul_blocked_kernel<16, NumericT><<<256, 256>>>( // Fermi and Kepler prefer 16 threads per row (half-warp)
+#endif
+ viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ viennacl::cuda_arg<NumericT>(A.handle()),
+ viennacl::cuda_arg(p),
+ viennacl::cuda_arg(Ap),
+ viennacl::cuda_arg(r0star),
+ vec_size,
+ viennacl::cuda_arg(inner_prod_buffer),
+ chunk_size,
+ chunk_offset
+ );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_csr_vec_mul_blocked_kernel");
+ }
+ else
+ {
+ pipelined_bicgstab_csr_vec_mul_adaptive_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ viennacl::cuda_arg<unsigned int>(A.handle3()),
+ viennacl::cuda_arg<NumericT>(A.handle()),
+ static_cast<unsigned int>(A.blocks1()),
+ viennacl::cuda_arg(p),
+ viennacl::cuda_arg(Ap),
+ viennacl::cuda_arg(r0star),
+ vec_size,
+ viennacl::cuda_arg(inner_prod_buffer),
+ chunk_size,
+ chunk_offset);
+ VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_csr_vec_mul_adaptive_kernel");
+ }
+}
+
+
+//
+// Coordinate Matrix
+//
+
+
+template<typename NumericT>
+__global__ void pipelined_bicgstab_coo_vec_mul_kernel(const unsigned int * coords, //(row_index, column_index)
+ const NumericT * elements,
+ const unsigned int * group_boundaries,
+ const NumericT * p,
+ NumericT * Ap,
+ const NumericT * r0star,
+ unsigned int size,
+ NumericT * inner_prod_buffer,
+ unsigned int buffer_size,
+ unsigned int buffer_offset)
+{
+ NumericT inner_prod_ApAp = 0;
+ NumericT inner_prod_pAp = 0;
+ NumericT inner_prod_r0Ap = 0;
+ __shared__ unsigned int shared_rows[128];
+ __shared__ NumericT inter_results[128];
+
+ uint2 tmp;
+ NumericT val;
+ unsigned int group_start = group_boundaries[blockIdx.x];
+ unsigned int group_end = group_boundaries[blockIdx.x + 1];
+ unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0; // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
+
+ unsigned int local_index = 0;
+
+ for (unsigned int k = 0; k < k_end; ++k)
+ {
+ local_index = group_start + k * blockDim.x + threadIdx.x;
+
+ tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
+ val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0;
+
+ //check for carry from previous loop run:
+ if (threadIdx.x == 0 && k > 0)
+ {
+ if (tmp.x == shared_rows[blockDim.x-1])
+ val += inter_results[blockDim.x-1];
+ else
+ {
+ NumericT Ap_entry = inter_results[blockDim.x-1];
+ Ap[shared_rows[blockDim.x-1]] = Ap_entry;
+ inner_prod_ApAp += Ap_entry * Ap_entry;
+ inner_prod_pAp += Ap_entry * p[shared_rows[blockDim.x-1]];
+ inner_prod_r0Ap += r0star[shared_rows[blockDim.x-1]] * Ap_entry;
+ }
+ }
+
+ //segmented parallel reduction begin
+ __syncthreads();
+ shared_rows[threadIdx.x] = tmp.x;
+ inter_results[threadIdx.x] = val;
+ NumericT left = 0;
+ __syncthreads();
+
+ for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
+ {
+ left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
+ __syncthreads();
+ inter_results[threadIdx.x] += left;
+ __syncthreads();
+ }
+ //segmented parallel reduction end
+
+ if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
+ shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
+ {
+ NumericT Ap_entry = inter_results[threadIdx.x];
+ Ap[tmp.x] = Ap_entry;
+ inner_prod_ApAp += Ap_entry * Ap_entry;
+ inner_prod_pAp += Ap_entry * p[tmp.x];
+ inner_prod_r0Ap += r0star[tmp.x] * Ap_entry;
+ }
+
+ __syncthreads();
+ } //for k
+
+ if (local_index + 1 == group_end)
+ {
+ NumericT Ap_entry = inter_results[threadIdx.x];
+ Ap[tmp.x] = Ap_entry;
+ inner_prod_ApAp += Ap_entry * Ap_entry;
+ inner_prod_pAp += Ap_entry * p[tmp.x];
+ inner_prod_r0Ap += Ap_entry * r0star[tmp.x];
+ }
+
+ ////////// parallel reduction in work group
+ __shared__ NumericT shared_array_ApAp[256];
+ __shared__ NumericT shared_array_pAp[256];
+ __shared__ NumericT shared_array_r0Ap[256];
+ shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
+ shared_array_pAp[threadIdx.x] = inner_prod_pAp;
+ shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
+ for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ {
+ shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
+ shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
+ shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
+ }
+ }
+
+ // write results to result array
+ if (threadIdx.x == 0) {
+ inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
+ inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
+ inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
+ }
+
+}
+
+
+template<typename NumericT>
+void pipelined_bicgstab_prod(coordinate_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> const & r0star,
+ vector_base<NumericT> & inner_prod_buffer,
+ vcl_size_t buffer_chunk_size,
+ vcl_size_t buffer_chunk_offset)
+{
+ unsigned int vec_size = static_cast<unsigned int>(viennacl::traits::size(p));
+ unsigned int chunk_size = static_cast<unsigned int>(buffer_chunk_size);
+ unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
+
+ Ap.clear();
+
+ pipelined_bicgstab_coo_vec_mul_kernel<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle12()),
+ viennacl::cuda_arg<NumericT>(A.handle()),
+ viennacl::cuda_arg<unsigned int>(A.handle3()),
+ viennacl::cuda_arg(p),
+ viennacl::cuda_arg(Ap),
+ viennacl::cuda_arg(r0star),
+ vec_size,
+ viennacl::cuda_arg(inner_prod_buffer),
+ chunk_size,
+ chunk_offset);
+ VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_coo_vec_mul_kernel");
+}
+
+
+
+//
+// ELL Matrix
+//
+
+template<typename NumericT>
+__global__ void pipelined_bicgstab_ell_vec_mul_kernel(const unsigned int * coords,
+ const NumericT * elements,
+ unsigned int internal_row_num,
+ unsigned int items_per_row,
+ const NumericT * p,
+ NumericT * Ap,
+ const NumericT * r0star,
+ unsigned int size,
+ NumericT * inner_prod_buffer,
+ unsigned int buffer_size,
+ unsigned int buffer_offset)
+{
+ NumericT inner_prod_ApAp = 0;
+ NumericT inner_prod_pAp = 0;
+ NumericT inner_prod_r0Ap = 0;
+ unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
+ unsigned int glb_sz = gridDim.x * blockDim.x;
+
+ for (unsigned int row = glb_id; row < size; row += glb_sz)
+ {
+ NumericT sum = 0;
+
+ unsigned int offset = row;
+ for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
+ {
+ NumericT val = elements[offset];
+ sum += val ? p[coords[offset]] * val : NumericT(0);
+ }
+
+ Ap[row] = sum;
+ inner_prod_ApAp += sum * sum;
+ inner_prod_pAp += sum * p[row];
+ inner_prod_r0Ap += sum * r0star[row];
+ }
+
+ ////////// parallel reduction in work group
+ __shared__ NumericT shared_array_ApAp[256];
+ __shared__ NumericT shared_array_pAp[256];
+ __shared__ NumericT shared_array_r0Ap[256];
+ shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
+ shared_array_pAp[threadIdx.x] = inner_prod_pAp;
+ shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
+ for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ {
+ shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
+ shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
+ shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
+ }
+ }
+
+ // write results to result array
+ if (threadIdx.x == 0) {
+ inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
+ inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
+ inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
+ }
+}
+
+
+template<typename NumericT>
+void pipelined_bicgstab_prod(ell_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> const & r0star,
+ vector_base<NumericT> & inner_prod_buffer,
+ vcl_size_t buffer_chunk_size,
+ vcl_size_t buffer_chunk_offset)
+{
+ unsigned int vec_size = static_cast<unsigned int>(viennacl::traits::size(p));
+ unsigned int chunk_size = static_cast<unsigned int>(buffer_chunk_size);
+ unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
+
+ pipelined_bicgstab_ell_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()),
+ viennacl::cuda_arg<NumericT>(A.handle()),
+ static_cast<unsigned int>(A.internal_size1()),
+ static_cast<unsigned int>(A.maxnnz()),
+ viennacl::cuda_arg(p),
+ viennacl::cuda_arg(Ap),
+ viennacl::cuda_arg(r0star),
+ vec_size,
+ viennacl::cuda_arg(inner_prod_buffer),
+ chunk_size,
+ chunk_offset);
+ VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_ell_vec_mul_kernel");
+}
+
+
+//
+// SELL-C-\sigma Matrix
+//
+
+template<typename NumericT>
+__global__ void pipelined_bicgstab_sliced_ell_vec_mul_kernel(const unsigned int * columns_per_block,
+ const unsigned int * column_indices,
+ const unsigned int * block_start,
+ const NumericT * elements,
+ const NumericT * p,
+ NumericT * Ap,
+ const NumericT * r0star,
+ unsigned int size,
+ unsigned int block_size,
+ NumericT * inner_prod_buffer,
+ unsigned int buffer_size,
+ unsigned int buffer_offset)
+{
+ NumericT inner_prod_ApAp = 0;
+ NumericT inner_prod_pAp = 0;
+ NumericT inner_prod_r0Ap = 0;
+
+ unsigned int blocks_per_threadblock = blockDim.x / block_size;
+ unsigned int id_in_block = threadIdx.x % block_size;
+ unsigned int num_blocks = (size - 1) / block_size + 1;
+ unsigned int global_warp_count = blocks_per_threadblock * gridDim.x;
+ unsigned int global_warp_id = blocks_per_threadblock * blockIdx.x + threadIdx.x / block_size;
+
+ for (unsigned int block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count)
+ {
+ unsigned int row = block_idx * block_size + id_in_block;
+ unsigned int offset = block_start[block_idx];
+ unsigned int num_columns = columns_per_block[block_idx];
+
+ NumericT sum = 0;
+ for (unsigned int item_id = 0; item_id < num_columns; item_id++)
+ {
+ unsigned int index = offset + item_id * block_size + id_in_block;
+ NumericT val = elements[index];
+
+ sum += val ? (p[column_indices[index]] * val) : 0;
+ }
+
+ if (row < size)
+ {
+ Ap[row] = sum;
+ inner_prod_ApAp += sum * sum;
+ inner_prod_pAp += sum * p[row];
+ inner_prod_r0Ap += sum * r0star[row];
+ }
+ }
+
+ ////////// parallel reduction in work group
+ __shared__ NumericT shared_array_ApAp[256];
+ __shared__ NumericT shared_array_pAp[256];
+ __shared__ NumericT shared_array_r0Ap[256];
+ shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
+ shared_array_pAp[threadIdx.x] = inner_prod_pAp;
+ shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
+ for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ {
+ shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
+ shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
+ shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
+ }
+ }
+
+ // write results to result array
+ if (threadIdx.x == 0) {
+ inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
+ inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
+ inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
+ }
+}
+
+template<typename NumericT>
+void pipelined_bicgstab_prod(sliced_ell_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> const & r0star,
+ vector_base<NumericT> & inner_prod_buffer,
+ vcl_size_t buffer_chunk_size,
+ vcl_size_t buffer_chunk_offset)
+{
+ unsigned int vec_size = static_cast<unsigned int>(viennacl::traits::size(p));
+ unsigned int chunk_size = static_cast<unsigned int>(buffer_chunk_size);
+ unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
+
+ pipelined_bicgstab_sliced_ell_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ viennacl::cuda_arg<unsigned int>(A.handle3()),
+ viennacl::cuda_arg<NumericT>(A.handle()),
+ viennacl::cuda_arg(p),
+ viennacl::cuda_arg(Ap),
+ viennacl::cuda_arg(r0star),
+ vec_size,
+ static_cast<unsigned int>(A.rows_per_block()),
+ viennacl::cuda_arg(inner_prod_buffer),
+ chunk_size,
+ chunk_offset);
+ VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_sliced_ell_vec_mul_kernel");
+}
+
+
+//
+// Hybrid Matrix
+//
+
+
+template<typename NumericT>
+__global__ void pipelined_bicgstab_hyb_vec_mul_kernel(const unsigned int * ell_coords,
+ const NumericT * ell_elements,
+ const unsigned int * csr_rows,
+ const unsigned int * csr_cols,
+ const NumericT * csr_elements,
+ unsigned int internal_row_num,
+ unsigned int items_per_row,
+ const NumericT * p,
+ NumericT * Ap,
+ const NumericT * r0star,
+ unsigned int size,
+ NumericT * inner_prod_buffer,
+ unsigned int buffer_size,
+ unsigned int buffer_offset)
+{
+ NumericT inner_prod_ApAp = 0;
+ NumericT inner_prod_pAp = 0;
+ NumericT inner_prod_r0Ap = 0;
+ unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
+ unsigned int glb_sz = gridDim.x * blockDim.x;
+
+ for (unsigned int row = glb_id; row < size; row += glb_sz)
+ {
+ NumericT sum = 0;
+
+ unsigned int offset = row;
+ for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
+ {
+ NumericT val = ell_elements[offset];
+
+ sum += val ? p[ell_coords[offset]] * val : NumericT(0);
+ }
+
+ unsigned int col_begin = csr_rows[row];
+ unsigned int col_end = csr_rows[row + 1];
+
+ for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
+ {
+ sum += p[csr_cols[item_id]] * csr_elements[item_id];
+ }
+
+ Ap[row] = sum;
+ inner_prod_ApAp += sum * sum;
+ inner_prod_pAp += sum * p[row];
+ inner_prod_r0Ap += sum * r0star[row];
+ }
+
+ ////////// parallel reduction in work group
+ __shared__ NumericT shared_array_ApAp[256];
+ __shared__ NumericT shared_array_pAp[256];
+ __shared__ NumericT shared_array_r0Ap[256];
+ shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
+ shared_array_pAp[threadIdx.x] = inner_prod_pAp;
+ shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
+ for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ {
+ shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
+ shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
+ shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
+ }
+ }
+
+ // write results to result array
+ if (threadIdx.x == 0) {
+ inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
+ inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
+ inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
+ }
+}
+
+
+
+template<typename NumericT>
+void pipelined_bicgstab_prod(hyb_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> const & r0star,
+ vector_base<NumericT> & inner_prod_buffer,
+ vcl_size_t buffer_chunk_size,
+ vcl_size_t buffer_chunk_offset)
+{
+ unsigned int vec_size = static_cast<unsigned int>(viennacl::traits::size(p));
+ unsigned int chunk_size = static_cast<unsigned int>(buffer_chunk_size);
+ unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
+
+ pipelined_bicgstab_hyb_vec_mul_kernel<<<256, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()),
+ viennacl::cuda_arg<NumericT>(A.handle()),
+ viennacl::cuda_arg<unsigned int>(A.handle3()),
+ viennacl::cuda_arg<unsigned int>(A.handle4()),
+ viennacl::cuda_arg<NumericT>(A.handle5()),
+ static_cast<unsigned int>(A.internal_size1()),
+ static_cast<unsigned int>(A.ell_nnz()),
+ viennacl::cuda_arg(p),
+ viennacl::cuda_arg(Ap),
+ viennacl::cuda_arg(r0star),
+ vec_size,
+ viennacl::cuda_arg(inner_prod_buffer),
+ chunk_size,
+ chunk_offset);
+ VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_hyb_vec_mul_kernel");
+}
+
+//////////////////////////////////////////
+
+template <typename T>
+__global__ void pipelined_gmres_normalize_vk_kernel(T * vk,
+ unsigned int vk_offset,
+ T const * residual,
+ T * R_buffer,
+ unsigned int R_offset,
+ T const * inner_prod_buffer,
+ unsigned int chunk_size,
+ T * r_dot_vk_buffer,
+ unsigned int chunk_offset,
+ unsigned int size)
+{
+ __shared__ T shared_array[128];
+ T norm_vk = 0;
+
+ // parallel reduction in work group to compute <vk, vk>
+ shared_array[threadIdx.x] = inner_prod_buffer[threadIdx.x + chunk_size];
+ for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
+ }
+
+  // compute the norm ||v_k|| from the reduced values:
+ __syncthreads();
+ norm_vk = sqrt(shared_array[0]);
+
+ T inner_prod_contrib = 0;
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) {
+ T value_vk = vk[i + vk_offset] / norm_vk;
+
+ inner_prod_contrib += residual[i] * value_vk;
+
+ vk[i + vk_offset] = value_vk;
+ }
+ __syncthreads();
+
+ // parallel reduction in work group
+ shared_array[threadIdx.x] = inner_prod_contrib;
+ for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
+ }
+
+ // write results of first reduction stage:
+ if (threadIdx.x == 0)
+ r_dot_vk_buffer[blockIdx.x + chunk_offset] = shared_array[0];
+ // store norm:
+ if (blockDim.x * blockIdx.x + threadIdx.x == 0)
+ R_buffer[R_offset] = norm_vk;
+}
+
+/** @brief Performs a vector normalization needed for an efficient pipelined GMRES algorithm.
+ *
+ * This routine computes, for the vectors 'r' and 'v_k':
+ * Second reduction step for ||v_k||
+ * v_k /= ||v_k||
+ * First reduction step for <r, v_k>
+ */
+template <typename T>
+void pipelined_gmres_normalize_vk(vector_base<T> & v_k,
+ vector_base<T> const & residual,
+ vector_base<T> & R_buffer,
+ vcl_size_t offset_in_R,
+ vector_base<T> const & inner_prod_buffer,
+ vector_base<T> & r_dot_vk_buffer,
+ vcl_size_t buffer_chunk_size,
+ vcl_size_t buffer_chunk_offset)
+{
+ unsigned int vk_offset = viennacl::traits::start(v_k);
+ unsigned int R_offset = offset_in_R;
+ unsigned int chunk_size = buffer_chunk_size;
+ unsigned int chunk_offset = buffer_chunk_offset;
+ unsigned int size = v_k.size();
+
+ pipelined_gmres_normalize_vk_kernel<<<128, 128>>>(viennacl::cuda_arg(v_k),
+ vk_offset,
+ viennacl::cuda_arg(residual),
+ viennacl::cuda_arg(R_buffer),
+ R_offset,
+ viennacl::cuda_arg(inner_prod_buffer),
+ chunk_size,
+ viennacl::cuda_arg(r_dot_vk_buffer),
+ chunk_offset,
+ size);
+ VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_normalize_vk_kernel");
+}
+
+
+
+template <typename T>
+__global__ void pipelined_gmres_gram_schmidt_stage1_kernel(T const * krylov_basis,
+ unsigned int size,
+ unsigned int internal_size,
+ unsigned int k,
+ T * vi_in_vk_buffer,
+ unsigned int chunk_size)
+{
+ __shared__ T shared_array[7*128];
+ T value_vk = 0;
+
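+  // Each pass of the loop below handles up to 7 of the k previous Krylov vectors: the partial inner
+  // products <v_i, v_k> are accumulated in shared memory (7 slots of chunk_size entries each) and one
+  // partial sum per thread block is written to vi_in_vk_buffer; the final reduction over thread blocks
+  // happens in the Gram-Schmidt stage-2 kernel.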
+ unsigned int k_base = 0;
+ while (k_base < k)
+ {
+ unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base);
+
+ for (unsigned int j=0; j<vecs_in_iteration; ++j)
+ shared_array[threadIdx.x + j*chunk_size] = 0;
+
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+ {
+ value_vk = krylov_basis[i + k * internal_size];
+
+ for (unsigned int j=0; j<vecs_in_iteration; ++j)
+ shared_array[threadIdx.x + j*chunk_size] += value_vk * krylov_basis[i + (k_base + j) * internal_size];
+ }
+
+ // parallel reduction in work group
+ for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride) {
+ for (unsigned int j=0; j<vecs_in_iteration; ++j)
+ shared_array[threadIdx.x + j*chunk_size] += shared_array[threadIdx.x + j*chunk_size + stride];
+ }
+ }
+
+ // write results to result array
+ if (threadIdx.x == 0)
+ for (unsigned int j=0; j<vecs_in_iteration; ++j)
+ vi_in_vk_buffer[blockIdx.x + (k_base + j) * chunk_size] = shared_array[j*chunk_size];
+
+ k_base += vecs_in_iteration;
+ }
+
+}
+
+template <typename T>
+void pipelined_gmres_gram_schmidt_stage1(vector_base<T> const & device_krylov_basis,
+ vcl_size_t v_k_size,
+ vcl_size_t v_k_internal_size,
+ vcl_size_t param_k,
+ vector_base<T> & vi_in_vk_buffer,
+ vcl_size_t buffer_chunk_size)
+{
+ unsigned int chunk_size = buffer_chunk_size;
+ unsigned int size = v_k_size;
+ unsigned int internal_size = v_k_internal_size;
+ unsigned int k = param_k;
+
+ pipelined_gmres_gram_schmidt_stage1_kernel<<<128, 128>>>(viennacl::cuda_arg(device_krylov_basis),
+ size,
+ internal_size,
+ k,
+ viennacl::cuda_arg(vi_in_vk_buffer),
+ chunk_size);
+ VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_gram_schmidt_stage1_kernel");
+}
+
+
+
+
+template <typename T>
+__global__ void pipelined_gmres_gram_schmidt_stage2_kernel(T * krylov_basis,
+ unsigned int size,
+ unsigned int internal_size,
+ unsigned int k,
+ T const * vi_in_vk_buffer,
+ unsigned int chunk_size,
+ T * R_buffer,
+ unsigned int krylov_dim,
+ T * inner_prod_buffer)
+{
+ __shared__ T shared_array[7*128];
+ T vk_dot_vk = 0;
+ T value_vk = 0;
+
+ unsigned int k_base = 0;
+ while (k_base < k)
+ {
+ unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base);
+
+ // parallel reduction in work group for <v_i, v_k>
+ for (unsigned int j=0; j<vecs_in_iteration; ++j)
+ shared_array[threadIdx.x + j*chunk_size] = vi_in_vk_buffer[threadIdx.x + (k_base + j) * chunk_size];
+ for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride) {
+ for (unsigned int j=0; j<vecs_in_iteration; ++j)
+ shared_array[threadIdx.x + j*chunk_size] += shared_array[threadIdx.x + j*chunk_size + stride];
+ }
+ }
+ __syncthreads();
+
+ // v_k -= <v_i, v_k> v_i:
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+ {
+ value_vk = krylov_basis[i + k * internal_size];
+
+ for (unsigned int j=0; j<vecs_in_iteration; ++j)
+ value_vk -= shared_array[j*chunk_size] * krylov_basis[i + (k_base + j) * internal_size];
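+      // accumulate ||v_k||^2 only in the last pass, i.e. after all projections onto previous basis vectors have been removed: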
+ vk_dot_vk += (k_base + vecs_in_iteration == k) ? (value_vk * value_vk) : 0;
+ krylov_basis[i + k * internal_size] = value_vk;
+ }
+
+ // write to R: (to avoid thread divergence, all threads write the same value)
+ if (blockIdx.x == 0)
+ for (unsigned int j=0; j<vecs_in_iteration; ++j)
+ R_buffer[(k_base + j) + k*krylov_dim] = shared_array[j*chunk_size];
+ __syncthreads();
+
+ k_base += vecs_in_iteration;
+ }
+
+ // parallel reduction in work group for <v_k, v_k>
+ shared_array[threadIdx.x] = vk_dot_vk;
+ for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
+ }
+
+ // write results to result array
+ if (threadIdx.x == 0)
+ inner_prod_buffer[chunk_size+blockIdx.x] = shared_array[0];
+}
+
+template <typename T>
+void pipelined_gmres_gram_schmidt_stage2(vector_base<T> & device_krylov_basis,
+ vcl_size_t v_k_size,
+ vcl_size_t v_k_internal_size,
+ vcl_size_t param_k,
+ vector_base<T> const & vi_in_vk_buffer,
+ vector_base<T> & R_buffer,
+ vcl_size_t krylov_dim,
+ vector_base<T> & inner_prod_buffer,
+ vcl_size_t buffer_chunk_size)
+{
+ unsigned int chunk_size = buffer_chunk_size;
+ unsigned int size = v_k_size;
+ unsigned int internal_size = v_k_internal_size;
+ unsigned int k = param_k;
+ unsigned int krylov = krylov_dim;
+
+ pipelined_gmres_gram_schmidt_stage2_kernel<<<128, 128>>>(viennacl::cuda_arg(device_krylov_basis),
+ size,
+ internal_size,
+ k,
+ viennacl::cuda_arg(vi_in_vk_buffer),
+ chunk_size,
+ viennacl::cuda_arg(R_buffer),
+ krylov,
+ viennacl::cuda_arg(inner_prod_buffer));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_gram_schmidt_stage2_kernel");
+}
+
+
+
+
+template <typename T>
+__global__ void pipelined_gmres_update_result_kernel(T * result,
+ T const * residual,
+ T const * krylov_basis,
+ unsigned int size,
+ unsigned int internal_size,
+ T const * coefficients,
+ unsigned int k)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+ {
+ T value_result = result[i] + coefficients[0] * residual[i];
+
+ for (unsigned int j = 1; j < k; ++j)
+ value_result += coefficients[j] * krylov_basis[i + (j-1)*internal_size];
+
+ result[i] = value_result;
+ }
+}
+
+template <typename T>
+void pipelined_gmres_update_result(vector_base<T> & result,
+ vector_base<T> const & residual,
+ vector_base<T> const & krylov_basis,
+ vcl_size_t v_k_size,
+ vcl_size_t v_k_internal_size,
+ vector_base<T> const & coefficients,
+ vcl_size_t param_k)
+{
+ unsigned int size = v_k_size;
+ unsigned int internal_size = v_k_internal_size;
+ unsigned int k = param_k;
+
+ pipelined_gmres_update_result_kernel<<<128, 128>>>(viennacl::cuda_arg(result),
+ viennacl::cuda_arg(residual),
+ viennacl::cuda_arg(krylov_basis),
+ size,
+ internal_size,
+ viennacl::cuda_arg(coefficients),
+ k);
+ VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_update_result_kernel");
+}
+
+
+
+template <typename NumericT>
+void pipelined_gmres_prod(compressed_matrix<NumericT> const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> & inner_prod_buffer)
+{
+ unsigned int size = p.size();
+ unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
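+
+  // The GMRES SpMV reuses the pipelined CG kernels; since p and Ap may be sub-ranges within a larger
+  // Krylov-basis buffer, the explicit viennacl::traits::start() offsets are added to the raw pointers below.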
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 500
+ if (double(A.nnz()) / double(A.size1()) > 6.4) // less than 10% of threads expected to idle
+ {
+ pipelined_cg_csr_vec_mul_blocked_kernel<8, NumericT><<<256, 256>>>( // experience on a GTX 750 Ti suggests that 8 is a substantially better choice here
+#else
+ if (double(A.nnz()) / double(A.size1()) > 12.0) // less than 25% of threads expected to idle
+ {
+ pipelined_cg_csr_vec_mul_blocked_kernel<16, NumericT><<<128, 256>>>( // Fermi and Kepler prefer 16 threads per row (half-warp)
+#endif
+ viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ viennacl::cuda_arg<NumericT>(A.handle()),
+ viennacl::cuda_arg(p) + viennacl::traits::start(p),
+ viennacl::cuda_arg(Ap) + viennacl::traits::start(Ap),
+ size,
+ viennacl::cuda_arg(inner_prod_buffer),
+ buffer_size_per_vector
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_blocked_kernel");
+ }
+ else
+ {
+ pipelined_cg_csr_vec_mul_adaptive_kernel<<<128, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ viennacl::cuda_arg<unsigned int>(A.handle3()),
+ viennacl::cuda_arg<NumericT>(A.handle()),
+ static_cast<unsigned int>(A.blocks1()),
+ viennacl::cuda_arg(p) + viennacl::traits::start(p),
+ viennacl::cuda_arg(Ap) + viennacl::traits::start(Ap),
+ size,
+ viennacl::cuda_arg(inner_prod_buffer),
+ buffer_size_per_vector);
+ VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_adaptive_kernel");
+ }
+
+}
+
+template <typename T>
+void pipelined_gmres_prod(coordinate_matrix<T> const & A,
+ vector_base<T> const & p,
+ vector_base<T> & Ap,
+ vector_base<T> & inner_prod_buffer)
+{
+ unsigned int size = p.size();
+ unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
+
+ Ap.clear();
+
+ pipelined_cg_coo_vec_mul_kernel<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle12()),
+ viennacl::cuda_arg<T>(A.handle()),
+ viennacl::cuda_arg<unsigned int>(A.handle3()),
+ viennacl::cuda_arg(p) + viennacl::traits::start(p),
+ viennacl::cuda_arg(Ap) + viennacl::traits::start(Ap),
+ size,
+ viennacl::cuda_arg(inner_prod_buffer),
+ buffer_size_per_vector);
+ VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_coo_vec_mul_kernel");
+}
+
+template <typename T>
+void pipelined_gmres_prod(ell_matrix<T> const & A,
+ vector_base<T> const & p,
+ vector_base<T> & Ap,
+ vector_base<T> & inner_prod_buffer)
+{
+ unsigned int size = p.size();
+ unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
+
+ pipelined_cg_ell_vec_mul_kernel<<<128, 256>>>(viennacl::cuda_arg<unsigned int>(A.handle2()),
+ viennacl::cuda_arg<T>(A.handle()),
+ static_cast<unsigned int>(A.internal_size1()),
+ static_cast<unsigned int>(A.maxnnz()),
+ viennacl::cuda_arg(p) + viennacl::traits::start(p),
+ viennacl::cuda_arg(Ap) + viennacl::traits::start(Ap),
+ size,
+ viennacl::cuda_arg(inner_prod_buffer),
+ buffer_size_per_vector);
+ VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_ell_vec_mul_kernel");
+}
+
+template <typename T>
+void pipelined_gmres_prod(sliced_ell_matrix<T> const & A,
+ vector_base<T> const & p,
+ vector_base<T> & Ap,
+ vector_base<T> & inner_prod_buffer)
+{
+ unsigned int size = p.size();
+ unsigned int buffe
<TRUNCATED>
[29/51] [partial] mahout git commit: (nojira) add native-viennaCL
module to codebase. closes apache/mahout#241
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/spgemm_rmerge.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/spgemm_rmerge.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/spgemm_rmerge.hpp
new file mode 100644
index 0000000..6ac8e09
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/spgemm_rmerge.hpp
@@ -0,0 +1,669 @@
+#ifndef VIENNACL_LINALG_CUDA_SPGEMM_RMERGE_HPP_
+#define VIENNACL_LINALG_CUDA_SPGEMM_RMERGE_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/spgemm_rmerge.hpp
+    @brief Implementation of a row-merge based sparse matrix-matrix product (SpGEMM) using CUDA
+*/
+
+#include <stdexcept>
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/cuda/common.hpp"
+
+#include "viennacl/tools/timer.hpp"
+
+#include "viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
+/** @brief Loads a value from the specified address. With CUDA arch 3.5 and above the value is fetched through the read-only data cache via __ldg() for faster repeated reads */
+template<typename NumericT>
+static inline __device__ NumericT load_and_cache(const NumericT *address)
+{
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+ return __ldg(address);
+#else
+ return *address;
+#endif
+}
+
+
+//
+// Stage 1: Obtain upper bound for number of elements per row in C:
+//
+template<typename IndexT>
+__device__ IndexT round_to_next_power_of_2(IndexT val)
+{
+ if (val > 32)
+ return 64; // just to indicate that we need to split/factor the matrix!
+ else if (val > 16)
+ return 32;
+ else if (val > 8)
+ return 16;
+ else if (val > 4)
+ return 8;
+ else if (val > 2)
+ return 4;
+ else if (val > 1)
+ return 2;
+ else
+ return 1;
+}
+
+template<typename IndexT>
+__global__ void compressed_matrix_gemm_stage_1(
+ const IndexT * A_row_indices,
+ const IndexT * A_col_indices,
+ IndexT A_size1,
+ const IndexT * B_row_indices,
+ IndexT *subwarpsize_per_group,
+ IndexT *max_nnz_row_A_per_group,
+ IndexT *max_nnz_row_B_per_group)
+{
+ unsigned int subwarpsize_in_thread = 0;
+ unsigned int max_nnz_row_A = 0;
+ unsigned int max_nnz_row_B = 0;
+
+ unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1;
+ unsigned int row_per_group_end = min(A_size1, rows_per_group * (blockIdx.x + 1));
+
+ for (unsigned int row = rows_per_group * blockIdx.x + threadIdx.x; row < row_per_group_end; row += blockDim.x)
+ {
+ unsigned int A_row_start = A_row_indices[row];
+ unsigned int A_row_end = A_row_indices[row+1];
+ unsigned int row_num = A_row_end - A_row_start;
+ subwarpsize_in_thread = max(A_row_end - A_row_start, subwarpsize_in_thread);
+ max_nnz_row_A = max(max_nnz_row_A, row_num);
+ for (unsigned int j = A_row_start; j < A_row_end; ++j)
+ {
+ unsigned int col = A_col_indices[j];
+ unsigned int row_len_B = B_row_indices[col + 1] - B_row_indices[col];
+ max_nnz_row_B = max(row_len_B, max_nnz_row_B);
+ }
+ }
+
+ // reduction to obtain maximum in thread block
+ __shared__ unsigned int shared_subwarpsize[256];
+ __shared__ unsigned int shared_max_nnz_row_A[256];
+ __shared__ unsigned int shared_max_nnz_row_B[256];
+
+ shared_subwarpsize[threadIdx.x] = subwarpsize_in_thread;
+ shared_max_nnz_row_A[threadIdx.x] = max_nnz_row_A;
+ shared_max_nnz_row_B[threadIdx.x] = max_nnz_row_B;
+ for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ {
+ shared_subwarpsize[threadIdx.x] = max( shared_subwarpsize[threadIdx.x], shared_subwarpsize[threadIdx.x + stride]);
+ shared_max_nnz_row_A[threadIdx.x] = max(shared_max_nnz_row_A[threadIdx.x], shared_max_nnz_row_A[threadIdx.x + stride]);
+ shared_max_nnz_row_B[threadIdx.x] = max(shared_max_nnz_row_B[threadIdx.x], shared_max_nnz_row_B[threadIdx.x + stride]);
+ }
+ }
+
+ if (threadIdx.x == 0)
+ {
+ subwarpsize_per_group[blockIdx.x] = round_to_next_power_of_2(shared_subwarpsize[0]);
+ max_nnz_row_A_per_group[blockIdx.x] = shared_max_nnz_row_A[0];
+ max_nnz_row_B_per_group[blockIdx.x] = shared_max_nnz_row_B[0];
+ }
+}
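
For illustration only (this sketch is not part of the patch): the per-block quantities produced by the kernel above can be written as a sequential reference. The helper name and signature below are made up for the example; the actual reduction over all blocks happens on the host in prod_impl() further down this file.

#include <algorithm>
#include <vector>

// Sequential reference for stage 1 over the rows [row_begin, row_end):
// maximum nonzeros per row of A, maximum nonzeros of the referenced rows of B,
// and the resulting subwarp size (next power of two; anything above 32 maps to 64
// to signal that the matrix has to be split).
struct Stage1Result { unsigned int subwarp_size, max_nnz_A, max_nnz_B; };

inline Stage1Result stage1_reference(std::vector<unsigned int> const & A_rows,
                                     std::vector<unsigned int> const & A_cols,
                                     std::vector<unsigned int> const & B_rows,
                                     unsigned int row_begin, unsigned int row_end)
{
  Stage1Result r = {1, 0, 0};
  for (unsigned int row = row_begin; row < row_end; ++row)
  {
    r.max_nnz_A = std::max(r.max_nnz_A, A_rows[row+1] - A_rows[row]);
    for (unsigned int j = A_rows[row]; j < A_rows[row+1]; ++j)
      r.max_nnz_B = std::max(r.max_nnz_B, B_rows[A_cols[j]+1] - B_rows[A_cols[j]]);
  }
  while (r.subwarp_size < r.max_nnz_A && r.subwarp_size < 64)
    r.subwarp_size *= 2;
  return r;
}
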
+
+//
+// Stage 2: Determine sparsity pattern of C
+//
+
+// Using warp shuffle routines (available for CUDA arch 3.0 and higher)
+template<unsigned int SubWarpSizeV, typename IndexT>
+__device__ IndexT subwarp_minimum_shuffle(IndexT min_index)
+{
+ for (unsigned int i = SubWarpSizeV/2; i >= 1; i /= 2)
+ min_index = min(min_index, __shfl_xor((int)min_index, (int)i));
+ return min_index;
+}
+
+// Using shared memory
+template<unsigned int SubWarpSizeV, typename IndexT>
+__device__ IndexT subwarp_minimum_shared(IndexT min_index, IndexT id_in_warp, IndexT *shared_buffer)
+{
+ shared_buffer[threadIdx.x] = min_index;
+ for (unsigned int i = SubWarpSizeV/2; i >= 1; i /= 2)
+ shared_buffer[threadIdx.x] = min(shared_buffer[threadIdx.x], shared_buffer[(threadIdx.x + i) % 512]);
+ return shared_buffer[threadIdx.x - id_in_warp];
+}
+
+
+template<unsigned int SubWarpSizeV, typename IndexT>
+__global__ void compressed_matrix_gemm_stage_2(
+ const IndexT * A_row_indices,
+ const IndexT * A_col_indices,
+ IndexT A_size1,
+ const IndexT * B_row_indices,
+ const IndexT * B_col_indices,
+ IndexT B_size2,
+ IndexT * C_row_indices)
+{
+ __shared__ unsigned int shared_buffer[512];
+
+ unsigned int num_warps = blockDim.x / SubWarpSizeV;
+ unsigned int warp_id = threadIdx.x / SubWarpSizeV;
+ unsigned int id_in_warp = threadIdx.x % SubWarpSizeV;
+
+ unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1;
+ unsigned int row_per_group_end = min(A_size1, rows_per_group * (blockIdx.x + 1));
+
+ for (unsigned int row = rows_per_group * blockIdx.x + warp_id; row < row_per_group_end; row += num_warps)
+ {
+ unsigned int row_A_start = A_row_indices[row];
+ unsigned int row_A_end = A_row_indices[row+1];
+
+ unsigned int my_row_B = row_A_start + id_in_warp;
+ unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0;
+ unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index) : 0;
+ unsigned int row_B_end = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0;
+
+ unsigned int num_nnz = 0;
+ if (row_A_end - row_A_start > 1) // rows of A with at most one entry are handled by the cheaper else-branch below
+ {
+ unsigned int current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2;
+
+ while (1)
+ {
+ // determine current minimum (warp shuffle)
+ unsigned int min_index = current_front_index;
+ min_index = subwarp_minimum_shared<SubWarpSizeV>(min_index, id_in_warp, shared_buffer);
+
+ if (min_index == B_size2)
+ break;
+
+ // update front:
+ if (current_front_index == min_index)
+ {
+ ++row_B_start;
+ current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2;
+ }
+
+ ++num_nnz;
+ }
+ }
+ else
+ {
+ num_nnz = row_B_end - row_B_start;
+ }
+
+ if (id_in_warp == 0)
+ C_row_indices[row] = num_nnz;
+ }
+
+}
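
In other words, C_row_indices[row] receives the number of distinct column indices occurring in the rows of B that row 'row' of A selects. A plain host-side sketch of that count (illustrative only, std::set-based, not part of the patch):

#include <set>
#include <vector>

// Reference count of nonzeros in row 'row' of C = A * B for CSR inputs.
inline unsigned int c_row_nnz_reference(std::vector<unsigned int> const & A_rows,
                                        std::vector<unsigned int> const & A_cols,
                                        std::vector<unsigned int> const & B_rows,
                                        std::vector<unsigned int> const & B_cols,
                                        unsigned int row)
{
  std::set<unsigned int> cols_in_C;   // union of the column indices of the selected rows of B
  for (unsigned int j = A_rows[row]; j < A_rows[row+1]; ++j)
  {
    unsigned int b_row = A_cols[j];
    cols_in_C.insert(B_cols.begin() + B_rows[b_row], B_cols.begin() + B_rows[b_row+1]);
  }
  return static_cast<unsigned int>(cols_in_C.size());
}
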
+
+
+//
+// Stage 3: Fill C with values
+//
+
+// Using warp shuffle routines (available for CUDA arch 3.0 and higher)
+template<unsigned int SubWarpSizeV, typename NumericT>
+__device__ NumericT subwarp_accumulate_shuffle(NumericT output_value)
+{
+ for (unsigned int i = SubWarpSizeV/2; i >= 1; i /= 2)
+ output_value += __shfl_xor((int)output_value, (int)i);
+ return output_value;
+}
+
+// Using shared memory
+template<unsigned int SubWarpSizeV, typename NumericT>
+__device__ NumericT subwarp_accumulate_shared(NumericT output_value, unsigned int id_in_warp, NumericT *shared_buffer)
+{
+ shared_buffer[threadIdx.x] = output_value;
+ for (unsigned int i = SubWarpSizeV/2; i >= 1; i /= 2)
+ shared_buffer[threadIdx.x] += shared_buffer[(threadIdx.x + i) % 512];
+ return shared_buffer[threadIdx.x - id_in_warp];
+}
+
+
+template<unsigned int SubWarpSizeV, typename IndexT, typename NumericT>
+__global__ void compressed_matrix_gemm_stage_3(
+ const IndexT * A_row_indices,
+ const IndexT * A_col_indices,
+ const NumericT * A_elements,
+ IndexT A_size1,
+ const IndexT * B_row_indices,
+ const IndexT * B_col_indices,
+ const NumericT * B_elements,
+ IndexT B_size2,
+ IndexT const * C_row_indices,
+ IndexT * C_col_indices,
+ NumericT * C_elements)
+{
+ __shared__ unsigned int shared_indices[512];
+ __shared__ NumericT shared_values[512];
+
+ unsigned int num_warps = blockDim.x / SubWarpSizeV;
+ unsigned int warp_id = threadIdx.x / SubWarpSizeV;
+ unsigned int id_in_warp = threadIdx.x % SubWarpSizeV;
+
+ unsigned int rows_per_group = (A_size1 - 1) / gridDim.x + 1;
+ unsigned int row_per_group_end = min(A_size1, rows_per_group * (blockIdx.x + 1));
+
+ for (unsigned int row = rows_per_group * blockIdx.x + warp_id; row < row_per_group_end; row += num_warps)
+ {
+ unsigned int row_A_start = A_row_indices[row];
+ unsigned int row_A_end = A_row_indices[row+1];
+
+ unsigned int my_row_B = row_A_start + ((row_A_end - row_A_start > 1) ? id_in_warp : 0); // special case: single row
+ unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0;
+ unsigned int row_B_start = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index) : 0;
+ unsigned int row_B_end = (my_row_B < row_A_end) ? load_and_cache(B_row_indices + row_B_index + 1) : 0;
+ NumericT val_A = (my_row_B < row_A_end) ? A_elements[my_row_B] : 0;
+
+ unsigned int index_in_C = C_row_indices[row];
+
+ if (row_A_end - row_A_start > 1)
+ {
+ unsigned int current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2;
+ NumericT current_front_value = (row_B_start < row_B_end) ? load_and_cache(B_elements + row_B_start) : 0;
+
+ unsigned int index_buffer = 0;
+ NumericT value_buffer = 0;
+ unsigned int buffer_size = 0;
+ while (1)
+ {
+ // determine current minimum:
+ unsigned int min_index = subwarp_minimum_shared<SubWarpSizeV>(current_front_index, id_in_warp, shared_indices);
+
+ if (min_index == B_size2) // done
+ break;
+
+ // compute entry in C:
+ NumericT output_value = (current_front_index == min_index) ? val_A * current_front_value : 0;
+ output_value = subwarp_accumulate_shared<SubWarpSizeV>(output_value, id_in_warp, shared_values);
+
+ // update front:
+ if (current_front_index == min_index)
+ {
+ ++row_B_start;
+ current_front_index = (row_B_start < row_B_end) ? load_and_cache(B_col_indices + row_B_start) : B_size2;
+ current_front_value = (row_B_start < row_B_end) ? load_and_cache(B_elements + row_B_start) : 0;
+ }
+
+ // write current front to register buffer:
+ index_buffer = (id_in_warp == buffer_size) ? min_index : index_buffer;
+ value_buffer = (id_in_warp == buffer_size) ? output_value : value_buffer;
+ ++buffer_size;
+
+ // flush register buffer via a coalesced write once full:
+ if (buffer_size == SubWarpSizeV)
+ {
+ C_col_indices[index_in_C + id_in_warp] = index_buffer;
+ C_elements[index_in_C + id_in_warp] = value_buffer;
+ }
+
+ index_in_C += (buffer_size == SubWarpSizeV) ? SubWarpSizeV : 0;
+ buffer_size = (buffer_size == SubWarpSizeV) ? 0 : buffer_size;
+ }
+
+ // write remaining entries in register buffer to C:
+ if (id_in_warp < buffer_size)
+ {
+ C_col_indices[index_in_C + id_in_warp] = index_buffer;
+ C_elements[index_in_C + id_in_warp] = value_buffer;
+ }
+ }
+ else // write respective row using the full subwarp:
+ {
+ for (unsigned int i = row_B_start + id_in_warp; i < row_B_end; i += SubWarpSizeV)
+ {
+ C_col_indices[index_in_C + id_in_warp] = load_and_cache(B_col_indices + i);
+ C_elements[index_in_C + id_in_warp] = val_A * load_and_cache(B_elements + i);
+ index_in_C += SubWarpSizeV;
+ }
+ }
+
+ }
+
+}
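
Functionally, the merge above accumulates val_A * B(row_B, col) for every distinct column index reached from row 'row' of A; the register buffer merely batches the results so that C is written in coalesced chunks of SubWarpSizeV entries. A sequential sketch of one output row (illustrative only, not part of the patch):

#include <map>
#include <vector>

// Reference computation of row 'row' of C = A * B for CSR inputs.
template<typename NumericT>
std::map<unsigned int, NumericT>
c_row_reference(std::vector<unsigned int> const & A_rows, std::vector<unsigned int> const & A_cols,
                std::vector<NumericT>     const & A_vals,
                std::vector<unsigned int> const & B_rows, std::vector<unsigned int> const & B_cols,
                std::vector<NumericT>     const & B_vals,
                unsigned int row)
{
  std::map<unsigned int, NumericT> c_row;   // ordered by column index, as the kernel writes them
  for (unsigned int j = A_rows[row]; j < A_rows[row+1]; ++j)
  {
    NumericT     a     = A_vals[j];
    unsigned int b_row = A_cols[j];
    for (unsigned int k = B_rows[b_row]; k < B_rows[b_row+1]; ++k)
      c_row[B_cols[k]] += a * B_vals[k];
  }
  return c_row;
}
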
+
+
+
+//
+// Decomposition kernels:
+//
+template<typename IndexT>
+__global__ void compressed_matrix_gemm_decompose_1(
+ const IndexT * A_row_indices,
+ IndexT A_size1,
+ IndexT max_per_row,
+ IndexT *chunks_per_row)
+{
+ for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_size1; i += blockDim.x * gridDim.x)
+ {
+ IndexT num_entries = A_row_indices[i+1] - A_row_indices[i];
+ chunks_per_row[i] = (num_entries < max_per_row) ? 1 : ((num_entries - 1)/ max_per_row + 1);
+ }
+}
+
+
+template<typename IndexT, typename NumericT>
+__global__ void compressed_matrix_gemm_A2(
+ IndexT * A2_row_indices,
+ IndexT * A2_col_indices,
+ NumericT * A2_elements,
+ IndexT A2_size1,
+ IndexT *new_row_buffer)
+{
+ for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A2_size1; i += blockDim.x * gridDim.x)
+ {
+ unsigned int index_start = new_row_buffer[i];
+ unsigned int index_stop = new_row_buffer[i+1];
+
+ A2_row_indices[i] = index_start;
+
+ for (IndexT j = index_start; j < index_stop; ++j)
+ {
+ A2_col_indices[j] = j;
+ A2_elements[j] = NumericT(1);
+ }
+ }
+
+ // write last entry in row_buffer with global thread 0:
+ if (threadIdx.x == 0 && blockIdx.x == 0)
+ A2_row_indices[A2_size1] = new_row_buffer[A2_size1];
+}
+
+template<typename IndexT, typename NumericT>
+__global__ void compressed_matrix_gemm_G1(
+ IndexT * G1_row_indices,
+ IndexT * G1_col_indices,
+ NumericT * G1_elements,
+ IndexT G1_size1,
+ IndexT const *A_row_indices,
+ IndexT const *A_col_indices,
+ NumericT const *A_elements,
+ IndexT A_size1,
+ IndexT A_nnz,
+ IndexT max_per_row,
+ IndexT *new_row_buffer)
+{
+ // Part 1: Copy column indices and entries:
+ for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_nnz; i += blockDim.x * gridDim.x)
+ {
+ G1_col_indices[i] = A_col_indices[i];
+ G1_elements[i] = A_elements[i];
+ }
+
+ // Part 2: Derive new row indices:
+ for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < A_size1; i += blockDim.x * gridDim.x)
+ {
+ unsigned int old_start = A_row_indices[i];
+ unsigned int new_start = new_row_buffer[i];
+ unsigned int row_chunks = new_row_buffer[i+1] - new_start;
+
+ for (IndexT j=0; j<row_chunks; ++j)
+ G1_row_indices[new_start + j] = old_start + j * max_per_row;
+ }
+
+ // write last entry in row_buffer with global thread 0:
+ if (threadIdx.x == 0 && blockIdx.x == 0)
+ G1_row_indices[G1_size1] = A_row_indices[A_size1];
+}
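
A short worked example of the two decomposition kernels above (numbers chosen purely for illustration): with max_per_row = 32, a row of A holding 70 nonzeros yields chunks_per_row = (70 - 1)/32 + 1 = 3. After the exclusive scan, that row contributes three consecutive rows to G1 with row pointers old_start, old_start + 32, and old_start + 64, each covering at most 32 of the original entries, while the corresponding row of A2 holds three unit entries (one per chunk). Hence A2 * G1 reproduces A exactly, and every row of G1 is short enough for a subwarp of size at most 32.
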
+
+
+
+/** @brief Carries out sparse_matrix-sparse_matrix multiplication for CSR matrices
+*
+* Implementation of the convenience expression C = prod(A, B);
+* Based on computing C(i, :) = A(i, :) * B via merging the respective rows of B
+*
+* @param A Left factor
+* @param B Right factor
+* @param C Result matrix
+*/
+template<class NumericT, unsigned int AlignmentV>
+void prod_impl(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
+ viennacl::compressed_matrix<NumericT, AlignmentV> const & B,
+ viennacl::compressed_matrix<NumericT, AlignmentV> & C)
+{
+ C.resize(A.size1(), B.size2(), false);
+
+ unsigned int blocknum = 256;
+ unsigned int threadnum = 128;
+
+ viennacl::vector<unsigned int> subwarp_sizes(blocknum, viennacl::traits::context(A)); // upper bound for the nonzeros per row encountered for each work group
+ viennacl::vector<unsigned int> max_nnz_row_A(blocknum, viennacl::traits::context(A)); // upper bound for the nonzeros per row encountered for each work group
+ viennacl::vector<unsigned int> max_nnz_row_B(blocknum, viennacl::traits::context(A)); // upper bound for the nonzeros per row encountered for each work group
+
+ //
+ // Stage 1: Determine upper bound for number of nonzeros
+ //
+ compressed_matrix_gemm_stage_1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ static_cast<unsigned int>(A.size1()),
+ viennacl::cuda_arg<unsigned int>(B.handle1()),
+ viennacl::cuda_arg(subwarp_sizes),
+ viennacl::cuda_arg(max_nnz_row_A),
+ viennacl::cuda_arg(max_nnz_row_B)
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_1");
+
+ subwarp_sizes.switch_memory_context(viennacl::context(MAIN_MEMORY));
+ unsigned int * subwarp_sizes_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(subwarp_sizes.handle());
+
+ max_nnz_row_A.switch_memory_context(viennacl::context(MAIN_MEMORY));
+ unsigned int const * max_nnz_row_A_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(max_nnz_row_A.handle());
+
+ max_nnz_row_B.switch_memory_context(viennacl::context(MAIN_MEMORY));
+ unsigned int const * max_nnz_row_B_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(max_nnz_row_B.handle());
+
+ unsigned int max_subwarp_size = 0;
+ //std::cout << "Scratchpad offsets: " << std::endl;
+ for (std::size_t i=0; i<subwarp_sizes.size(); ++i)
+ max_subwarp_size = std::max(max_subwarp_size, subwarp_sizes_ptr[i]);
+ unsigned int A_max_nnz_per_row = 0;
+ for (std::size_t i=0; i<max_nnz_row_A.size(); ++i)
+ A_max_nnz_per_row = std::max(A_max_nnz_per_row, max_nnz_row_A_ptr[i]);
+
+ if (max_subwarp_size > 32)
+ {
+ // determine augmented size:
+ unsigned int max_entries_in_G = 32;
+ if (A_max_nnz_per_row <= 256)
+ max_entries_in_G = 16;
+ if (A_max_nnz_per_row <= 64)
+ max_entries_in_G = 8;
+
+ viennacl::vector<unsigned int> exclusive_scan_helper(A.size1() + 1, viennacl::traits::context(A));
+ compressed_matrix_gemm_decompose_1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+ static_cast<unsigned int>(A.size1()),
+ static_cast<unsigned int>(max_entries_in_G),
+ viennacl::cuda_arg(exclusive_scan_helper)
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_decompose_1");
+
+ viennacl::linalg::exclusive_scan(exclusive_scan_helper);
+ unsigned int augmented_size = exclusive_scan_helper[A.size1()];
+
+ // split A = A2 * G1
+ viennacl::compressed_matrix<NumericT, AlignmentV> A2(A.size1(), augmented_size, augmented_size, viennacl::traits::context(A));
+ viennacl::compressed_matrix<NumericT, AlignmentV> G1(augmented_size, A.size2(), A.nnz(), viennacl::traits::context(A));
+
+ // fill A2:
+ compressed_matrix_gemm_A2<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A2.handle1()),
+ viennacl::cuda_arg<unsigned int>(A2.handle2()),
+ viennacl::cuda_arg<NumericT>(A2.handle()),
+ static_cast<unsigned int>(A2.size1()),
+ viennacl::cuda_arg(exclusive_scan_helper)
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_A2");
+
+ // fill G1:
+ compressed_matrix_gemm_G1<<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(G1.handle1()),
+ viennacl::cuda_arg<unsigned int>(G1.handle2()),
+ viennacl::cuda_arg<NumericT>(G1.handle()),
+ static_cast<unsigned int>(G1.size1()),
+ viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ viennacl::cuda_arg<NumericT>(A.handle()),
+ static_cast<unsigned int>(A.size1()),
+ static_cast<unsigned int>(A.nnz()),
+ static_cast<unsigned int>(max_entries_in_G),
+ viennacl::cuda_arg(exclusive_scan_helper)
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_G1");
+
+ // compute tmp = G1 * B;
+ // C = A2 * tmp;
+ viennacl::compressed_matrix<NumericT, AlignmentV> tmp(G1.size1(), B.size2(), 0, viennacl::traits::context(A));
+ prod_impl(G1, B, tmp); // this runs a standard RMerge without decomposition of G1
+ prod_impl(A2, tmp, C); // this may split A2 again
+ return;
+ }
+
+ //std::cout << "Running RMerge with subwarp size " << max_subwarp_size << std::endl;
+
+ subwarp_sizes.switch_memory_context(viennacl::traits::context(A));
+ max_nnz_row_A.switch_memory_context(viennacl::traits::context(A));
+ max_nnz_row_B.switch_memory_context(viennacl::traits::context(A));
+
+ //
+ // Stage 2: Determine pattern of C
+ //
+
+ if (max_subwarp_size == 32)
+ {
+ compressed_matrix_gemm_stage_2<32><<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ static_cast<unsigned int>(A.size1()),
+ viennacl::cuda_arg<unsigned int>(B.handle1()),
+ viennacl::cuda_arg<unsigned int>(B.handle2()),
+ static_cast<unsigned int>(B.size2()),
+ viennacl::cuda_arg<unsigned int>(C.handle1())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_2");
+ }
+ else if (max_subwarp_size == 16)
+ {
+ compressed_matrix_gemm_stage_2<16><<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ static_cast<unsigned int>(A.size1()),
+ viennacl::cuda_arg<unsigned int>(B.handle1()),
+ viennacl::cuda_arg<unsigned int>(B.handle2()),
+ static_cast<unsigned int>(B.size2()),
+ viennacl::cuda_arg<unsigned int>(C.handle1())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_2");
+ }
+ else
+ {
+ compressed_matrix_gemm_stage_2<8><<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ static_cast<unsigned int>(A.size1()),
+ viennacl::cuda_arg<unsigned int>(B.handle1()),
+ viennacl::cuda_arg<unsigned int>(B.handle2()),
+ static_cast<unsigned int>(B.size2()),
+ viennacl::cuda_arg<unsigned int>(C.handle1())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_2");
+ }
+
+ // exclusive scan on C.handle1(), which ultimately allows allocating the remaining memory for C
+ viennacl::backend::typesafe_host_array<unsigned int> row_buffer(C.handle1(), C.size1() + 1);
+ viennacl::backend::memory_read(C.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
+ unsigned int current_offset = 0;
+ for (std::size_t i=0; i<C.size1(); ++i)
+ {
+ unsigned int tmp = row_buffer[i];
+ row_buffer.set(i, current_offset);
+ current_offset += tmp;
+ }
+ row_buffer.set(C.size1(), current_offset);
+ viennacl::backend::memory_write(C.handle1(), 0, row_buffer.raw_size(), row_buffer.get());
+
+
+ //
+ // Stage 3: Compute entries in C
+ //
+ C.reserve(current_offset, false);
+
+ if (max_subwarp_size == 32)
+ {
+ compressed_matrix_gemm_stage_3<32><<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ viennacl::cuda_arg<NumericT>(A.handle()),
+ static_cast<unsigned int>(A.size1()),
+ viennacl::cuda_arg<unsigned int>(B.handle1()),
+ viennacl::cuda_arg<unsigned int>(B.handle2()),
+ viennacl::cuda_arg<NumericT>(B.handle()),
+ static_cast<unsigned int>(B.size2()),
+ viennacl::cuda_arg<unsigned int>(C.handle1()),
+ viennacl::cuda_arg<unsigned int>(C.handle2()),
+ viennacl::cuda_arg<NumericT>(C.handle())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_3");
+ }
+ else if (max_subwarp_size == 16)
+ {
+ compressed_matrix_gemm_stage_3<16><<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ viennacl::cuda_arg<NumericT>(A.handle()),
+ static_cast<unsigned int>(A.size1()),
+ viennacl::cuda_arg<unsigned int>(B.handle1()),
+ viennacl::cuda_arg<unsigned int>(B.handle2()),
+ viennacl::cuda_arg<NumericT>(B.handle()),
+ static_cast<unsigned int>(B.size2()),
+ viennacl::cuda_arg<unsigned int>(C.handle1()),
+ viennacl::cuda_arg<unsigned int>(C.handle2()),
+ viennacl::cuda_arg<NumericT>(C.handle())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_3");
+ }
+ else
+ {
+ compressed_matrix_gemm_stage_3<8><<<blocknum, threadnum>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ viennacl::cuda_arg<NumericT>(A.handle()),
+ static_cast<unsigned int>(A.size1()),
+ viennacl::cuda_arg<unsigned int>(B.handle1()),
+ viennacl::cuda_arg<unsigned int>(B.handle2()),
+ viennacl::cuda_arg<NumericT>(B.handle()),
+ static_cast<unsigned int>(B.size2()),
+ viennacl::cuda_arg<unsigned int>(C.handle1()),
+ viennacl::cuda_arg<unsigned int>(C.handle2()),
+ viennacl::cuda_arg<NumericT>(C.handle())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_gemm_stage_3");
+ }
+
+}
+
+} // namespace cuda
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
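
For context, a minimal usage sketch of how this kernel chain is reached from user code (illustrative only; it assumes a CUDA-enabled build with VIENNACL_WITH_CUDA, the prod() convenience wrapper from viennacl/linalg/prod.hpp, and the sparse-sparse product overloads of this ViennaCL snapshot):

#include "viennacl/compressed_matrix.hpp"
#include "viennacl/linalg/prod.hpp"

void sparse_gemm_example()
{
  viennacl::compressed_matrix<float> A(1000, 1000), B(1000, 1000), C;
  // ... fill A and B, e.g. via viennacl::copy() from host CSR data ...
  C = viennacl::linalg::prod(A, B);   // dispatches to the stage 1/2/3 RMerge kernels above on CUDA
}
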
[15/51] [partial] mahout git commit: (nojira) add native-viennaCL
module to codebase. closes apache/mahout#241
Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/matrix_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/matrix_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/matrix_operations.hpp
new file mode 100644
index 0000000..c9dec88
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/matrix_operations.hpp
@@ -0,0 +1,1303 @@
+#ifndef VIENNACL_LINALG_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/matrix_operations.hpp
+ @brief Implementations of dense matrix related operations including matrix-vector products.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/linalg/host_based/matrix_operations.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+ #include "viennacl/linalg/opencl/matrix_operations.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+ #include "viennacl/linalg/cuda/matrix_operations.hpp"
+#endif
+
+namespace viennacl
+{
+ namespace linalg
+ {
+
+ template<typename DestNumericT, typename SrcNumericT>
+ void convert(matrix_base<DestNumericT> & dest, matrix_base<SrcNumericT> const & src)
+ {
+ assert(viennacl::traits::size1(dest) == viennacl::traits::size1(src) && bool("Incompatible matrix sizes in m1 = m2 (convert): size1(m1) != size1(m2)"));
+ assert(viennacl::traits::size2(dest) == viennacl::traits::size2(src) && bool("Incompatible matrix sizes in m1 = m2 (convert): size2(m1) != size2(m2)"));
+
+ switch (viennacl::traits::handle(dest).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::convert(dest, src);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::convert(dest, src);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::convert(dest, src);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+ }
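
A minimal usage sketch of the dispatcher above (illustrative only, not part of the patch); the typical use case is a mixed-precision copy between matrices of different element types:

#include "viennacl/matrix.hpp"
#include "viennacl/linalg/matrix_operations.hpp"

void convert_example()
{
  viennacl::matrix<float>  Af(64, 64);
  viennacl::matrix<double> Ad(64, 64);
  // ... fill Af ...
  viennacl::linalg::convert(Ad, Af);   // element-wise float -> double conversion on the active backend
}
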
+
+ template<typename NumericT,
+ typename SizeT, typename DistanceT>
+ void trans(const matrix_expression<const matrix_base<NumericT, SizeT, DistanceT>,const matrix_base<NumericT, SizeT, DistanceT>, op_trans> & proxy,
+ matrix_base<NumericT> & temp_trans)
+ {
+ switch (viennacl::traits::handle(proxy).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::trans(proxy, temp_trans);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::trans(proxy,temp_trans);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::trans(proxy,temp_trans);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+ }
+
+
+ template<typename NumericT,
+ typename ScalarType1>
+ void am(matrix_base<NumericT> & mat1,
+ matrix_base<NumericT> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
+ {
+ switch (viennacl::traits::handle(mat1).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::am(mat1, mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::am(mat1, mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::am(mat1, mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+ }
+
+
+ template<typename NumericT,
+ typename ScalarType1, typename ScalarType2>
+ void ambm(matrix_base<NumericT> & mat1,
+ matrix_base<NumericT> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+ matrix_base<NumericT> const & mat3, ScalarType2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
+ {
+ switch (viennacl::traits::handle(mat1).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::ambm(mat1,
+ mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+ mat3, beta, len_beta, reciprocal_beta, flip_sign_beta);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::ambm(mat1,
+ mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+ mat3, beta, len_beta, reciprocal_beta, flip_sign_beta);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::ambm(mat1,
+ mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+ mat3, beta, len_beta, reciprocal_beta, flip_sign_beta);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+ }
+
+
+ template<typename NumericT,
+ typename ScalarType1, typename ScalarType2>
+ void ambm_m(matrix_base<NumericT> & mat1,
+ matrix_base<NumericT> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+ matrix_base<NumericT> const & mat3, ScalarType2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
+ {
+ switch (viennacl::traits::handle(mat1).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::ambm_m(mat1,
+ mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+ mat3, beta, len_beta, reciprocal_beta, flip_sign_beta);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::ambm_m(mat1,
+ mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+ mat3, beta, len_beta, reciprocal_beta, flip_sign_beta);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::ambm_m(mat1,
+ mat2, alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+ mat3, beta, len_beta, reciprocal_beta, flip_sign_beta);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+ }
+
+
+ template<typename NumericT>
+ void matrix_assign(matrix_base<NumericT> & mat, NumericT s, bool clear = false)
+ {
+ switch (viennacl::traits::handle(mat).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::matrix_assign(mat, s, clear);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::matrix_assign(mat, s, clear);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::matrix_assign(mat, s, clear);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+ }
+
+
+ template<typename NumericT>
+ void matrix_diagonal_assign(matrix_base<NumericT> & mat, NumericT s)
+ {
+ switch (viennacl::traits::handle(mat).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::matrix_diagonal_assign(mat, s);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::matrix_diagonal_assign(mat, s);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::matrix_diagonal_assign(mat, s);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+ }
+
+
+ /** @brief Dispatcher interface for A = diag(v, k) */
+ template<typename NumericT>
+ void matrix_diag_from_vector(const vector_base<NumericT> & v, int k, matrix_base<NumericT> & A)
+ {
+ switch (viennacl::traits::handle(v).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::matrix_diag_from_vector(v, k, A);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::matrix_diag_from_vector(v, k, A);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::matrix_diag_from_vector(v, k, A);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+ }
+
+ /** @brief Dispatcher interface for v = diag(A, k) */
+ template<typename NumericT>
+ void matrix_diag_to_vector(const matrix_base<NumericT> & A, int k, vector_base<NumericT> & v)
+ {
+ switch (viennacl::traits::handle(A).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::matrix_diag_to_vector(A, k, v);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::matrix_diag_to_vector(A, k, v);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::matrix_diag_to_vector(A, k, v);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+ }
+
+ template<typename NumericT>
+ void matrix_row(const matrix_base<NumericT> & A, unsigned int i, vector_base<NumericT> & v)
+ {
+ switch (viennacl::traits::handle(A).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::matrix_row(A, i, v);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::matrix_row(A, i, v);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::matrix_row(A, i, v);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+ }
+
+ template<typename NumericT>
+ void matrix_column(const matrix_base<NumericT> & A, unsigned int j, vector_base<NumericT> & v)
+ {
+ switch (viennacl::traits::handle(A).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::matrix_column(A, j, v);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::matrix_column(A, j, v);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::matrix_column(A, j, v);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+ }
+
+ /** @brief Computes the Frobenius norm of a matrix - dispatcher interface
+ *
+ * @param A The matrix
+ * @param result The result scalar
+ *
+ * Note that if A has non-unit strides or a nonzero start offset, a temporary copy is created.
+ */
+ template<typename T>
+ void norm_frobenius_impl(matrix_base<T> const & A,
+ scalar<T> & result)
+ {
+ typedef typename matrix_base<T>::handle_type HandleType;
+
+ if ((A.start1() > 0) || (A.start2() > 0) || (A.stride1() > 1) || (A.stride2() > 1)) {
+ if (A.row_major()) {
+ viennacl::matrix<T, viennacl::row_major> temp_A(A);
+ viennacl::vector_base<T> temp(const_cast<HandleType &>(temp_A.handle()), temp_A.internal_size(), 0, 1);
+ norm_2_impl(temp, result);
+ } else {
+ viennacl::matrix<T, viennacl::column_major> temp_A(A);
+ viennacl::vector_base<T> temp(const_cast<HandleType &>(temp_A.handle()), temp_A.internal_size(), 0, 1);
+ norm_2_impl(temp, result);
+ }
+ } else {
+ viennacl::vector_base<T> temp(const_cast<HandleType &>(A.handle()), A.internal_size(), 0, 1);
+ norm_2_impl(temp, result);
+ }
+
+ }
+
+ /** @brief Computes the Frobenius norm of a matrix with the final reduction on the CPU
+ *
+ * @param A The matrix
+ * @param result The result scalar
+ *
+ * Note that if A has non-unit strides or a nonzero start offset, a temporary copy is created.
+ */
+ template<typename T>
+ void norm_frobenius_cpu(matrix_base<T> const & A,
+ T & result)
+ {
+ typedef typename matrix_base<T>::handle_type HandleType;
+
+ if ((A.start1() > 0) || (A.start2() > 0) || (A.stride1() > 1) || (A.stride2() > 1)) {
+ if (A.row_major()) {
+ viennacl::matrix<T, viennacl::row_major> temp_A(A);
+ viennacl::vector_base<T> temp(const_cast<HandleType &>(temp_A.handle()), temp_A.internal_size(), 0, 1);
+ norm_2_cpu(temp, result);
+ } else {
+ viennacl::matrix<T, viennacl::column_major> temp_A(A);
+ viennacl::vector_base<T> temp(const_cast<HandleType &>(temp_A.handle()), temp_A.internal_size(), 0, 1);
+ norm_2_cpu(temp, result);
+ }
+ } else {
+ viennacl::vector_base<T> temp(const_cast<HandleType &>(A.handle()), A.internal_size(), 0, 1);
+ norm_2_cpu(temp, result);
+ }
+
+ }
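
A brief usage sketch of the two Frobenius norm variants above (illustrative only, not part of the patch):

#include "viennacl/matrix.hpp"
#include "viennacl/scalar.hpp"
#include "viennacl/linalg/matrix_operations.hpp"

void frobenius_example()
{
  viennacl::matrix<float> A(32, 32);
  // ... fill A ...
  float nf_host = 0.0f;
  viennacl::linalg::norm_frobenius_cpu(A, nf_host);    // final reduction on the CPU, result on the host

  viennacl::scalar<float> nf_dev = 0.0f;
  viennacl::linalg::norm_frobenius_impl(A, nf_dev);    // result stays on the compute device
}
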
+
+ //
+ ///////////////////////// matrix-vector products /////////////////////////////////
+ //
+
+
+
+ // A * x
+
+ /** @brief Carries out matrix-vector multiplication
+ *
+ * Implementation of the convenience expression result = prod(mat, vec);
+ *
+ * @param mat The matrix
+ * @param vec The vector
+ * @param result The result vector
+ */
+ template<typename NumericT>
+ void prod_impl(const matrix_base<NumericT> & mat,
+ const vector_base<NumericT> & vec,
+ vector_base<NumericT> & result)
+ {
+ assert( (viennacl::traits::size1(mat) == viennacl::traits::size(result)) && bool("Size check failed at v1 = prod(A, v2): size1(A) != size(v1)"));
+ assert( (viennacl::traits::size2(mat) == viennacl::traits::size(vec)) && bool("Size check failed at v1 = prod(A, v2): size2(A) != size(v2)"));
+
+ switch (viennacl::traits::handle(mat).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::prod_impl(mat, false, vec, result);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::prod_impl(mat, false, vec, result);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::prod_impl(mat, false, vec, result);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+ }
+
+
+ // trans(A) * x
+
+ /** @brief Carries out matrix-vector multiplication with a transposed matrix
+ *
+ * Implementation of the convenience expression result = trans(mat) * vec;
+ *
+ * @param mat_trans The transposed matrix proxy
+ * @param vec The vector
+ * @param result The result vector
+ */
+ template<typename NumericT>
+ void prod_impl(const matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & mat_trans,
+ const vector_base<NumericT> & vec,
+ vector_base<NumericT> & result)
+ {
+ assert( (viennacl::traits::size1(mat_trans.lhs()) == viennacl::traits::size(vec)) && bool("Size check failed at v1 = trans(A) * v2: size1(A) != size(v2)"));
+ assert( (viennacl::traits::size2(mat_trans.lhs()) == viennacl::traits::size(result)) && bool("Size check failed at v1 = trans(A) * v2: size2(A) != size(v1)"));
+
+ switch (viennacl::traits::handle(mat_trans.lhs()).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::prod_impl(mat_trans.lhs(), true, vec, result);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::prod_impl(mat_trans.lhs(), true, vec, result);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::prod_impl(mat_trans.lhs(), true, vec, result);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+ }
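
Both matrix-vector dispatchers above are normally reached through the prod() convenience wrapper; a usage sketch (illustrative only, assuming viennacl/linalg/prod.hpp and viennacl::trans()):

#include "viennacl/matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/prod.hpp"

void matvec_example()
{
  viennacl::matrix<float> A(100, 50);
  viennacl::vector<float> x(50), y(100), xt(100), yt(50);
  // ... fill A, x, xt ...
  y  = viennacl::linalg::prod(A, x);                     // y  = A   * x
  yt = viennacl::linalg::prod(viennacl::trans(A), xt);   // yt = A^T * xt
}
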
+
+
+ //
+ ///////////////////////// matrix-matrix products /////////////////////////////////
+ //
+
+ /** @brief Carries out matrix-matrix multiplication
+ *
+ * Implementation of C = prod(A, B);
+ *
+ */
+ template<typename NumericT, typename ScalarType >
+ void prod_impl(const matrix_base<NumericT> & A,
+ const matrix_base<NumericT> & B,
+ matrix_base<NumericT> & C,
+ ScalarType alpha,
+ ScalarType beta)
+ {
+ assert( (viennacl::traits::size1(A) == viennacl::traits::size1(C)) && bool("Size check failed at C = prod(A, B): size1(A) != size1(C)"));
+ assert( (viennacl::traits::size2(A) == viennacl::traits::size1(B)) && bool("Size check failed at C = prod(A, B): size2(A) != size1(B)"));
+ assert( (viennacl::traits::size2(B) == viennacl::traits::size2(C)) && bool("Size check failed at C = prod(A, B): size2(B) != size2(C)"));
+
+
+ switch (viennacl::traits::handle(A).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::prod_impl(A, false, B, false, C, alpha, beta);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::prod_impl(A, false, B, false, C, alpha, beta);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::prod_impl(A, false, B, false, C, alpha, beta);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+ }
+
+
+
+ /** @brief Carries out matrix-matrix multiplication
+ *
+ * Implementation of C = prod(trans(A), B);
+ *
+ */
+ template<typename NumericT, typename ScalarType >
+ void prod_impl(const viennacl::matrix_expression< const matrix_base<NumericT>,
+ const matrix_base<NumericT>,
+ op_trans> & A,
+ const matrix_base<NumericT> & B,
+ matrix_base<NumericT> & C,
+ ScalarType alpha,
+ ScalarType beta)
+ {
+ assert(viennacl::traits::size2(A.lhs()) == viennacl::traits::size1(C) && bool("Size check failed at C = prod(trans(A), B): size2(A) != size1(C)"));
+ assert(viennacl::traits::size1(A.lhs()) == viennacl::traits::size1(B) && bool("Size check failed at C = prod(trans(A), B): size1(A) != size1(B)"));
+ assert(viennacl::traits::size2(B) == viennacl::traits::size2(C) && bool("Size check failed at C = prod(trans(A), B): size2(B) != size2(C)"));
+
+ switch (viennacl::traits::handle(A.lhs()).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::prod_impl(A.lhs(), true, B, false, C, alpha, beta);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::prod_impl(A.lhs(), true, B, false, C, alpha, beta);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::prod_impl(A.lhs(), true, B, false, C, alpha, beta);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+ }
+
+
+
+
+ /** @brief Carries out matrix-matrix multiplication
+ *
+ * Implementation of C = prod(A, trans(B));
+ *
+ */
+ template<typename NumericT, typename ScalarType >
+ void prod_impl(const matrix_base<NumericT> & A,
+ const viennacl::matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & B,
+ matrix_base<NumericT> & C,
+ ScalarType alpha,
+ ScalarType beta)
+ {
+ assert(viennacl::traits::size1(A) == viennacl::traits::size1(C) && bool("Size check failed at C = prod(A, trans(B)): size1(A) != size1(C)"));
+ assert(viennacl::traits::size2(A) == viennacl::traits::size2(B.lhs()) && bool("Size check failed at C = prod(A, trans(B)): size2(A) != size2(B)"));
+ assert(viennacl::traits::size1(B.lhs()) == viennacl::traits::size2(C) && bool("Size check failed at C = prod(A, trans(B)): size1(B) != size2(C)"));
+
+ switch (viennacl::traits::handle(A).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::prod_impl(A, false, B.lhs(), true, C, alpha, beta);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::prod_impl(A, false, B.lhs(), true, C, alpha, beta);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::prod_impl(A, false, B.lhs(), true, C, alpha, beta);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+ }
+
+
+
+ /** @brief Carries out matrix-matrix multiplication
+ *
+ * Implementation of C = prod(trans(A), trans(B));
+ *
+ */
+ template<typename NumericT, typename ScalarType >
+ void prod_impl(const viennacl::matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & A,
+ const viennacl::matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & B,
+ matrix_base<NumericT> & C,
+ ScalarType alpha,
+ ScalarType beta)
+ {
+ assert(viennacl::traits::size2(A.lhs()) == viennacl::traits::size1(C) && bool("Size check failed at C = prod(trans(A), trans(B)): size2(A) != size1(C)"));
+ assert(viennacl::traits::size1(A.lhs()) == viennacl::traits::size2(B.lhs()) && bool("Size check failed at C = prod(trans(A), trans(B)): size1(A) != size2(B)"));
+ assert(viennacl::traits::size1(B.lhs()) == viennacl::traits::size2(C) && bool("Size check failed at C = prod(trans(A), trans(B)): size1(B) != size2(C)"));
+
+ switch (viennacl::traits::handle(A.lhs()).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::prod_impl(A.lhs(), true, B.lhs(), true, C, alpha, beta);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::prod_impl(A.lhs(), true, B.lhs(), true, C, alpha, beta);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::prod_impl(A.lhs(), true, B.lhs(), true, C, alpha, beta);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+ }
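
The four GEMM dispatchers above cover all combinations of plain and transposed factors; from user code they are reached through prod() as well (illustrative sketch only, assuming viennacl/linalg/prod.hpp and viennacl::trans()):

#include "viennacl/matrix.hpp"
#include "viennacl/linalg/prod.hpp"

void gemm_example()
{
  viennacl::matrix<float> A(64, 32), B(32, 48), C(64, 48), D(48, 64);
  // ... fill A and B ...
  C = viennacl::linalg::prod(A, B);                                     // C = A * B
  D = viennacl::linalg::prod(viennacl::trans(B), viennacl::trans(A));   // D = B^T * A^T = (A * B)^T
}
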
+
+
+ ///////////////////////// summation operations /////////////
+
+ template<typename NumericT>
+ void row_sum_impl(matrix_base<NumericT> const & A, vector_base<NumericT> & result)
+ {
+ viennacl::vector<NumericT> all_ones = viennacl::scalar_vector<NumericT>(A.size2(), NumericT(1), viennacl::traits::context(A));
+ viennacl::linalg::prod_impl(A, all_ones, result);
+ }
+
+ template<typename NumericT>
+ void column_sum_impl(matrix_base<NumericT> const & A, vector_base<NumericT> & result)
+ {
+ viennacl::vector<NumericT> all_ones = viennacl::scalar_vector<NumericT>(A.size1(), NumericT(1), viennacl::traits::context(A));
+ viennacl::linalg::prod_impl(matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans>(A, A), all_ones, result);
+ }
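
A small usage sketch of the two summation helpers above (illustrative only, not part of the patch); row sums are computed as A * ones and column sums as A^T * ones, exactly as implemented directly above:

#include "viennacl/matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/matrix_operations.hpp"

void sum_example()
{
  viennacl::matrix<float> A(30, 20);
  viennacl::vector<float> row_sums(30), col_sums(20);
  // ... fill A ...
  viennacl::linalg::row_sum_impl(A, row_sums);       // row_sums[i] = sum_j A(i, j)
  viennacl::linalg::column_sum_impl(A, col_sums);    // col_sums[j] = sum_i A(i, j)
}
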
+
+ ///////////////////////// Elementwise operations /////////////
+
+
+
+ /** @brief Implementation of the element-wise operation A = B .* C and A = B ./ C for matrices (using MATLAB syntax). Don't use this function directly, use element_prod() and element_div().
+ *
+ * @param A The result matrix (or -range, or -slice)
+ * @param proxy The proxy object holding B, C, and the operation
+ */
+ template<typename T, typename OP>
+ void element_op(matrix_base<T> & A,
+ matrix_expression<const matrix_base<T>, const matrix_base<T>, OP> const & proxy)
+ {
+ assert( (viennacl::traits::size1(A) == viennacl::traits::size1(proxy)) && bool("Size check failed at A = element_op(B): size1(A) != size1(B)"));
+ assert( (viennacl::traits::size2(A) == viennacl::traits::size2(proxy)) && bool("Size check failed at A = element_op(B): size2(A) != size2(B)"));
+
+ switch (viennacl::traits::handle(A).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::element_op(A, proxy);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::element_op(A, proxy);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::element_op(A, proxy);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+ }
+
+
+#define VIENNACL_MAKE_BINARY_OP(OPNAME)\
+ template<typename T>\
+ viennacl::matrix_expression<const matrix_base<T>, const matrix_base<T>, op_element_binary<op_##OPNAME> >\
+ element_##OPNAME(matrix_base<T> const & A, matrix_base<T> const & B)\
+ {\
+ return viennacl::matrix_expression<const matrix_base<T>, const matrix_base<T>, op_element_binary<op_##OPNAME> >(A, B);\
+ }\
+\
+ template<typename M1, typename M2, typename OP, typename T>\
+ viennacl::matrix_expression<const matrix_expression<const M1, const M2, OP>,\
+ const matrix_base<T>,\
+ op_element_binary<op_##OPNAME> >\
+ element_##OPNAME(matrix_expression<const M1, const M2, OP> const & proxy, matrix_base<T> const & B)\
+ {\
+ return viennacl::matrix_expression<const matrix_expression<const M1, const M2, OP>,\
+ const matrix_base<T>,\
+ op_element_binary<op_##OPNAME> >(proxy, B);\
+ }\
+\
+ template<typename T, typename M2, typename M3, typename OP>\
+ viennacl::matrix_expression<const matrix_base<T>,\
+ const matrix_expression<const M2, const M3, OP>,\
+ op_element_binary<op_##OPNAME> >\
+ element_##OPNAME(matrix_base<T> const & A, matrix_expression<const M2, const M3, OP> const & proxy)\
+ {\
+ return viennacl::matrix_expression<const matrix_base<T>,\
+ const matrix_expression<const M2, const M3, OP>,\
+ op_element_binary<op_##OPNAME> >(A, proxy);\
+ }\
+\
+ template<typename M1, typename M2, typename OP1,\
+ typename M3, typename M4, typename OP2>\
+ viennacl::matrix_expression<const matrix_expression<const M1, const M2, OP1>,\
+ const matrix_expression<const M3, const M4, OP2>,\
+ op_element_binary<op_##OPNAME> >\
+ element_##OPNAME(matrix_expression<const M1, const M2, OP1> const & proxy1,\
+ matrix_expression<const M3, const M4, OP2> const & proxy2)\
+ {\
+ return viennacl::matrix_expression<const matrix_expression<const M1, const M2, OP1>,\
+ const matrix_expression<const M3, const M4, OP2>,\
+ op_element_binary<op_##OPNAME> >(proxy1, proxy2);\
+ }
+
+ VIENNACL_MAKE_BINARY_OP(prod)
+ VIENNACL_MAKE_BINARY_OP(div)
+ VIENNACL_MAKE_BINARY_OP(pow)
+
+ VIENNACL_MAKE_BINARY_OP(eq)
+ VIENNACL_MAKE_BINARY_OP(neq)
+ VIENNACL_MAKE_BINARY_OP(greater)
+ VIENNACL_MAKE_BINARY_OP(less)
+ VIENNACL_MAKE_BINARY_OP(geq)
+ VIENNACL_MAKE_BINARY_OP(leq)
+
+#undef VIENNACL_MAKE_BINARY_OP
+
+
+
+#define VIENNACL_MAKE_UNARY_ELEMENT_OP(funcname) \
+ template<typename T> \
+ viennacl::matrix_expression<const matrix_base<T>, const matrix_base<T>, op_element_unary<op_##funcname> > \
+ element_##funcname(matrix_base<T> const & A) \
+ { \
+ return viennacl::matrix_expression<const matrix_base<T>, const matrix_base<T>, op_element_unary<op_##funcname> >(A, A); \
+ } \
+ template<typename LHS, typename RHS, typename OP> \
+ viennacl::matrix_expression<const matrix_expression<const LHS, const RHS, OP>, \
+ const matrix_expression<const LHS, const RHS, OP>, \
+ op_element_unary<op_##funcname> > \
+ element_##funcname(matrix_expression<const LHS, const RHS, OP> const & proxy) \
+ { \
+ return viennacl::matrix_expression<const matrix_expression<const LHS, const RHS, OP>, \
+ const matrix_expression<const LHS, const RHS, OP>, \
+ op_element_unary<op_##funcname> >(proxy, proxy); \
+ } \
+
+ VIENNACL_MAKE_UNARY_ELEMENT_OP(abs)
+ VIENNACL_MAKE_UNARY_ELEMENT_OP(acos)
+ VIENNACL_MAKE_UNARY_ELEMENT_OP(asin)
+ VIENNACL_MAKE_UNARY_ELEMENT_OP(atan)
+ VIENNACL_MAKE_UNARY_ELEMENT_OP(ceil)
+ VIENNACL_MAKE_UNARY_ELEMENT_OP(cos)
+ VIENNACL_MAKE_UNARY_ELEMENT_OP(cosh)
+ VIENNACL_MAKE_UNARY_ELEMENT_OP(exp)
+ VIENNACL_MAKE_UNARY_ELEMENT_OP(fabs)
+ VIENNACL_MAKE_UNARY_ELEMENT_OP(floor)
+ VIENNACL_MAKE_UNARY_ELEMENT_OP(log)
+ VIENNACL_MAKE_UNARY_ELEMENT_OP(log10)
+ VIENNACL_MAKE_UNARY_ELEMENT_OP(sin)
+ VIENNACL_MAKE_UNARY_ELEMENT_OP(sinh)
+ VIENNACL_MAKE_UNARY_ELEMENT_OP(sqrt)
+ VIENNACL_MAKE_UNARY_ELEMENT_OP(tan)
+ VIENNACL_MAKE_UNARY_ELEMENT_OP(tanh)
+
+#undef VIENNACL_MAKE_UNARY_ELEMENT_OP
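
The macros above generate the usual element-wise entry points; a usage sketch (illustrative only, assuming the expression-assignment overloads of viennacl::matrix):

#include "viennacl/matrix.hpp"
#include "viennacl/linalg/matrix_operations.hpp"

void elementwise_example()
{
  viennacl::matrix<float> A(16, 16), B(16, 16), C(16, 16);
  // ... fill A and B ...
  C = viennacl::linalg::element_prod(A, B);   // C(i,j) = A(i,j) * B(i,j)
  C = viennacl::linalg::element_div(A, B);    // C(i,j) = A(i,j) / B(i,j)
  C = viennacl::linalg::element_exp(A);       // C(i,j) = exp(A(i,j))
}
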
+
+
+ //
+ ///////////////////////// miscellaneous operations /////////////////////////////////
+ //
+
+
+ /** @brief Returns a proxy class for the operation mat += vec1 * vec2^T, i.e. a rank 1 update
+ *
+ * @param vec1 The first vector
+ * @param vec2 The second vector
+ */
+ template<typename NumericT>
+ viennacl::matrix_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_prod>
+ outer_prod(const vector_base<NumericT> & vec1, const vector_base<NumericT> & vec2)
+ {
+ return viennacl::matrix_expression< const vector_base<NumericT>, const vector_base<NumericT>, op_prod>(vec1, vec2);
+ }
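
A usage sketch for rank-1 updates (illustrative only; the scaled form assumes the corresponding operator overloads and ultimately dispatches to scaled_rank_1_update() below):

#include "viennacl/matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/matrix_operations.hpp"

void rank1_example()
{
  viennacl::matrix<float> A(40, 30);
  viennacl::vector<float> u(40), v(30);
  // ... fill A, u, v ...
  A += viennacl::linalg::outer_prod(u, v);          // A += u * v^T
  A += 2.5f * viennacl::linalg::outer_prod(u, v);   // scaled rank-1 update
}
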
+
+
+ /** @brief The implementation of the operation mat += alpha * vec1 * vec2^T, i.e. a scaled rank 1 update
+ *
+ * Implementation of the convenience expression result += alpha * outer_prod(vec1, vec2);
+ *
+ * @param mat1 The matrix to be updated
+ * @param alpha The scaling factor (either a viennacl::scalar<>, float, or double)
+ * @param len_alpha Length of the buffer for an eventual final reduction step (currently always '1')
+ * @param reciprocal_alpha Use 1/alpha instead of alpha
+ * @param flip_sign_alpha Use -alpha instead of alpha
+ * @param vec1 The first vector
+ * @param vec2 The second vector
+ */
+ template<typename NumericT, typename S1>
+ void scaled_rank_1_update(matrix_base<NumericT> & mat1,
+ S1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+ const vector_base<NumericT> & vec1,
+ const vector_base<NumericT> & vec2)
+ {
+ switch (viennacl::traits::handle(mat1).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::scaled_rank_1_update(mat1,
+ alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+ vec1, vec2);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::scaled_rank_1_update(mat1,
+ alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+ vec1, vec2);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::scaled_rank_1_update(mat1,
+ alpha, len_alpha, reciprocal_alpha, flip_sign_alpha,
+ vec1, vec2);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+ }
+
+ /** @brief This function stores the diagonal and the superdiagonal of a matrix in two vectors.
+ *
+ *
+ * @param A The matrix from which the diagonal and superdiagonal are extracted.
+ * @param dh The vector in which the diagonal of the matrix is stored.
+ * @param sh The vector in which the superdiagonal of the matrix is stored.
+ */
+
+ template <typename NumericT, typename VectorType>
+ void bidiag_pack(matrix_base<NumericT> & A,
+ VectorType & dh,
+ VectorType & sh
+ )
+ {
+ switch (viennacl::traits::handle(A).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::bidiag_pack(A, dh, sh);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::bidiag_pack(A, dh, sh);
+ break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::bidiag_pack(A, dh, sh);
+ break;
+#endif
+
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+
+
+ }
+ /** @brief This function copies a row or a column from a matrix to a vector.
+ *
+ *
+ * @param A The matrix to copy from.
+ * @param V The vector to be filled with the copied data.
+ * @param row_start The index of the first row to copy.
+ * @param col_start The index of the first column to copy.
+ * @param copy_col Set to TRUE to copy a column, FALSE to copy a row.
+ */
+
+ template <typename SCALARTYPE>
+ void copy_vec(matrix_base<SCALARTYPE>& A,
+ vector_base<SCALARTYPE>& V,
+ vcl_size_t row_start,
+ vcl_size_t col_start,
+ bool copy_col
+ )
+ {
+ switch (viennacl::traits::handle(A).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::copy_vec(A, V, row_start, col_start, copy_col);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::copy_vec(A, V, row_start, col_start, copy_col);
+ break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::copy_vec(A, V, row_start, col_start, copy_col);
+ break;
+#endif
+
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+
+ }
+
+ /** @brief This function applies a Householder transformation to a matrix: A <- P * A with a Householder reflection P
+ *
+ * @param A The matrix to be updated.
+ * @param D The normalized Householder vector.
+ * @param start The repetition counter.
+ */
+ template <typename NumericT>
+ void house_update_A_left(matrix_base<NumericT> & A,
+ vector_base<NumericT> & D,
+ vcl_size_t start)
+ {
+ switch (viennacl::traits::handle(A).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::house_update_A_left(A, D, start);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::house_update_A_left(A, D, start);
+ break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::house_update_A_left(A, D, start);
+ break;
+#endif
+
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+ }
+
+
+ /** @brief This function applies a Householder transformation to a matrix: A <- A * P with a Householder reflection P
+ *
+ * @param A The matrix to be updated.
+ * @param D The normalized Householder vector.
+ */
+
+ template <typename NumericT>
+ void house_update_A_right(matrix_base<NumericT>& A,
+ vector_base<NumericT> & D)
+ {
+ switch (viennacl::traits::handle(A).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::house_update_A_right(A, D);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::house_update_A_right(A, D);
+ break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::house_update_A_right(A, D);
+ break;
+#endif
+
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+ }
+
+ /** @brief This function updates the matrix Q, which is needed for the computation of the eigenvectors.
+ *
+ * @param Q The matrix to be updated.
+ * @param D The Householder vector.
+ * @param A_size1 The number of rows (size1) of the matrix A.
+ */
+
+ template <typename NumericT>
+ void house_update_QL(matrix_base<NumericT> & Q,
+ vector_base<NumericT> & D,
+ vcl_size_t A_size1)
+ {
+ switch (viennacl::traits::handle(Q).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::house_update_QL(Q, D, A_size1);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::house_update_QL(Q, D, A_size1);
+ break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::house_update_QL(Q, D, A_size1);
+ break;
+#endif
+
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+ }
+
+
+ /** @brief This function updates the matrix Q. It is part of the tql2 algorithm.
+ *
+ *
+ * @param Q The matrix to be updated.
+ * @param tmp1 Vector with data from the tql2 algorithm.
+ * @param tmp2 Vector with data from the tql2 algorithm.
+ * @param l Data from the tql2 algorithm.
+ * @param m Data from the tql2 algorithm.
+ */
+ template<typename NumericT>
+ void givens_next(matrix_base<NumericT> & Q,
+ vector_base<NumericT> & tmp1,
+ vector_base<NumericT> & tmp2,
+ int l,
+ int m
+ )
+ {
+ switch (viennacl::traits::handle(Q).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::givens_next(Q, tmp1, tmp2, l, m);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::givens_next(Q, tmp1, tmp2, l, m);
+ break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::givens_next(Q, tmp1, tmp2, l, m);
+ break;
+#endif
+
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+ }
+
+ } //namespace linalg
+
+
+
+
+ //
+ ///////////////////////// Operator overloads /////////////////////////////////
+ //
+
+
+ //v += A * x
+ /** @brief Implementation of the operation v1 += A * v2, where A is a matrix
+ *
+ * @param v1 The result vector v1 where A * v2 is added to
+ * @param proxy An expression template proxy class.
+ */
+ template<typename NumericT>
+ vector<NumericT>
+ operator+=(vector_base<NumericT> & v1,
+ const viennacl::vector_expression< const matrix_base<NumericT>, const vector_base<NumericT>, viennacl::op_prod> & proxy)
+ {
+ assert(viennacl::traits::size1(proxy.lhs()) == v1.size() && bool("Size check failed for v1 += A * v2: size1(A) != size(v1)"));
+
+ vector<NumericT> result(viennacl::traits::size1(proxy.lhs()));
+ viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
+ v1 += result;
+ return v1;
+ }
+
+ /** @brief Implementation of the operation v1 -= A * v2, where A is a matrix
+ *
+ * @param v1 The result vector v1 from which A * v2 is subtracted
+ * @param proxy An expression template proxy class.
+ */
+ template<typename NumericT>
+ vector<NumericT>
+ operator-=(vector_base<NumericT> & v1,
+ const viennacl::vector_expression< const matrix_base<NumericT>, const vector_base<NumericT>, viennacl::op_prod> & proxy)
+ {
+ assert(viennacl::traits::size1(proxy.lhs()) == v1.size() && bool("Size check failed for v1 -= A * v2: size1(A) != size(v1)"));
+
+ vector<NumericT> result(viennacl::traits::size1(proxy.lhs()));
+ viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
+ v1 -= result;
+ return v1;
+ }
+
+
+
+
+
+ //free functions:
+ /** @brief Implementation of the operation 'result = v1 + A * v2', where A is a matrix
+ *
+ * @param v1 The addend vector.
+ * @param proxy An expression template proxy class.
+ */
+ template<typename NumericT>
+ viennacl::vector<NumericT>
+ operator+(const vector_base<NumericT> & v1,
+ const vector_expression< const matrix_base<NumericT>, const vector_base<NumericT>, op_prod> & proxy)
+ {
+ assert(viennacl::traits::size1(proxy.lhs()) == viennacl::traits::size(v1) && bool("Size check failed for v1 + A * v2: size1(A) != size(v1)"));
+
+ vector<NumericT> result(viennacl::traits::size(v1));
+ viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
+ result += v1;
+ return result;
+ }
+
+ /** @brief Implementation of the operation 'result = v1 - A * v2', where A is a matrix
+ *
+ * @param v1 The vector from which A * v2 is subtracted.
+ * @param proxy An expression template proxy class.
+ */
+ template<typename NumericT>
+ viennacl::vector<NumericT>
+ operator-(const vector_base<NumericT> & v1,
+ const vector_expression< const matrix_base<NumericT>, const vector_base<NumericT>, op_prod> & proxy)
+ {
+ assert(viennacl::traits::size1(proxy.lhs()) == viennacl::traits::size(v1) && bool("Size check failed for v1 - A * v2: size1(A) != size(v1)"));
+
+ vector<NumericT> result(viennacl::traits::size(v1));
+ viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
+ result = v1 - result;
+ return result;
+ }
+
+
+ ////////// transposed_matrix_proxy
+
+
+ //v += A^T * x
+ /** @brief Implementation of the operation v1 += trans(A) * v2, where A is a matrix
+ *
+ * @param v1 The vector to which trans(A) * v2 is added.
+ * @param proxy An expression template proxy class.
+ */
+ template<typename NumericT>
+ vector<NumericT>
+ operator+=(vector_base<NumericT> & v1,
+ const vector_expression< const matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans>,
+ const vector_base<NumericT>,
+ op_prod> & proxy)
+ {
+ assert(viennacl::traits::size2(proxy.lhs()) == v1.size() && bool("Size check failed in v1 += trans(A) * v2: size2(A) != size(v1)"));
+
+ vector<NumericT> result(viennacl::traits::size2(proxy.lhs()));
+ viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
+ v1 += result;
+ return v1;
+ }
+
+ //v -= A^T * x
+ /** @brief Implementation of the operation v1 -= trans(A) * v2, where A is a matrix
+ *
+ * @param v1 The vector from which trans(A) * v2 is subtracted.
+ * @param proxy An expression template proxy class.
+ */
+ template<typename NumericT>
+ vector<NumericT>
+ operator-=(vector_base<NumericT> & v1,
+ const vector_expression< const matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans>,
+ const vector_base<NumericT>,
+ op_prod> & proxy)
+ {
+ assert(viennacl::traits::size2(proxy.lhs()) == v1.size() && bool("Size check failed in v1 -= trans(A) * v2: size2(A) != size(v1)"));
+
+ vector<NumericT> result(viennacl::traits::size2(proxy.lhs()));
+ viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
+ v1 -= result;
+ return v1;
+ }
+
+
+ //free functions:
+ /** @brief Implementation of the operation 'result = v1 + trans(A) * v2', where A is a matrix
+ *
+ * @param v1 The addend vector.
+ * @param proxy An expression template proxy class.
+ */
+ template<typename NumericT>
+ vector<NumericT>
+ operator+(const vector_base<NumericT> & v1,
+ const vector_expression< const matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans>,
+ const vector_base<NumericT>,
+ op_prod> & proxy)
+ {
+ assert(viennacl::traits::size2(proxy.lhs()) == viennacl::traits::size(v1) && bool("Size check failed in v1 + trans(A) * v2: size2(A) != size(v1)"));
+
+ vector<NumericT> result(viennacl::traits::size(v1));
+ viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
+ result += v1;
+ return result;
+ }
+
+ /** @brief Implementation of the operation 'result = v1 - trans(A) * v2', where A is a matrix
+ *
+ * @param v1 The vector from which trans(A) * v2 is subtracted.
+ * @param proxy An expression template proxy class.
+ */
+ template<typename NumericT>
+ vector<NumericT>
+ operator-(const vector_base<NumericT> & v1,
+ const vector_expression< const matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans>,
+ const vector_base<NumericT>,
+ op_prod> & proxy)
+ {
+ assert(viennacl::traits::size2(proxy.lhs()) == viennacl::traits::size(v1) && bool("Size check failed in v1 - trans(A) * v2: size2(A) != size(v1)"));
+
+ vector<NumericT> result(viennacl::traits::size(v1));
+ viennacl::linalg::prod_impl(proxy.lhs(), proxy.rhs(), result);
+ result = v1 - result;
+ return result;
+ }
+
+
+} //namespace viennacl
+
+
+#endif
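
A minimal usage sketch for the matrix-vector operator overloads defined above, assuming dense float types and the generic prod() interface from viennacl/linalg/prod.hpp (illustrative only, not part of the header):

    #include "viennacl/matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/prod.hpp"

    int main()
    {
      // illustrative sketch, not part of the patch
      viennacl::matrix<float> A(4, 4);
      viennacl::vector<float> v1(4), v2(4);
      // ... initialize A, v1, v2 (e.g. via viennacl::copy from host data) ...

      v1 += viennacl::linalg::prod(A, v2);   // dispatches to the operator+= overload above
      viennacl::vector<float> r = v1 - viennacl::linalg::prod(viennacl::trans(A), v2);
      (void)r;
      return 0;
    }
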
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/maxmin.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/maxmin.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/maxmin.hpp
new file mode 100644
index 0000000..9269598
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/maxmin.hpp
@@ -0,0 +1,152 @@
+#ifndef VIENNACL_LINALG_MAXMIN_HPP_
+#define VIENNACL_LINALG_MAXMIN_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/maxmin.hpp
+ @brief Generic interface for the max and min of vectors. See viennacl/linalg/vector_operations.hpp for implementations.
+*/
+
+#include <cmath>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/tag_of.hpp"
+#include "viennacl/meta/result_of.hpp"
+
+namespace viennacl
+{
+ //
+ // generic max and min functions
+ // use tag dispatch to identify which algorithm
+ // should be called
+ //
+ namespace linalg
+ {
+
+
+ // ----------------------------------------------------
+ // STL
+ //
+ template< typename NumericT >
+ NumericT max(std::vector<NumericT> const & v1)
+ {
+ //std::cout << "stl .. " << std::endl;
+ NumericT result = v1[0];
+ for (vcl_size_t i=1; i<v1.size(); ++i)
+ {
+ if (v1[i] > result)
+ result = v1[i];
+ }
+
+ return result;
+ }
+
+ // ----------------------------------------------------
+ // VIENNACL
+ //
+ template< typename ScalarType>
+ viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+ const viennacl::vector_base<ScalarType>,
+ viennacl::op_max >
+ max(viennacl::vector_base<ScalarType> const & v1)
+ {
+ //std::cout << "viennacl .. " << std::endl;
+ return viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+ const viennacl::vector_base<ScalarType>,
+ viennacl::op_max >(v1, v1);
+ }
+
+ // with vector expression:
+ template<typename LHS, typename RHS, typename OP>
+ viennacl::scalar_expression<const viennacl::vector_expression<const LHS, const RHS, OP>,
+ const viennacl::vector_expression<const LHS, const RHS, OP>,
+ viennacl::op_max>
+ max(viennacl::vector_expression<const LHS, const RHS, OP> const & vector)
+ {
+ return viennacl::scalar_expression< const viennacl::vector_expression<const LHS, const RHS, OP>,
+ const viennacl::vector_expression<const LHS, const RHS, OP>,
+ viennacl::op_max >(vector, vector);
+ }
+
+ // ----------------------------------------------------
+ // STL
+ //
+ template< typename NumericT >
+ NumericT min(std::vector<NumericT> const & v1)
+ {
+ //std::cout << "stl .. " << std::endl;
+ NumericT result = v1[0];
+ for (vcl_size_t i=1; i<v1.size(); ++i)
+ {
+ if (v1[i] < result)
+ result = v1[i];
+ }
+
+ return result;
+ }
+
+ // ----------------------------------------------------
+ // VIENNACL
+ //
+ template< typename ScalarType>
+ viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+ const viennacl::vector_base<ScalarType>,
+ viennacl::op_min >
+ min(viennacl::vector_base<ScalarType> const & v1)
+ {
+ //std::cout << "viennacl .. " << std::endl;
+ return viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+ const viennacl::vector_base<ScalarType>,
+ viennacl::op_min >(v1, v1);
+ }
+
+ template< typename ScalarType>
+ viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+ const viennacl::vector_base<ScalarType>,
+ viennacl::op_min >
+ min(viennacl::vector<ScalarType> const & v1)
+ {
+ //std::cout << "viennacl .. " << std::endl;
+ return viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+ const viennacl::vector_base<ScalarType>,
+ viennacl::op_min >(v1, v1);
+ }
+
+ // with vector expression:
+ template<typename LHS, typename RHS, typename OP>
+ viennacl::scalar_expression<const viennacl::vector_expression<const LHS, const RHS, OP>,
+ const viennacl::vector_expression<const LHS, const RHS, OP>,
+ viennacl::op_min>
+ min(viennacl::vector_expression<const LHS, const RHS, OP> const & vector)
+ {
+ return viennacl::scalar_expression< const viennacl::vector_expression<const LHS, const RHS, OP>,
+ const viennacl::vector_expression<const LHS, const RHS, OP>,
+ viennacl::op_min >(vector, vector);
+ }
+
+
+
+ } // end namespace linalg
+} // end namespace viennacl
+#endif
+
+
+
+
+
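
A minimal usage sketch for the max()/min() interface above, assuming the returned scalar_expression is evaluated when assigned to a host scalar (illustrative only):

    #include <vector>
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/maxmin.hpp"

    int main()
    {
      // illustrative sketch, not part of the patch
      std::vector<float> v_host(100, 1.0f);
      v_host[42] = 7.0f;

      viennacl::vector<float> v(100);
      viennacl::copy(v_host, v);

      float largest  = viennacl::linalg::max(v);       // lazy expression, evaluated here
      float smallest = viennacl::linalg::min(v);
      float host_max = viennacl::linalg::max(v_host);  // STL overload above
      (void)largest; (void)smallest; (void)host_max;
      return 0;
    }
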
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/misc_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/misc_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/misc_operations.hpp
new file mode 100644
index 0000000..208573f
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/misc_operations.hpp
@@ -0,0 +1,94 @@
+#ifndef VIENNACL_LINALG_MISC_OPERATIONS_HPP_
+#define VIENNACL_LINALG_MISC_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/misc_operations.hpp
+ @brief Implementations of miscellaneous operations
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/host_based/misc_operations.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+ #include "viennacl/linalg/opencl/misc_operations.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+ #include "viennacl/linalg/cuda/misc_operations.hpp"
+#endif
+
+namespace viennacl
+{
+ namespace linalg
+ {
+
+ namespace detail
+ {
+
+ template<typename ScalarType>
+ void level_scheduling_substitute(vector<ScalarType> & vec,
+ viennacl::backend::mem_handle const & row_index_array,
+ viennacl::backend::mem_handle const & row_buffer,
+ viennacl::backend::mem_handle const & col_buffer,
+ viennacl::backend::mem_handle const & element_buffer,
+ vcl_size_t num_rows
+ )
+ {
+ assert( viennacl::traits::handle(vec).get_active_handle_id() == row_index_array.get_active_handle_id() && bool("Incompatible memory domains"));
+ assert( viennacl::traits::handle(vec).get_active_handle_id() == row_buffer.get_active_handle_id() && bool("Incompatible memory domains"));
+ assert( viennacl::traits::handle(vec).get_active_handle_id() == col_buffer.get_active_handle_id() && bool("Incompatible memory domains"));
+ assert( viennacl::traits::handle(vec).get_active_handle_id() == element_buffer.get_active_handle_id() && bool("Incompatible memory domains"));
+
+ switch (viennacl::traits::handle(vec).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::detail::level_scheduling_substitute(vec, row_index_array, row_buffer, col_buffer, element_buffer, num_rows);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::detail::level_scheduling_substitute(vec, row_index_array, row_buffer, col_buffer, element_buffer, num_rows);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::detail::level_scheduling_substitute(vec, row_index_array, row_buffer, col_buffer, element_buffer, num_rows);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+ }
+
+
+
+
+ } //namespace detail
+
+
+ } //namespace linalg
+} //namespace viennacl
+
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/mixed_precision_cg.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/mixed_precision_cg.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/mixed_precision_cg.hpp
new file mode 100644
index 0000000..78254b3
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/mixed_precision_cg.hpp
@@ -0,0 +1,199 @@
+#ifndef VIENNACL_LINALG_MIXED_PRECISION_CG_HPP_
+#define VIENNACL_LINALG_MIXED_PRECISION_CG_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/mixed_precision_cg.hpp
+ @brief The conjugate gradient method using mixed precision is implemented here. Experimental.
+*/
+
+#include <vector>
+#include <map>
+#include <cmath>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/ilu.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/traits/clear.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/backend/memory.hpp"
+
+#include "viennacl/vector_proxy.hpp"
+
+namespace viennacl
+{
+ namespace linalg
+ {
+
+ /** @brief A tag for the mixed-precision conjugate gradient solver. Used for supplying solver parameters and for dispatching the solve() function
+ */
+ class mixed_precision_cg_tag
+ {
+ public:
+ /** @brief The constructor
+ *
+ * @param tol Relative tolerance for the residual (solver quits if ||r|| < tol * ||r_initial||)
+ * @param max_iterations The maximum number of iterations
+ * @param inner_tol Inner tolerance for the low-precision iterations
+ */
+ mixed_precision_cg_tag(double tol = 1e-8, unsigned int max_iterations = 300, float inner_tol = 1e-2f) : tol_(tol), iterations_(max_iterations), inner_tol_(inner_tol) {}
+
+ /** @brief Returns the relative tolerance */
+ double tolerance() const { return tol_; }
+ /** @brief Returns the inner tolerance for the low-precision iterations */
+ float inner_tolerance() const { return inner_tol_; }
+ /** @brief Returns the maximum number of iterations */
+ unsigned int max_iterations() const { return iterations_; }
+
+ /** @brief Returns the number of solver iterations */
+ unsigned int iters() const { return iters_taken_; }
+ void iters(unsigned int i) const { iters_taken_ = i; }
+
+ /** @brief Returns the estimated relative error at the end of the solver run */
+ double error() const { return last_error_; }
+ /** @brief Sets the estimated relative error at the end of the solver run */
+ void error(double e) const { last_error_ = e; }
+
+
+ private:
+ double tol_;
+ unsigned int iterations_;
+ float inner_tol_;
+
+ //return values from solver
+ mutable unsigned int iters_taken_;
+ mutable double last_error_;
+ };
+
+
+ /** @brief Implementation of the mixed-precision conjugate gradient solver without preconditioner
+ *
+ * Following the algorithm in the book by Y. Saad, "Iterative Methods for Sparse Linear Systems"
+ *
+ * @param matrix The system matrix
+ * @param rhs The load vector
+ * @param tag Solver configuration tag
+ * @return The result vector
+ */
+ template<typename MatrixType, typename VectorType>
+ VectorType solve(const MatrixType & matrix, VectorType const & rhs, mixed_precision_cg_tag const & tag)
+ {
+ //typedef typename VectorType::value_type ScalarType;
+ typedef typename viennacl::result_of::cpu_value_type<VectorType>::type CPU_ScalarType;
+
+ //std::cout << "Starting CG" << std::endl;
+ vcl_size_t problem_size = viennacl::traits::size(rhs);
+ VectorType result(rhs);
+ viennacl::traits::clear(result);
+
+ VectorType residual = rhs;
+
+ CPU_ScalarType ip_rr = viennacl::linalg::inner_prod(rhs, rhs);
+ CPU_ScalarType new_ip_rr = 0;
+ CPU_ScalarType norm_rhs_squared = ip_rr;
+
+ if (norm_rhs_squared <= 0) //solution is zero if RHS norm is zero
+ return result;
+
+ viennacl::vector<float> residual_low_precision(problem_size, viennacl::traits::context(rhs));
+ viennacl::vector<float> result_low_precision(problem_size, viennacl::traits::context(rhs));
+ viennacl::vector<float> p_low_precision(problem_size, viennacl::traits::context(rhs));
+ viennacl::vector<float> tmp_low_precision(problem_size, viennacl::traits::context(rhs));
+ float inner_ip_rr = static_cast<float>(ip_rr);
+ float new_inner_ip_rr = 0;
+ float initial_inner_rhs_norm_squared = static_cast<float>(ip_rr);
+ float alpha;
+ float beta;
+
+ // transfer rhs to single precision:
+ p_low_precision = rhs;
+ residual_low_precision = p_low_precision;
+
+ // transfer matrix to single precision:
+ viennacl::compressed_matrix<float> matrix_low_precision(matrix.size1(), matrix.size2(), matrix.nnz(), viennacl::traits::context(rhs));
+ viennacl::backend::memory_copy(matrix.handle1(), const_cast<viennacl::backend::mem_handle &>(matrix_low_precision.handle1()), 0, 0, matrix_low_precision.handle1().raw_size() );
+ viennacl::backend::memory_copy(matrix.handle2(), const_cast<viennacl::backend::mem_handle &>(matrix_low_precision.handle2()), 0, 0, matrix_low_precision.handle2().raw_size() );
+
+ viennacl::vector_base<CPU_ScalarType> matrix_elements_high_precision(const_cast<viennacl::backend::mem_handle &>(matrix.handle()), matrix.nnz(), 0, 1);
+ viennacl::vector_base<float> matrix_elements_low_precision(matrix_low_precision.handle(), matrix.nnz(), 0, 1);
+ matrix_elements_low_precision = matrix_elements_high_precision;
+ matrix_low_precision.generate_row_block_information();
+
+ for (unsigned int i = 0; i < tag.max_iterations(); ++i)
+ {
+ tag.iters(i+1);
+
+ // lower precision 'inner iteration'
+ tmp_low_precision = viennacl::linalg::prod(matrix_low_precision, p_low_precision);
+
+ alpha = inner_ip_rr / viennacl::linalg::inner_prod(tmp_low_precision, p_low_precision);
+ result_low_precision += alpha * p_low_precision;
+ residual_low_precision -= alpha * tmp_low_precision;
+
+ new_inner_ip_rr = viennacl::linalg::inner_prod(residual_low_precision, residual_low_precision);
+
+ beta = new_inner_ip_rr / inner_ip_rr;
+ inner_ip_rr = new_inner_ip_rr;
+
+ p_low_precision = residual_low_precision + beta * p_low_precision;
+
+ //
+ // If enough progress has been achieved, update current residual with high precision evaluation
+ // This is effectively a restart of the CG method
+ //
+ if (new_inner_ip_rr < tag.inner_tolerance() * initial_inner_rhs_norm_squared || i == tag.max_iterations()-1)
+ {
+ residual = result_low_precision; // reusing residual vector as temporary buffer for conversion. Overwritten below anyway
+ result += residual;
+
+ // residual = b - Ax (without introducing a temporary)
+ residual = viennacl::linalg::prod(matrix, result);
+ residual = rhs - residual;
+
+ new_ip_rr = viennacl::linalg::inner_prod(residual, residual);
+ if (new_ip_rr / norm_rhs_squared < tag.tolerance() * tag.tolerance())//squared norms involved here
+ break;
+
+ p_low_precision = residual;
+
+ result_low_precision.clear();
+ residual_low_precision = p_low_precision;
+ initial_inner_rhs_norm_squared = static_cast<float>(new_ip_rr);
+ inner_ip_rr = static_cast<float>(new_ip_rr);
+ }
+ }
+
+ //store last error estimate:
+ tag.error(std::sqrt(new_ip_rr / norm_rhs_squared));
+
+ return result;
+ }
+
+ template<typename MatrixType, typename VectorType>
+ VectorType solve(const MatrixType & matrix, VectorType const & rhs, mixed_precision_cg_tag const & tag, viennacl::linalg::no_precond)
+ {
+ return solve(matrix, rhs, tag);
+ }
+
+
+ }
+}
+
+#endif
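
A minimal usage sketch for the mixed-precision CG solver above, assuming a sparse system stored as viennacl::compressed_matrix<double> (the solver internally transfers the matrix and residual to single precision; illustrative only):

    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/mixed_precision_cg.hpp"

    int main()
    {
      // illustrative sketch, not part of the patch
      viennacl::compressed_matrix<double> A(1000, 1000);
      viennacl::vector<double> b(1000);
      // ... assemble A and b (e.g. via viennacl::copy from host data) ...

      // outer (double) tolerance, maximum iterations, inner (float) tolerance
      viennacl::linalg::mixed_precision_cg_tag tag(1e-8, 300, 1e-2f);
      viennacl::vector<double> x = viennacl::linalg::solve(A, b, tag);

      // tag.iters() and tag.error() report the iteration count and final residual estimate.
      (void)x;
      return 0;
    }
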
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/nmf.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/nmf.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/nmf.hpp
new file mode 100644
index 0000000..c962c8e
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/nmf.hpp
@@ -0,0 +1,91 @@
+#ifndef VIENNACL_LINALG_NMF_HPP
+#define VIENNACL_LINALG_NMF_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+
+/** @file viennacl/linalg/nmf.hpp
+ @brief Provides a nonnegative matrix factorization implementation. Experimental.
+
+
+ */
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/linalg/norm_frobenius.hpp"
+
+#include "viennacl/linalg/host_based/nmf_operations.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+#include "viennacl/linalg/opencl/kernels/nmf.hpp"
+#include "viennacl/linalg/opencl/nmf_operations.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+#include "viennacl/linalg/cuda/nmf_operations.hpp"
+#endif
+
+namespace viennacl
+{
+ namespace linalg
+ {
+
+ /** @brief The nonnegative matrix factorization (approximation) algorithm as suggested by Lee and Seung. Factorizes a matrix V with nonnegative entries into matrices W and H such that ||V - W*H|| is minimized.
+ *
+ * @param V Input matrix
+ * @param W First factor
+ * @param H Second factor
+ * @param conf A configuration object holding tolerances and the like
+ */
+ template<typename ScalarType>
+ void nmf(viennacl::matrix_base<ScalarType> const & V, viennacl::matrix_base<ScalarType> & W,
+ viennacl::matrix_base<ScalarType> & H, viennacl::linalg::nmf_config const & conf)
+ {
+ assert(V.size1() == W.size1() && V.size2() == H.size2() && bool("Dimensions of W and H don't allow for V = W * H"));
+ assert(W.size2() == H.size1() && bool("Dimensions of W and H don't match, prod(W, H) impossible"));
+
+ switch (viennacl::traits::handle(V).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::nmf(V, W, H, conf);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::nmf(V,W,H,conf);
+ break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::nmf(V,W,H,conf);
+ break;
+#endif
+
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+
+ }
+
+ }
+ }
+}
+
+#endif
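
A minimal usage sketch for nmf() above, assuming V is m x n with nonnegative entries and W, H are initialized with positive values of matching shapes (illustrative only):

    #include "viennacl/matrix.hpp"
    #include "viennacl/linalg/nmf.hpp"

    int main()
    {
      // illustrative sketch, not part of the patch
      vcl_size_t m = 100, n = 80, k = 8;
      viennacl::matrix<float> V(m, n), W(m, k), H(k, n);
      // ... fill V with nonnegative data and W, H with positive initial guesses ...

      viennacl::linalg::nmf_config conf;      // default tolerances and iteration limit
      viennacl::linalg::nmf(V, W, H, conf);   // on return, W * H approximates V
      return 0;
    }
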
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/norm_1.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/norm_1.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/norm_1.hpp
new file mode 100644
index 0000000..e16238b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/norm_1.hpp
@@ -0,0 +1,104 @@
+#ifndef VIENNACL_LINALG_NORM_1_HPP_
+#define VIENNACL_LINALG_NORM_1_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file norm_1.hpp
+ @brief Generic interface for the l^1-norm. See viennacl/linalg/vector_operations.hpp for implementations.
+*/
+
+#include <cmath>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/tag_of.hpp"
+
+namespace viennacl
+{
+ //
+ // generic norm_1 function
+ // uses tag dispatch to identify which algorithm
+ // should be called
+ //
+ namespace linalg
+ {
+
+ #ifdef VIENNACL_WITH_UBLAS
+ // ----------------------------------------------------
+ // UBLAS
+ //
+ template< typename VectorT >
+ typename viennacl::enable_if< viennacl::is_ublas< typename viennacl::traits::tag_of< VectorT >::type >::value,
+ typename VectorT::value_type
+ >::type
+ norm_1(VectorT const& vector)
+ {
+ // std::cout << "ublas .. " << std::endl;
+ return boost::numeric::ublas::norm_1(vector);
+ }
+ #endif
+
+
+ // ----------------------------------------------------
+ // STL
+ //
+ template< typename T, typename A >
+ T norm_1(std::vector<T, A> const & v1)
+ {
+ //std::cout << "stl .. " << std::endl;
+ T result = 0;
+ for (typename std::vector<T, A>::size_type i=0; i<v1.size(); ++i)
+ result += std::fabs(v1[i]);
+
+ return result;
+ }
+
+ // ----------------------------------------------------
+ // VIENNACL
+ //
+ template< typename ScalarType>
+ viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+ const viennacl::vector_base<ScalarType>,
+ viennacl::op_norm_1 >
+ norm_1(viennacl::vector_base<ScalarType> const & vector)
+ {
+ return viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+ const viennacl::vector_base<ScalarType>,
+ viennacl::op_norm_1 >(vector, vector);
+ }
+
+ // with vector expression:
+ template<typename LHS, typename RHS, typename OP>
+ viennacl::scalar_expression<const viennacl::vector_expression<const LHS, const RHS, OP>,
+ const viennacl::vector_expression<const LHS, const RHS, OP>,
+ viennacl::op_norm_1>
+ norm_1(viennacl::vector_expression<const LHS, const RHS, OP> const & vector)
+ {
+ return viennacl::scalar_expression< const viennacl::vector_expression<const LHS, const RHS, OP>,
+ const viennacl::vector_expression<const LHS, const RHS, OP>,
+ viennacl::op_norm_1 >(vector, vector);
+ }
+
+ } // end namespace linalg
+} // end namespace viennacl
+#endif
+
+
+
+
+
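
A minimal usage sketch for norm_1() above, showing both the STL overload and the lazy ViennaCL expression (illustrative only):

    #include <vector>
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/norm_1.hpp"

    int main()
    {
      // illustrative sketch, not part of the patch
      std::vector<float> v_host(10, -2.0f);
      float host_result = viennacl::linalg::norm_1(v_host);   // plain loop over std::vector

      viennacl::vector<float> v(10);
      viennacl::copy(v_host, v);
      float device_result = viennacl::linalg::norm_1(v);      // expression, evaluated on assignment
      (void)host_result; (void)device_result;
      return 0;
    }
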
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/norm_2.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/norm_2.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/norm_2.hpp
new file mode 100644
index 0000000..babb285
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/norm_2.hpp
@@ -0,0 +1,140 @@
+#ifndef VIENNACL_LINALG_NORM_2_HPP_
+#define VIENNACL_LINALG_NORM_2_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file norm_2.hpp
+ @brief Generic interface for the l^2-norm. See viennacl/linalg/vector_operations.hpp for implementations.
+*/
+
+#include <cmath>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/tag_of.hpp"
+
+namespace viennacl
+{
+ //
+ // generic norm_2 function
+ // uses tag dispatch to identify which algorithm
+ // should be called
+ //
+ namespace linalg
+ {
+ #ifdef VIENNACL_WITH_MTL4
+ // ----------------------------------------------------
+ // MTL4
+ //
+ template< typename VectorT >
+ typename viennacl::enable_if< viennacl::is_mtl4< typename viennacl::traits::tag_of< VectorT >::type >::value,
+ typename VectorT::value_type>::type
+ norm_2(VectorT const & v)
+ {
+ return mtl::two_norm(v);
+ }
+ #endif
+
+ #ifdef VIENNACL_WITH_ARMADILLO
+ // ----------------------------------------------------
+ // Armadillo
+ //
+ template<typename NumericT>
+ NumericT norm_2(arma::Col<NumericT> const& v)
+ {
+ return norm(v);
+ }
+ #endif
+
+ #ifdef VIENNACL_WITH_EIGEN
+ // ----------------------------------------------------
+ // EIGEN
+ //
+ template< typename VectorT >
+ typename viennacl::enable_if< viennacl::is_eigen< typename viennacl::traits::tag_of< VectorT >::type >::value,
+ typename VectorT::RealScalar>::type
+ norm_2(VectorT const & v)
+ {
+ return v.norm();
+ }
+ #endif
+
+
+ #ifdef VIENNACL_WITH_UBLAS
+ // ----------------------------------------------------
+ // UBLAS
+ //
+ template< typename VectorT >
+ typename viennacl::enable_if< viennacl::is_ublas< typename viennacl::traits::tag_of< VectorT >::type >::value,
+ typename VectorT::value_type>::type
+ norm_2(VectorT const & v)
+ {
+ return boost::numeric::ublas::norm_2(v);
+ }
+ #endif
+
+
+ // ----------------------------------------------------
+ // STL
+ //
+ template< typename T, typename A >
+ T norm_2(std::vector<T, A> const & v1)
+ {
+ T result = 0;
+ for (typename std::vector<T, A>::size_type i=0; i<v1.size(); ++i)
+ result += v1[i] * v1[i];
+
+ return std::sqrt(result);
+ }
+
+ // ----------------------------------------------------
+ // VIENNACL
+ //
+ template< typename ScalarType>
+ viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+ const viennacl::vector_base<ScalarType>,
+ viennacl::op_norm_2 >
+ norm_2(viennacl::vector_base<ScalarType> const & v)
+ {
+ //std::cout << "viennacl .. " << std::endl;
+ return viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+ const viennacl::vector_base<ScalarType>,
+ viennacl::op_norm_2 >(v, v);
+ }
+
+ // with vector expression:
+ template<typename LHS, typename RHS, typename OP>
+ viennacl::scalar_expression<const viennacl::vector_expression<const LHS, const RHS, OP>,
+ const viennacl::vector_expression<const LHS, const RHS, OP>,
+ viennacl::op_norm_2>
+ norm_2(viennacl::vector_expression<const LHS, const RHS, OP> const & vector)
+ {
+ return viennacl::scalar_expression< const viennacl::vector_expression<const LHS, const RHS, OP>,
+ const viennacl::vector_expression<const LHS, const RHS, OP>,
+ viennacl::op_norm_2>(vector, vector);
+ }
+
+
+ } // end namespace linalg
+} // end namespace viennacl
+#endif
+
+
+
+
+
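
A minimal sketch using norm_2() above to normalize a ViennaCL vector, assuming the returned scalar_expression converts to a host scalar on assignment (illustrative only):

    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/norm_2.hpp"

    int main()
    {
      // illustrative sketch, not part of the patch
      viennacl::vector<float> v = viennacl::scalar_vector<float>(10, 3.0f);
      float nrm = viennacl::linalg::norm_2(v);   // sqrt of the sum of squares
      if (nrm > 0.0f)
        v /= nrm;                                // v now has unit Euclidean length
      return 0;
    }
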
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/norm_frobenius.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/norm_frobenius.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/norm_frobenius.hpp
new file mode 100644
index 0000000..6873a53
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/norm_frobenius.hpp
@@ -0,0 +1,73 @@
+#ifndef VIENNACL_LINALG_NORM_FROBENIUS_HPP_
+#define VIENNACL_LINALG_NORM_FROBENIUS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/norm_frobenius.hpp
+ @brief Generic interface for the Frobenius norm.
+*/
+
+#include <cmath>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/tag_of.hpp"
+
+namespace viennacl
+{
+ //
+ // generic norm_frobenius function
+ // uses tag dispatch to identify which algorithm
+ // should be called
+ //
+ namespace linalg
+ {
+
+ #ifdef VIENNACL_WITH_UBLAS
+ // ----------------------------------------------------
+ // UBLAS
+ //
+ template< typename VectorT >
+ typename viennacl::enable_if< viennacl::is_ublas< typename viennacl::traits::tag_of< VectorT >::type >::value,
+ typename VectorT::value_type
+ >::type
+ norm_frobenius(VectorT const& v1)
+ {
+ return boost::numeric::ublas::norm_frobenius(v1);
+ }
+ #endif
+
+
+ // ----------------------------------------------------
+ // VIENNACL
+ //
+ template<typename NumericT>
+ scalar_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_norm_frobenius>
+ norm_frobenius(const matrix_base<NumericT> & A)
+ {
+ return scalar_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_norm_frobenius>(A, A);
+ }
+
+ } // end namespace linalg
+} // end namespace viennacl
+#endif
+
+
+
+
+
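
A minimal sketch for norm_frobenius() above, assuming a dense viennacl::matrix (illustrative only):

    #include "viennacl/matrix.hpp"
    #include "viennacl/linalg/norm_frobenius.hpp"

    int main()
    {
      // illustrative sketch, not part of the patch
      viennacl::matrix<float> A = viennacl::scalar_matrix<float>(16, 16, 1.0f);
      float f_norm = viennacl::linalg::norm_frobenius(A);   // sqrt of the sum of squared entries (16 here)
      (void)f_norm;
      return 0;
    }
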
[22/51] [partial] mahout git commit: (nojira) add native-viennaCL
module to codebase. closes apache/mahout#241
Posted by ap...@apache.org.
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/direct_solve.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/direct_solve.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/direct_solve.hpp
new file mode 100644
index 0000000..1d212c2
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/direct_solve.hpp
@@ -0,0 +1,307 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_DIRECT_SOLVE_HPP
+#define VIENNACL_LINALG_HOST_BASED_DIRECT_SOLVE_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/direct_solve.hpp
+ @brief Implementations of dense direct triangular solvers are found here.
+*/
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+
+#include "viennacl/linalg/host_based/common.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+
+namespace detail
+{
+ //
+ // Upper solve:
+ //
+ template<typename MatrixT1, typename MatrixT2>
+ void upper_inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, bool unit_diagonal)
+ {
+ typedef typename MatrixT2::value_type value_type;
+
+ for (vcl_size_t i = 0; i < A_size; ++i)
+ {
+ vcl_size_t current_row = A_size - i - 1;
+
+ for (vcl_size_t j = current_row + 1; j < A_size; ++j)
+ {
+ value_type A_element = A(current_row, j);
+ for (vcl_size_t k=0; k < B_size; ++k)
+ B(current_row, k) -= A_element * B(j, k);
+ }
+
+ if (!unit_diagonal)
+ {
+ value_type A_diag = A(current_row, current_row);
+ for (vcl_size_t k=0; k < B_size; ++k)
+ B(current_row, k) /= A_diag;
+ }
+ }
+ }
+
+ template<typename MatrixT1, typename MatrixT2>
+ void inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::unit_upper_tag)
+ {
+ upper_inplace_solve_matrix(A, B, A_size, B_size, true);
+ }
+
+ template<typename MatrixT1, typename MatrixT2>
+ void inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::upper_tag)
+ {
+ upper_inplace_solve_matrix(A, B, A_size, B_size, false);
+ }
+
+ //
+ // Lower solve:
+ //
+ template<typename MatrixT1, typename MatrixT2>
+ void lower_inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, bool unit_diagonal)
+ {
+ typedef typename MatrixT2::value_type value_type;
+
+ for (vcl_size_t i = 0; i < A_size; ++i)
+ {
+ for (vcl_size_t j = 0; j < i; ++j)
+ {
+ value_type A_element = A(i, j);
+ for (vcl_size_t k=0; k < B_size; ++k)
+ B(i, k) -= A_element * B(j, k);
+ }
+
+ if (!unit_diagonal)
+ {
+ value_type A_diag = A(i, i);
+ for (vcl_size_t k=0; k < B_size; ++k)
+ B(i, k) /= A_diag;
+ }
+ }
+ }
+
+ template<typename MatrixT1, typename MatrixT2>
+ void inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::unit_lower_tag)
+ {
+ lower_inplace_solve_matrix(A, B, A_size, B_size, true);
+ }
+
+ template<typename MatrixT1, typename MatrixT2>
+ void inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::lower_tag)
+ {
+ lower_inplace_solve_matrix(A, B, A_size, B_size, false);
+ }
+
+}
+
+//
+// Note: By convention, all size checks are performed in the calling frontend. No need to double-check here.
+//
+
+////////////////// upper triangular solver (upper_tag) //////////////////////////////////////
+/** @brief Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B (MATLAB notation)
+*
+* @param A The system matrix
+* @param B The matrix of right-hand sides; the solution is written directly into B
+*/
+template<typename NumericT, typename SolverTagT>
+void inplace_solve(matrix_base<NumericT> const & A,
+ matrix_base<NumericT> & B,
+ SolverTagT)
+{
+ typedef NumericT value_type;
+
+ value_type const * data_A = detail::extract_raw_pointer<value_type>(A);
+ value_type * data_B = detail::extract_raw_pointer<value_type>(B);
+
+ vcl_size_t A_start1 = viennacl::traits::start1(A);
+ vcl_size_t A_start2 = viennacl::traits::start2(A);
+ vcl_size_t A_inc1 = viennacl::traits::stride1(A);
+ vcl_size_t A_inc2 = viennacl::traits::stride2(A);
+ //vcl_size_t A_size1 = viennacl::traits::size1(A);
+ vcl_size_t A_size2 = viennacl::traits::size2(A);
+ vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(A);
+ vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(A);
+
+ vcl_size_t B_start1 = viennacl::traits::start1(B);
+ vcl_size_t B_start2 = viennacl::traits::start2(B);
+ vcl_size_t B_inc1 = viennacl::traits::stride1(B);
+ vcl_size_t B_inc2 = viennacl::traits::stride2(B);
+ //vcl_size_t B_size1 = viennacl::traits::size1(B);
+ vcl_size_t B_size2 = viennacl::traits::size2(B);
+ vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(B);
+ vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(B);
+
+
+ if (A.row_major() && B.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+ detail::inplace_solve_matrix(wrapper_A, wrapper_B, A_size2, B_size2, SolverTagT());
+ }
+ else if (A.row_major() && !B.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+ detail::inplace_solve_matrix(wrapper_A, wrapper_B, A_size2, B_size2, SolverTagT());
+ }
+ else if (!A.row_major() && B.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+ detail::inplace_solve_matrix(wrapper_A, wrapper_B, A_size2, B_size2, SolverTagT());
+ }
+ else
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+ detail::inplace_solve_matrix(wrapper_A, wrapper_B, A_size2, B_size2, SolverTagT());
+ }
+}
+
+
+//
+// Solve on vector
+//
+
+namespace detail
+{
+ //
+ // Upper solve:
+ //
+ template<typename MatrixT, typename VectorT>
+ void upper_inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, bool unit_diagonal)
+ {
+ typedef typename VectorT::value_type value_type;
+
+ for (vcl_size_t i = 0; i < A_size; ++i)
+ {
+ vcl_size_t current_row = A_size - i - 1;
+
+ for (vcl_size_t j = current_row + 1; j < A_size; ++j)
+ {
+ value_type A_element = A(current_row, j);
+ b(current_row) -= A_element * b(j);
+ }
+
+ if (!unit_diagonal)
+ b(current_row) /= A(current_row, current_row);
+ }
+ }
+
+ template<typename MatrixT, typename VectorT>
+ void inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, viennacl::linalg::unit_upper_tag)
+ {
+ upper_inplace_solve_vector(A, b, A_size, true);
+ }
+
+ template<typename MatrixT, typename VectorT>
+ void inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, viennacl::linalg::upper_tag)
+ {
+ upper_inplace_solve_vector(A, b, A_size, false);
+ }
+
+ //
+ // Lower solve:
+ //
+ template<typename MatrixT, typename VectorT>
+ void lower_inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, bool unit_diagonal)
+ {
+ typedef typename VectorT::value_type value_type;
+
+ for (vcl_size_t i = 0; i < A_size; ++i)
+ {
+ for (vcl_size_t j = 0; j < i; ++j)
+ {
+ value_type A_element = A(i, j);
+ b(i) -= A_element * b(j);
+ }
+
+ if (!unit_diagonal)
+ b(i) /= A(i, i);
+ }
+ }
+
+ template<typename MatrixT, typename VectorT>
+ void inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, viennacl::linalg::unit_lower_tag)
+ {
+ lower_inplace_solve_vector(A, b, A_size, true);
+ }
+
+ template<typename MatrixT, typename VectorT>
+ void inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, viennacl::linalg::lower_tag)
+ {
+ lower_inplace_solve_vector(A, b, A_size, false);
+ }
+
+}
+
+template<typename NumericT, typename SolverTagT>
+void inplace_solve(matrix_base<NumericT> const & mat,
+ vector_base<NumericT> & vec,
+ SolverTagT)
+{
+ typedef NumericT value_type;
+
+ value_type const * data_A = detail::extract_raw_pointer<value_type>(mat);
+ value_type * data_v = detail::extract_raw_pointer<value_type>(vec);
+
+ vcl_size_t A_start1 = viennacl::traits::start1(mat);
+ vcl_size_t A_start2 = viennacl::traits::start2(mat);
+ vcl_size_t A_inc1 = viennacl::traits::stride1(mat);
+ vcl_size_t A_inc2 = viennacl::traits::stride2(mat);
+ vcl_size_t A_size2 = viennacl::traits::size2(mat);
+ vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat);
+ vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat);
+
+ vcl_size_t start1 = viennacl::traits::start(vec);
+ vcl_size_t inc1 = viennacl::traits::stride(vec);
+
+ if (mat.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::vector_array_wrapper<value_type> wrapper_v(data_v, start1, inc1);
+
+ detail::inplace_solve_vector(wrapper_A, wrapper_v, A_size2, SolverTagT());
+ }
+ else
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::vector_array_wrapper<value_type> wrapper_v(data_v, start1, inc1);
+
+ detail::inplace_solve_vector(wrapper_A, wrapper_v, A_size2, SolverTagT());
+ }
+}
+
+
+} // namespace host_based
+} // namespace linalg
+} // namespace viennacl
+
+#endif
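
These host kernels are normally reached through the generic inplace_solve() frontend (viennacl/linalg/direct_solve.hpp); a minimal sketch of that usage for an upper-triangular system, illustrative only:

    #include "viennacl/matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/direct_solve.hpp"

    int main()
    {
      // illustrative sketch, not part of the patch
      viennacl::matrix<float> A(8, 8);   // assumed upper triangular
      viennacl::vector<float> b(8);
      // ... fill A (upper triangular, nonzero diagonal) and b ...

      // In-place back substitution: b is overwritten with the solution x of A * x = b.
      viennacl::linalg::inplace_solve(A, b, viennacl::linalg::upper_tag());
      return 0;
    }
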
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/fft_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/fft_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/fft_operations.hpp
new file mode 100644
index 0000000..f53f8f2
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/fft_operations.hpp
@@ -0,0 +1,856 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_FFT_OPERATIONS_HPP_
+#define VIENNACL_LINALG_HOST_BASED_FFT_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/fft_operations.hpp
+ @brief Implementations of the Fast Fourier Transform using plain single-threaded or OpenMP-enabled execution on the CPU
+ */
+
+//TODO: OpenMP conditions
+#include <viennacl/vector.hpp>
+#include <viennacl/matrix.hpp>
+
+#include "viennacl/linalg/host_based/vector_operations.hpp"
+
+#include <stdexcept>
+#include <cmath>
+#include <complex>
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+namespace detail
+{
+ namespace fft
+ {
+ const vcl_size_t MAX_LOCAL_POINTS_NUM = 512;
+
+ namespace FFT_DATA_ORDER
+ {
+ enum DATA_ORDER
+ {
+ ROW_MAJOR, COL_MAJOR
+ };
+ }
+
+ inline vcl_size_t num_bits(vcl_size_t size)
+ {
+ vcl_size_t bits_datasize = 0;
+ vcl_size_t ds = 1;
+
+ while (ds < size)
+ {
+ ds = ds << 1;
+ bits_datasize++;
+ }
+
+ return bits_datasize;
+ }
+
+ inline vcl_size_t next_power_2(vcl_size_t n)
+ {
+ n = n - 1;
+
+ vcl_size_t power = 1;
+
+ while (power < sizeof(vcl_size_t) * 8)
+ {
+ n = n | (n >> power);
+ power *= 2;
+ }
+
+ return n + 1;
+ }
+
+ inline vcl_size_t get_reorder_num(vcl_size_t v, vcl_size_t bit_size)
+ {
+ v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
+ v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
+ v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
+ v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
+ v = (v >> 16) | (v << 16);
+ v = v >> (32 - bit_size);
+ return v;
+ }
+
+ template<typename NumericT, unsigned int AlignmentV>
+ void copy_to_complex_array(std::complex<NumericT> * input_complex,
+ viennacl::vector<NumericT, AlignmentV> const & in, vcl_size_t size)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i2 = 0; i2 < long(size * 2); i2 += 2)
+ { //change array to complex array
+ vcl_size_t i = vcl_size_t(i2);
+ input_complex[i / 2] = std::complex<NumericT>(in[i], in[i + 1]);
+ }
+ }
+
+ template<typename NumericT>
+ void copy_to_complex_array(std::complex<NumericT> * input_complex,
+ viennacl::vector_base<NumericT> const & in, vcl_size_t size)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i2 = 0; i2 < long(size * 2); i2 += 2)
+ { //change array to complex array
+ vcl_size_t i = vcl_size_t(i2);
+ input_complex[i / 2] = std::complex<NumericT>(in[i], in[i + 1]);
+ }
+ }
+
+ template<typename NumericT, unsigned int AlignmentV>
+ void copy_to_vector(std::complex<NumericT> * input_complex,
+ viennacl::vector<NumericT, AlignmentV> & in, vcl_size_t size)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i2 = 0; i2 < long(size); i2++)
+ {
+ vcl_size_t i = vcl_size_t(i2);
+ in(i * 2) = static_cast<NumericT>(std::real(input_complex[i]));
+ in(i * 2 + 1) = static_cast<NumericT>(std::imag(input_complex[i]));
+ }
+ }
+
+ template<typename NumericT>
+ void copy_to_complex_array(std::complex<NumericT> * input_complex,
+ NumericT const * in, vcl_size_t size)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i2 = 0; i2 < long(size * 2); i2 += 2)
+ { //change array to complex array
+ vcl_size_t i = vcl_size_t(i2);
+ input_complex[i / 2] = std::complex<NumericT>(in[i], in[i + 1]);
+ }
+ }
+
+ template<typename NumericT>
+ void copy_to_vector(std::complex<NumericT> * input_complex, NumericT * in, vcl_size_t size)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i2 = 0; i2 < long(size); i2++)
+ {
+ vcl_size_t i = vcl_size_t(i2);
+ in[i * 2] = static_cast<NumericT>(std::real(input_complex[i]));
+ in[i * 2 + 1] = static_cast<NumericT>(std::imag(input_complex[i]));
+ }
+ }
+
+ template<typename NumericT>
+ void copy_to_vector(std::complex<NumericT> * input_complex,
+ viennacl::vector_base<NumericT> & in, vcl_size_t size)
+ {
+ std::vector<NumericT> temp(2 * size);
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i2 = 0; i2 < long(size); i2++)
+ {
+ vcl_size_t i = vcl_size_t(i2);
+ temp[i * 2] = static_cast<NumericT>(std::real(input_complex[i]));
+ temp[i * 2 + 1] = static_cast<NumericT>(std::imag(input_complex[i]));
+ }
+ viennacl::copy(temp, in);
+ }
+
+ template<typename NumericT>
+ void zero2(NumericT *input1, NumericT *input2, vcl_size_t size)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i2 = 0; i2 < long(size); i2++)
+ {
+ vcl_size_t i = vcl_size_t(i2);
+ input1[i] = 0;
+ input2[i] = 0;
+ }
+ }
+
+ } //namespace fft
+
+} //namespace detail
+
+/**
+ * @brief Direct algorithm kernel
+ */
+template<typename NumericT>
+void fft_direct(std::complex<NumericT> * input_complex, std::complex<NumericT> * output,
+ vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign,
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+ NumericT const NUM_PI = NumericT(3.14159265358979323846);
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel
+#endif
+ for (long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++)
+ {
+ vcl_size_t batch_id = vcl_size_t(batch_id2);
+ for (vcl_size_t k = 0; k < size; k += 1)
+ {
+ std::complex<NumericT> f = 0;
+ for (vcl_size_t n = 0; n < size; n++)
+ {
+ std::complex<NumericT> input;
+ if (!data_order)
+ input = input_complex[batch_id * stride + n]; //input index here
+ else
+ input = input_complex[n * stride + batch_id];
+ NumericT arg = sign * 2 * NUM_PI * NumericT(k) * NumericT(n) / NumericT(size); // DFT phase 2*pi*k*n/size (avoids division by zero for n == 0)
+ NumericT sn = std::sin(arg);
+ NumericT cs = std::cos(arg);
+
+ std::complex<NumericT> ex(cs, sn);
+ std::complex<NumericT> tmp(input.real() * ex.real() - input.imag() * ex.imag(),
+ input.real() * ex.imag() + input.imag() * ex.real());
+ f = f + tmp;
+ }
+ if (!data_order)
+ output[batch_id * stride + k] = f; // output index here
+ else
+ output[k * stride + batch_id] = f;
+ }
+ }
+
+}
+
+/**
+ * @brief Direct 1D algorithm for computing Fourier transformation.
+ *
+ * Works on data of any size.
+ * The serial implementation has O(n^2) complexity.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void direct(viennacl::vector<NumericT, AlignmentV> const & in,
+ viennacl::vector<NumericT, AlignmentV> & out,
+ vcl_size_t size, vcl_size_t stride,
+ vcl_size_t batch_num, NumericT sign = NumericT(-1),
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+ std::vector<std::complex<NumericT> > input_complex(size * batch_num);
+ std::vector<std::complex<NumericT> > output(size * batch_num);
+
+ viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], in, size * batch_num);
+
+ fft_direct(&input_complex[0], &output[0], size, stride, batch_num, sign, data_order);
+
+ viennacl::linalg::host_based::detail::fft::copy_to_vector(&output[0], out, size * batch_num);
+}
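+
+/* Usage sketch (illustrative only; the names n, v and w are placeholders chosen here,
+   not part of the library): forward DFT of a single batch of n complex values stored
+   interleaved as (re, im) pairs in a viennacl::vector of length 2*n. For a single
+   batch, stride == size.
+
+     vcl_size_t n = 8;
+     viennacl::vector<float> v(2 * n), w(2 * n);
+     // ... fill v with interleaved (re, im) samples ...
+     viennacl::linalg::host_based::direct(v, w, n, n, 1);   // sign defaults to -1 (forward transform)
+*/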
+
+/**
+ * @brief Direct 2D algorithm for computing Fourier transformation.
+ *
+ * Works on data of any size.
+ * The serial implementation has O(n^2) complexity.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void direct(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & in,
+ viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & out, vcl_size_t size,
+ vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+ vcl_size_t row_num = in.internal_size1();
+ vcl_size_t col_num = in.internal_size2() >> 1;
+
+ vcl_size_t size_mat = row_num * col_num;
+
+ std::vector<std::complex<NumericT> > input_complex(size_mat);
+ std::vector<std::complex<NumericT> > output(size_mat);
+
+ NumericT const * data_A = detail::extract_raw_pointer<NumericT>(in);
+ NumericT * data_B = detail::extract_raw_pointer<NumericT>(out);
+
+ viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], data_A, size_mat);
+
+ fft_direct(&input_complex[0], &output[0], size, stride, batch_num, sign, data_order);
+
+ viennacl::linalg::host_based::detail::fft::copy_to_vector(&output[0], data_B, size_mat);
+}
+
+/*
+ * This function reorders 1D input data: indices are permuted into bit-reversal order.
+ * Such reordering should be done before an in-place FFT.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void reorder(viennacl::vector<NumericT, AlignmentV>& in, vcl_size_t size, vcl_size_t stride,
+ vcl_size_t bits_datasize, vcl_size_t batch_num,
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+ std::vector<std::complex<NumericT> > input(size * batch_num);
+ viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input[0], in, size * batch_num);
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++)
+ {
+ vcl_size_t batch_id = vcl_size_t(batch_id2);
+ for (vcl_size_t i = 0; i < size; i++)
+ {
+ vcl_size_t v = viennacl::linalg::host_based::detail::fft::get_reorder_num(i, bits_datasize);
+ if (i < v)
+ {
+ if (!data_order)
+ {
+ std::complex<NumericT> tmp = input[batch_id * stride + i]; // index
+ input[batch_id * stride + i] = input[batch_id * stride + v]; //index
+ input[batch_id * stride + v] = tmp; //index
+ }
+ else
+ {
+ std::complex<NumericT> tmp = input[i * stride + batch_id]; // index
+ input[i * stride + batch_id] = input[v * stride + batch_id]; //index
+ input[v * stride + batch_id] = tmp; //index
+ }
+ }
+ }
+ }
+ viennacl::linalg::host_based::detail::fft::copy_to_vector(&input[0], in, size * batch_num);
+}
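+
+/* Worked example (illustrative): for size = 8 (bits_datasize = 3) the bit-reversed
+   indices are 0, 4, 2, 6, 1, 5, 3, 7, so the pairs (1, 4) and (3, 6) are swapped
+   while 0, 2, 5 and 7 stay in place.
+*/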
+
+/*
+ * This function reorders 2D input data: indices are permuted into bit-reversal order.
+ * Such reordering should be done before an in-place FFT.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void reorder(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& in,
+ vcl_size_t size, vcl_size_t stride, vcl_size_t bits_datasize, vcl_size_t batch_num,
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+
+ NumericT * data = detail::extract_raw_pointer<NumericT>(in);
+ vcl_size_t row_num = in.internal_size1();
+ vcl_size_t col_num = in.internal_size2() >> 1;
+ vcl_size_t size_mat = row_num * col_num;
+
+ std::vector<std::complex<NumericT> > input(size_mat);
+
+ viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input[0], data, size_mat);
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++)
+ {
+ vcl_size_t batch_id = vcl_size_t(batch_id2);
+ for (vcl_size_t i = 0; i < size; i++)
+ {
+ vcl_size_t v = viennacl::linalg::host_based::detail::fft::get_reorder_num(i, bits_datasize);
+ if (i < v)
+ {
+ if (!data_order)
+ {
+ std::complex<NumericT> tmp = input[batch_id * stride + i]; // index
+ input[batch_id * stride + i] = input[batch_id * stride + v]; //index
+ input[batch_id * stride + v] = tmp; //index
+ } else
+ {
+ std::complex<NumericT> tmp = input[i * stride + batch_id]; // index
+ input[i * stride + batch_id] = input[v * stride + batch_id]; //index
+ input[v * stride + batch_id] = tmp; //index
+ }
+ }
+ }
+ }
+ viennacl::linalg::host_based::detail::fft::copy_to_vector(&input[0], data, size_mat);
+}
+
+/**
+ * @brief Radix-2 algorithm for computing Fourier transformation.
+ * Kernel used by radix2() for larger data sizes (size > MAX_LOCAL_POINTS_NUM)
+ */
+template<typename NumericT>
+void fft_radix2(std::complex<NumericT> * input_complex, vcl_size_t batch_num,
+ vcl_size_t bit_size, vcl_size_t size, vcl_size_t stride, NumericT sign,
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+ NumericT const NUM_PI = NumericT(3.14159265358979323846);
+
+ for (vcl_size_t step = 0; step < bit_size; step++)
+ {
+ vcl_size_t ss = 1 << step;
+ vcl_size_t half_size = size >> 1;
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++)
+ {
+ vcl_size_t batch_id = vcl_size_t(batch_id2);
+ for (vcl_size_t tid = 0; tid < half_size; tid++)
+ {
+ vcl_size_t group = (tid & (ss - 1));
+ vcl_size_t pos = ((tid >> step) << (step + 1)) + group;
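+ // Example: for step = 1 (ss = 2) and tid = 5: group = 5 & 1 = 1,
+ // pos = ((5 >> 1) << 2) + 1 = 9, so this butterfly pairs elements pos and pos + ss, i.e. 9 and 11.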
+ std::complex<NumericT> in1;
+ std::complex<NumericT> in2;
+ vcl_size_t offset;
+ if (!data_order)
+ {
+ offset = batch_id * stride + pos;
+ in1 = input_complex[offset];
+ in2 = input_complex[offset + ss];
+ }
+ else
+ {
+ offset = pos * stride + batch_id;
+ in1 = input_complex[offset];
+ in2 = input_complex[offset + ss * stride];
+ }
+ NumericT arg = NumericT(group) * sign * NUM_PI / NumericT(ss);
+ NumericT sn = std::sin(arg);
+ NumericT cs = std::cos(arg);
+ std::complex<NumericT> ex(cs, sn);
+ std::complex<NumericT> tmp(in2.real() * ex.real() - in2.imag() * ex.imag(),
+ in2.real() * ex.imag() + in2.imag() * ex.real());
+ if (!data_order)
+ input_complex[offset + ss] = in1 - tmp;
+ else
+ input_complex[offset + ss * stride] = in1 - tmp;
+ input_complex[offset] = in1 + tmp;
+ }
+ }
+ }
+
+}
+
+/**
+ * @brief Radix-2 algorithm for computing Fourier transformation.
+ * Kernel using a local buffer, used by radix2() for smaller data sizes (size <= MAX_LOCAL_POINTS_NUM)
+ */
+template<typename NumericT>
+void fft_radix2_local(std::complex<NumericT> * input_complex,
+ std::complex<NumericT> * lcl_input, vcl_size_t batch_num, vcl_size_t bit_size,
+ vcl_size_t size, vcl_size_t stride, NumericT sign,
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+ NumericT const NUM_PI = NumericT(3.14159265358979323846);
+
+ for (vcl_size_t batch_id = 0; batch_id < batch_num; batch_id++)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long p2 = 0; p2 < long(size); p2 += 1)
+ {
+ vcl_size_t p = vcl_size_t(p2);
+ vcl_size_t v = viennacl::linalg::host_based::detail::fft::get_reorder_num(p, bit_size);
+
+ if (!data_order)
+ lcl_input[v] = input_complex[batch_id * stride + p]; //index
+ else
+ lcl_input[v] = input_complex[p * stride + batch_id];
+ }
+
+ for (vcl_size_t s = 0; s < bit_size; s++)
+ {
+ vcl_size_t ss = 1 << s;
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long tid2 = 0; tid2 < long(size)/2; tid2++)
+ {
+ vcl_size_t tid = vcl_size_t(tid2);
+ vcl_size_t group = (tid & (ss - 1));
+ vcl_size_t pos = ((tid >> s) << (s + 1)) + group;
+
+ std::complex<NumericT> in1 = lcl_input[pos];
+ std::complex<NumericT> in2 = lcl_input[pos + ss];
+
+ NumericT arg = NumericT(group) * sign * NUM_PI / NumericT(ss);
+
+ NumericT sn = std::sin(arg);
+ NumericT cs = std::cos(arg);
+ std::complex<NumericT> ex(cs, sn);
+
+ std::complex<NumericT> tmp(in2.real() * ex.real() - in2.imag() * ex.imag(),
+ in2.real() * ex.imag() + in2.imag() * ex.real());
+
+ lcl_input[pos + ss] = in1 - tmp;
+ lcl_input[pos] = in1 + tmp;
+ }
+
+ }
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ //copy local array back to global memory
+ for (long p2 = 0; p2 < long(size); p2 += 1)
+ {
+ vcl_size_t p = vcl_size_t(p2);
+ if (!data_order)
+ input_complex[batch_id * stride + p] = lcl_input[p];
+ else
+ input_complex[p * stride + batch_id] = lcl_input[p];
+
+ }
+
+ }
+
+}
+
+/**
+ * @brief Radix-2 1D algorithm for computing Fourier transformation.
+ *
+ * Works only on power-of-two sizes of data.
+ * The serial implementation has O(n log n) complexity.
+ * This is a Cooley-Tukey algorithm.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void radix2(viennacl::vector<NumericT, AlignmentV>& in, vcl_size_t size, vcl_size_t stride,
+ vcl_size_t batch_num, NumericT sign = NumericT(-1),
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+
+ vcl_size_t bit_size = viennacl::linalg::host_based::detail::fft::num_bits(size);
+
+ std::vector<std::complex<NumericT> > input_complex(size * batch_num);
+ std::vector<std::complex<NumericT> > lcl_input(size * batch_num);
+ viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], in, size * batch_num);
+
+ if (size <= viennacl::linalg::host_based::detail::fft::MAX_LOCAL_POINTS_NUM)
+ {
+ viennacl::linalg::host_based::fft_radix2_local(&input_complex[0], &lcl_input[0], batch_num, bit_size, size, stride, sign, data_order);
+ }
+ else
+ {
+ viennacl::linalg::host_based::reorder<NumericT>(in, size, stride, bit_size, batch_num, data_order);
+ viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], in, size * batch_num);
+ viennacl::linalg::host_based::fft_radix2(&input_complex[0], batch_num, bit_size, size, stride, sign, data_order);
+ }
+
+ viennacl::linalg::host_based::detail::fft::copy_to_vector(&input_complex[0], in, size * batch_num);
+}
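+
+/* Usage sketch (illustrative only; n and v are placeholder names): in-place forward
+   transform of a single batch of 16 interleaved complex values; the size must be a
+   power of two.
+
+     vcl_size_t n = 16;
+     viennacl::vector<float> v(2 * n);
+     // ... fill v with interleaved (re, im) samples ...
+     viennacl::linalg::host_based::radix2(v, n, n, 1);   // transforms v in place
+*/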
+
+/**
+ * @brief Radix-2 2D algorithm for computing Fourier transformation.
+ *
+ * Works only on power-of-two sizes of data.
+ * The serial implementation has O(n log n) complexity.
+ * This is a Cooley-Tukey algorithm.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void radix2(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& in, vcl_size_t size,
+ vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+
+ vcl_size_t bit_size = viennacl::linalg::host_based::detail::fft::num_bits(size);
+
+ NumericT * data = detail::extract_raw_pointer<NumericT>(in);
+
+ vcl_size_t row_num = in.internal_size1();
+ vcl_size_t col_num = in.internal_size2() >> 1;
+ vcl_size_t size_mat = row_num * col_num;
+
+ std::vector<std::complex<NumericT> > input_complex(size_mat);
+
+ viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], data, size_mat);
+ if (size <= viennacl::linalg::host_based::detail::fft::MAX_LOCAL_POINTS_NUM)
+ {
+ //std::cout<<bit_size<<","<<size<<","<<stride<<","<<batch_num<<","<<size<<","<<sign<<","<<data_order<<std::endl;
+ std::vector<std::complex<NumericT> > lcl_input(size_mat);
+ viennacl::linalg::host_based::fft_radix2_local(&input_complex[0], &lcl_input[0], batch_num, bit_size, size, stride, sign, data_order);
+ }
+ else
+ {
+ viennacl::linalg::host_based::reorder<NumericT>(in, size, stride, bit_size, batch_num, data_order);
+ viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], data, size_mat);
+ viennacl::linalg::host_based::fft_radix2(&input_complex[0], batch_num, bit_size, size, stride, sign, data_order);
+ }
+
+ viennacl::linalg::host_based::detail::fft::copy_to_vector(&input_complex[0], data, size_mat);
+
+}
+
+/**
+ * @brief Bluestein's algorithm for computing Fourier transformation.
+ *
+ * Currently works only for input sizes less than 2^16.
+ * Uses a considerable amount of additional memory, but should be fast for any data size.
+ * The serial implementation has roughly O(n log n) complexity.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void bluestein(viennacl::vector<NumericT, AlignmentV>& in, viennacl::vector<NumericT, AlignmentV>& out, vcl_size_t /*batch_num*/)
+{
+
+ vcl_size_t size = in.size() >> 1;
+ vcl_size_t ext_size = viennacl::linalg::host_based::detail::fft::next_power_2(2 * size - 1);
+
+ viennacl::vector<NumericT, AlignmentV> A(ext_size << 1);
+ viennacl::vector<NumericT, AlignmentV> B(ext_size << 1);
+ viennacl::vector<NumericT, AlignmentV> Z(ext_size << 1);
+
+ std::vector<std::complex<NumericT> > input_complex(size);
+ std::vector<std::complex<NumericT> > output_complex(size);
+
+ std::vector<std::complex<NumericT> > A_complex(ext_size);
+ std::vector<std::complex<NumericT> > B_complex(ext_size);
+ std::vector<std::complex<NumericT> > Z_complex(ext_size);
+
+ viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], in, size);
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i2 = 0; i2 < long(ext_size); i2++)
+ {
+ vcl_size_t i = vcl_size_t(i2);
+ A_complex[i] = 0;
+ B_complex[i] = 0;
+ }
+
+ vcl_size_t double_size = size << 1;
+
+ NumericT const NUM_PI = NumericT(3.14159265358979323846);
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i2 = 0; i2 < long(size); i2++)
+ {
+ vcl_size_t i = vcl_size_t(i2);
+ vcl_size_t rm = i * i % (double_size);
+ NumericT angle = NumericT(rm) / NumericT(size) * NumericT(NUM_PI);
+
+ NumericT sn_a = std::sin(-angle);
+ NumericT cs_a = std::cos(-angle);
+
+ std::complex<NumericT> a_i(cs_a, sn_a);
+ std::complex<NumericT> b_i(cs_a, -sn_a);
+
+ A_complex[i] = std::complex<NumericT>(input_complex[i].real() * a_i.real() - input_complex[i].imag() * a_i.imag(),
+ input_complex[i].real() * a_i.imag() + input_complex[i].imag() * a_i.real());
+ B_complex[i] = b_i;
+
+ // mirror b_i to the end of the buffer; the guard below avoids an out-of-bounds write to B_complex[ext_size] for i == 0
+ if (i)
+ B_complex[ext_size - i] = b_i;
+ }
+
+ viennacl::linalg::host_based::detail::fft::copy_to_vector(&input_complex[0], in, size);
+ viennacl::linalg::host_based::detail::fft::copy_to_vector(&A_complex[0], A, ext_size);
+ viennacl::linalg::host_based::detail::fft::copy_to_vector(&B_complex[0], B, ext_size);
+
+ viennacl::linalg::convolve_i(A, B, Z);
+
+ viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&Z_complex[0], Z, ext_size);
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i2 = 0; i2 < long(size); i2++)
+ {
+ vcl_size_t i = vcl_size_t(i2);
+ vcl_size_t rm = i * i % (double_size);
+ NumericT angle = NumericT(rm) / NumericT(size) * NumericT(-NUM_PI);
+ NumericT sn_a = std::sin(angle);
+ NumericT cs_a = std::cos(angle);
+ std::complex<NumericT> b_i(cs_a, sn_a);
+ output_complex[i] = std::complex<NumericT>(Z_complex[i].real() * b_i.real() - Z_complex[i].imag() * b_i.imag(),
+ Z_complex[i].real() * b_i.imag() + Z_complex[i].imag() * b_i.real());
+ }
+ viennacl::linalg::host_based::detail::fft::copy_to_vector(&output_complex[0], out, size);
+
+}
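+
+/* Usage sketch (illustrative only; n, v and w are placeholder names): Bluestein's
+   method handles lengths that are not powers of two by embedding the problem into a
+   larger power-of-two convolution.
+
+     vcl_size_t n = 12;                                  // not a power of two
+     viennacl::vector<float> v(2 * n), w(2 * n);
+     // ... fill v with interleaved (re, im) samples ...
+     viennacl::linalg::host_based::bluestein(v, w, 1);   // batch argument is currently unused
+*/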
+
+/**
+ * @brief Normalizes the vector by its own size (the number of complex elements)
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void normalize(viennacl::vector<NumericT, AlignmentV> & input)
+{
+ vcl_size_t size = input.size() >> 1;
+ NumericT norm_factor = static_cast<NumericT>(size);
+ for (vcl_size_t i = 0; i < size * 2; i++)
+ input[i] /= norm_factor;
+
+}
+
+/**
+ * @brief Element-wise complex multiplication of two vectors
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void multiply_complex(viennacl::vector<NumericT, AlignmentV> const & input1,
+ viennacl::vector<NumericT, AlignmentV> const & input2,
+ viennacl::vector<NumericT, AlignmentV> & output)
+{
+ vcl_size_t size = input1.size() >> 1;
+
+ std::vector<std::complex<NumericT> > input1_complex(size);
+ std::vector<std::complex<NumericT> > input2_complex(size);
+ std::vector<std::complex<NumericT> > output_complex(size);
+ viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input1_complex[0], input1, size);
+ viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input2_complex[0], input2, size);
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i2 = 0; i2 < long(size); i2++)
+ {
+ vcl_size_t i = vcl_size_t(i2);
+ std::complex<NumericT> in1 = input1_complex[i];
+ std::complex<NumericT> in2 = input2_complex[i];
+ output_complex[i] = std::complex<NumericT>(in1.real() * in2.real() - in1.imag() * in2.imag(),
+ in1.real() * in2.imag() + in1.imag() * in2.real());
+ }
+ viennacl::linalg::host_based::detail::fft::copy_to_vector(&output_complex[0], output, size);
+
+}
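+
+/* Worked example (illustrative): with interleaved storage, input1 = {1, 2} and
+   input2 = {3, 4} represent (1 + 2i) and (3 + 4i); their product is -5 + 10i,
+   so output becomes {-5, 10}.
+*/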
+/**
+ * @brief In-place transpose of a matrix
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & input)
+{
+ vcl_size_t row_num = input.internal_size1() / 2;
+ vcl_size_t col_num = input.internal_size2() / 2;
+
+ vcl_size_t size = row_num * col_num;
+
+ NumericT * data = detail::extract_raw_pointer<NumericT>(input);
+
+ std::vector<std::complex<NumericT> > input_complex(size);
+
+ viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], data, size);
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i2 = 0; i2 < long(size); i2++)
+ {
+ vcl_size_t i = vcl_size_t(i2);
+ vcl_size_t row = i / col_num;
+ vcl_size_t col = i - row * col_num;
+ vcl_size_t new_pos = col * row_num + row;
+
+ if (i < new_pos)
+ {
+ std::complex<NumericT> val = input_complex[i];
+ input_complex[i] = input_complex[new_pos];
+ input_complex[new_pos] = val;
+ }
+ }
+ viennacl::linalg::host_based::detail::fft::copy_to_vector(&input_complex[0], data, size);
+
+}
+
+/**
+ * @brief Transposes a matrix into a second matrix
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & input,
+ viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & output)
+{
+
+ vcl_size_t row_num = input.internal_size1() / 2;
+ vcl_size_t col_num = input.internal_size2() / 2;
+ vcl_size_t size = row_num * col_num;
+
+ NumericT const * data_A = detail::extract_raw_pointer<NumericT>(input);
+ NumericT * data_B = detail::extract_raw_pointer<NumericT>(output);
+
+ std::vector<std::complex<NumericT> > input_complex(size);
+ viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], data_A, size);
+
+ std::vector<std::complex<NumericT> > output_complex(size);
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i2 = 0; i2 < long(size); i2++)
+ {
+ vcl_size_t i = vcl_size_t(i2);
+ vcl_size_t row = i / col_num;
+ vcl_size_t col = i % col_num;
+ vcl_size_t new_pos = col * row_num + row;
+ output_complex[new_pos] = input_complex[i];
+ }
+ viennacl::linalg::host_based::detail::fft::copy_to_vector(&output_complex[0], data_B, size);
+}
+
+/**
+ * @brief Creates a complex vector from a real vector (even elements (2k) hold the real parts, odd elements (2k+1) the imaginary parts)
+ */
+template<typename NumericT>
+void real_to_complex(viennacl::vector_base<NumericT> const & in,
+ viennacl::vector_base<NumericT> & out, vcl_size_t size)
+{
+ NumericT const * data_in = detail::extract_raw_pointer<NumericT>(in);
+ NumericT * data_out = detail::extract_raw_pointer<NumericT>(out);
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i2 = 0; i2 < long(size); i2++)
+ {
+ vcl_size_t i = static_cast<vcl_size_t>(i2);
+ data_out[2*i ] = data_in[i];
+ data_out[2*i+1] = NumericT(0);
+ }
+}
+
+/**
+ * @brief Creates a real vector from a complex vector by extracting the real parts (even elements (2k) = real part, odd elements (2k+1) = imaginary part)
+ */
+template<typename NumericT>
+void complex_to_real(viennacl::vector_base<NumericT> const & in,
+ viennacl::vector_base<NumericT> & out, vcl_size_t size)
+{
+ NumericT const * data_in = detail::extract_raw_pointer<NumericT>(in);
+ NumericT * data_out = detail::extract_raw_pointer<NumericT>(out);
+
+#ifdef VIENNACL_WITH_OPENMP
+#pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < long(size); i++)
+ data_out[i] = data_in[2*i];
+}
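+
+/* Worked example (illustrative): real_to_complex turns {1, 2, 3} into the interleaved
+   vector {1, 0, 2, 0, 3, 0}; complex_to_real extracts the real parts again, mapping
+   {1, 0, 2, 0, 3, 0} back to {1, 2, 3}.
+*/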
+
+/**
+ * @brief Reverses the vector in place
+ */
+template<typename NumericT>
+void reverse(viennacl::vector_base<NumericT> & in)
+{
+ vcl_size_t size = in.size();
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i2 = 0; i2 < long(size / 2); i2++) // swap only the first half; looping over the full size would swap every pair back again
+ {
+ vcl_size_t i = vcl_size_t(i2);
+ NumericT val1 = in[i];
+ NumericT val2 = in[size - i - 1];
+ in[i] = val2;
+ in[size - i - 1] = val1;
+ }
+}
+
+} //namespace host_based
+} //namespace linalg
+} //namespace viennacl
+
+#endif /* FFT_OPERATIONS_HPP_ */
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/ilu_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/ilu_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/ilu_operations.hpp
new file mode 100644
index 0000000..62c885a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/ilu_operations.hpp
@@ -0,0 +1,672 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_ILU_OPERATIONS_HPP_
+#define VIENNACL_LINALG_HOST_BASED_ILU_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/ilu_operations.hpp
+ @brief Implementations of specialized routines for the Chow-Patel parallel ILU preconditioner using the host (OpenMP)
+*/
+
+#include <cmath>
+#include <algorithm> //for std::max and std::min
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/linalg/host_based/common.hpp"
+#include "viennacl/linalg/vector_operations.hpp"
+#include "viennacl/traits/stride.hpp"
+
+
+// Minimum vector size for using OpenMP on vector operations:
+#ifndef VIENNACL_OPENMP_ILU_MIN_SIZE
+ #define VIENNACL_OPENMP_ILU_MIN_SIZE 5000
+#endif
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+
+template<typename NumericT>
+void extract_L(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & L)
+{
+ // L is known to have correct dimensions
+
+ unsigned int const *A_row_buffer = detail::extract_raw_pointer<unsigned int>(A.handle1());
+ unsigned int const *A_col_buffer = detail::extract_raw_pointer<unsigned int>(A.handle2());
+ NumericT const *A_elements = detail::extract_raw_pointer<NumericT>(A.handle());
+
+ unsigned int *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
+
+ //
+ // Step 1: Count elements in L
+ //
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A.size1()); ++row)
+ {
+ unsigned int col_begin = A_row_buffer[row];
+ unsigned int col_end = A_row_buffer[row+1];
+
+ for (unsigned int j = col_begin; j < col_end; ++j)
+ {
+ unsigned int col = A_col_buffer[j];
+ if (long(col) <= row)
+ ++L_row_buffer[row];
+ }
+ }
+
+ //
+ // Step 2: Exclusive scan on row_buffer arrays to get correct starting indices
+ //
+ viennacl::vector_base<unsigned int> wrapped_L_row_buffer(L.handle1(), L.size1() + 1, 0, 1);
+ viennacl::linalg::exclusive_scan(wrapped_L_row_buffer);
+ L.reserve(wrapped_L_row_buffer[L.size1()], false);
+
+ unsigned int *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());
+ NumericT *L_elements = detail::extract_raw_pointer<NumericT>(L.handle());
+
+ //
+ // Step 3: Write entries:
+ //
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A.size1()); ++row)
+ {
+ unsigned int col_begin = A_row_buffer[row];
+ unsigned int col_end = A_row_buffer[row+1];
+
+ unsigned int index_L = L_row_buffer[row];
+ for (unsigned int j = col_begin; j < col_end; ++j)
+ {
+ unsigned int col = A_col_buffer[j];
+ NumericT value = A_elements[j];
+
+ if (long(col) <= row)
+ {
+ L_col_buffer[index_L] = col;
+ L_elements[index_L] = value;
+ ++index_L;
+ }
+ }
+ }
+
+} // extract_L
+
+
+/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates the values from A in L accordingly. */
+template<typename NumericT>
+void icc_scale(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & L)
+{
+ viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A));
+
+ unsigned int const *A_row_buffer = detail::extract_raw_pointer<unsigned int>(A.handle1());
+ unsigned int const *A_col_buffer = detail::extract_raw_pointer<unsigned int>(A.handle2());
+ NumericT const *A_elements = detail::extract_raw_pointer<NumericT>(A.handle());
+
+ NumericT *D_elements = detail::extract_raw_pointer<NumericT>(D.handle());
+
+ //
+ // Step 1: Determine D
+ //
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A.size1()); ++row)
+ {
+ unsigned int col_begin = A_row_buffer[row];
+ unsigned int col_end = A_row_buffer[row+1];
+
+ for (unsigned int j = col_begin; j < col_end; ++j)
+ {
+ unsigned int col = A_col_buffer[j];
+ if (row == col)
+ {
+ D_elements[row] = NumericT(1) / std::sqrt(std::fabs(A_elements[j]));
+ break;
+ }
+ }
+ }
+
+ //
+ // Step 2: Scale values in L:
+ //
+ unsigned int const *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
+ unsigned int const *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());
+ NumericT *L_elements = detail::extract_raw_pointer<NumericT>(L.handle());
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A.size1()); ++row)
+ {
+ unsigned int col_begin = L_row_buffer[row];
+ unsigned int col_end = L_row_buffer[row+1];
+
+ NumericT D_row = D_elements[row];
+
+ for (unsigned int j = col_begin; j < col_end; ++j)
+ L_elements[j] *= D_row * D_elements[L_col_buffer[j]];
+ }
+
+ L.generate_row_block_information();
+}
+
+
+
+/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ICC using OpenMP (cf. Algorithm 3 in paper, but for L rather than U) */
+template<typename NumericT>
+void icc_chow_patel_sweep(compressed_matrix<NumericT> & L,
+ vector<NumericT> & aij_L)
+{
+ unsigned int const *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
+ unsigned int const *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());
+ NumericT *L_elements = detail::extract_raw_pointer<NumericT>(L.handle());
+
+ NumericT *aij_ptr = detail::extract_raw_pointer<NumericT>(aij_L.handle());
+
+ // temporary workspace
+ NumericT *L_backup = (NumericT *)malloc(sizeof(NumericT) * L.nnz());
+
+ // backup:
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (L.nnz() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(L.nnz()); ++i)
+ L_backup[i] = L_elements[i];
+
+
+ // sweep
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (L.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(L.size1()); ++row)
+ {
+ //
+ // update L:
+ //
+ unsigned int row_Li_start = L_row_buffer[row];
+ unsigned int row_Li_end = L_row_buffer[row + 1];
+
+ for (unsigned int i = row_Li_start; i < row_Li_end; ++i)
+ {
+ unsigned int col = L_col_buffer[i];
+
+ unsigned int row_Lj_start = L_row_buffer[col];
+ unsigned int row_Lj_end = L_row_buffer[col+1];
+
+ // compute \sum_{k=1}^{j-1} l_ik l_jk
+ unsigned int index_Lj = row_Lj_start;
+ unsigned int col_Lj = L_col_buffer[index_Lj];
+ NumericT s = aij_ptr[i];
+ for (unsigned int index_Li = row_Li_start; index_Li < i; ++index_Li)
+ {
+ unsigned int col_Li = L_col_buffer[index_Li];
+
+ // find element in row j
+ while (col_Lj < col_Li)
+ {
+ ++index_Lj;
+ col_Lj = L_col_buffer[index_Lj];
+ }
+
+ if (col_Lj == col_Li)
+ s -= L_backup[index_Li] * L_backup[index_Lj];
+ }
+
+ if (row != col)
+ L_elements[i] = s / L_backup[row_Lj_end - 1]; // diagonal element is last in row!
+ else
+ L_elements[i] = std::sqrt(s);
+ }
+ }
+
+ free(L_backup);
+}
+
+
+
+//////////////////////// ILU ////////////////////////
+
+template<typename NumericT>
+void extract_LU(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & L,
+ compressed_matrix<NumericT> & U)
+{
+ // L and U are known to have correct dimensions
+
+ unsigned int const *A_row_buffer = detail::extract_raw_pointer<unsigned int>(A.handle1());
+ unsigned int const *A_col_buffer = detail::extract_raw_pointer<unsigned int>(A.handle2());
+ NumericT const *A_elements = detail::extract_raw_pointer<NumericT>(A.handle());
+
+ unsigned int *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
+ unsigned int *U_row_buffer = detail::extract_raw_pointer<unsigned int>(U.handle1());
+
+ //
+ // Step 1: Count elements in L and U
+ //
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A.size1()); ++row)
+ {
+ unsigned int col_begin = A_row_buffer[row];
+ unsigned int col_end = A_row_buffer[row+1];
+
+ for (unsigned int j = col_begin; j < col_end; ++j)
+ {
+ unsigned int col = A_col_buffer[j];
+ if (long(col) <= row)
+ ++L_row_buffer[row];
+ if (long(col) >= row)
+ ++U_row_buffer[row];
+ }
+ }
+
+ //
+ // Step 2: Exclusive scan on row_buffer arrays to get correct starting indices
+ //
+ viennacl::vector_base<unsigned int> wrapped_L_row_buffer(L.handle1(), L.size1() + 1, 0, 1);
+ viennacl::linalg::exclusive_scan(wrapped_L_row_buffer);
+ L.reserve(wrapped_L_row_buffer[L.size1()], false);
+
+ viennacl::vector_base<unsigned int> wrapped_U_row_buffer(U.handle1(), U.size1() + 1, 0, 1);
+ viennacl::linalg::exclusive_scan(wrapped_U_row_buffer);
+ U.reserve(wrapped_U_row_buffer[U.size1()], false);
+
+ unsigned int *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());
+ NumericT *L_elements = detail::extract_raw_pointer<NumericT>(L.handle());
+
+ unsigned int *U_col_buffer = detail::extract_raw_pointer<unsigned int>(U.handle2());
+ NumericT *U_elements = detail::extract_raw_pointer<NumericT>(U.handle());
+
+ //
+ // Step 3: Write entries:
+ //
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A.size1()); ++row)
+ {
+ unsigned int col_begin = A_row_buffer[row];
+ unsigned int col_end = A_row_buffer[row+1];
+
+ unsigned int index_L = L_row_buffer[row];
+ unsigned int index_U = U_row_buffer[row];
+ for (unsigned int j = col_begin; j < col_end; ++j)
+ {
+ unsigned int col = A_col_buffer[j];
+ NumericT value = A_elements[j];
+
+ if (long(col) <= row)
+ {
+ L_col_buffer[index_L] = col;
+ L_elements[index_L] = value;
+ ++index_L;
+ }
+
+ if (long(col) >= row)
+ {
+ U_col_buffer[index_U] = col;
+ U_elements[index_U] = value;
+ ++index_U;
+ }
+ }
+ }
+
+} // extract_LU
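+
+/* Worked example (illustrative): for the CSR matrix
+     A = [4 1 0]
+         [2 5 1]
+         [0 3 6]
+   extract_LU() produces L with rows {4}, {2, 5}, {3, 6} (lower triangle including the
+   diagonal) and U with rows {4, 1}, {5, 1}, {6} (upper triangle including the diagonal);
+   the diagonal entries deliberately appear in both factors.
+*/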
+
+
+
+/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L and U accordingly. */
+template<typename NumericT>
+void ilu_scale(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & L,
+ compressed_matrix<NumericT> & U)
+{
+ viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A));
+
+ unsigned int const *A_row_buffer = detail::extract_raw_pointer<unsigned int>(A.handle1());
+ unsigned int const *A_col_buffer = detail::extract_raw_pointer<unsigned int>(A.handle2());
+ NumericT const *A_elements = detail::extract_raw_pointer<NumericT>(A.handle());
+
+ NumericT *D_elements = detail::extract_raw_pointer<NumericT>(D.handle());
+
+ //
+ // Step 1: Determine D
+ //
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A.size1()); ++row)
+ {
+ unsigned int col_begin = A_row_buffer[row];
+ unsigned int col_end = A_row_buffer[row+1];
+
+ for (unsigned int j = col_begin; j < col_end; ++j)
+ {
+ unsigned int col = A_col_buffer[j];
+ if (row == col)
+ {
+ D_elements[row] = NumericT(1) / std::sqrt(std::fabs(A_elements[j]));
+ break;
+ }
+ }
+ }
+
+ //
+ // Step 2: Scale values in L:
+ //
+ unsigned int const *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
+ unsigned int const *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());
+ NumericT *L_elements = detail::extract_raw_pointer<NumericT>(L.handle());
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A.size1()); ++row)
+ {
+ unsigned int col_begin = L_row_buffer[row];
+ unsigned int col_end = L_row_buffer[row+1];
+
+ NumericT D_row = D_elements[row];
+
+ for (unsigned int j = col_begin; j < col_end; ++j)
+ L_elements[j] *= D_row * D_elements[L_col_buffer[j]];
+ }
+
+ //
+ // Step 3: Scale values in U:
+ //
+ unsigned int const *U_row_buffer = detail::extract_raw_pointer<unsigned int>(U.handle1());
+ unsigned int const *U_col_buffer = detail::extract_raw_pointer<unsigned int>(U.handle2());
+ NumericT *U_elements = detail::extract_raw_pointer<NumericT>(U.handle());
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A.size1()); ++row)
+ {
+ unsigned int col_begin = U_row_buffer[row];
+ unsigned int col_end = U_row_buffer[row+1];
+
+ NumericT D_row = D_elements[row];
+
+ for (unsigned int j = col_begin; j < col_end; ++j)
+ U_elements[j] *= D_row * D_elements[U_col_buffer[j]];
+ }
+
+ L.generate_row_block_information();
+ // Note: block information for U will be generated after transposition
+
+}
+
+template<typename NumericT>
+void ilu_transpose(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & B)
+{
+ NumericT const * A_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
+ unsigned int const * A_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+ unsigned int const * A_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+
+ // initialize datastructures for B:
+ B = compressed_matrix<NumericT>(A.size2(), A.size1(), A.nnz(), viennacl::traits::context(A));
+
+ NumericT * B_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(B.handle());
+ unsigned int * B_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(B.handle1());
+ unsigned int * B_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(B.handle2());
+
+ // prepare uninitialized B_row_buffer:
+ for (std::size_t i = 0; i < B.size1(); ++i)
+ B_row_buffer[i] = 0;
+
+ //
+ // Stage 1: Compute pattern for B
+ //
+ for (std::size_t row = 0; row < A.size1(); ++row)
+ {
+ unsigned int row_start = A_row_buffer[row];
+ unsigned int row_stop = A_row_buffer[row+1];
+
+ for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index)
+ B_row_buffer[A_col_buffer[nnz_index]] += 1;
+ }
+
+ // Bring row-start array in place using exclusive-scan:
+ unsigned int offset = B_row_buffer[0];
+ B_row_buffer[0] = 0;
+ for (std::size_t row = 1; row < B.size1(); ++row)
+ {
+ unsigned int tmp = B_row_buffer[row];
+ B_row_buffer[row] = offset;
+ offset += tmp;
+ }
+ B_row_buffer[B.size1()] = offset;
+
+ //
+ // Stage 2: Fill with data
+ //
+
+ std::vector<unsigned int> B_row_offsets(B.size1()); //number of elements already written per row
+
+ for (unsigned int row = 0; row < static_cast<unsigned int>(A.size1()); ++row)
+ {
+ //std::cout << "Row " << row << ": ";
+ unsigned int row_start = A_row_buffer[row];
+ unsigned int row_stop = A_row_buffer[row+1];
+
+ for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index)
+ {
+ unsigned int col_in_A = A_col_buffer[nnz_index];
+ unsigned int B_nnz_index = B_row_buffer[col_in_A] + B_row_offsets[col_in_A];
+ B_col_buffer[B_nnz_index] = row;
+ B_elements[B_nnz_index] = A_elements[nnz_index];
+ ++B_row_offsets[col_in_A];
+ //B_temp.at(A_col_buffer[nnz_index])[row] = A_elements[nnz_index];
+ }
+ }
+
+ // Step 3: Make datastructure consistent (row blocks!)
+ B.generate_row_block_information();
+}
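+
+/* Worked example (illustrative): for a 3x3 CSR matrix with rows {a00, a01}, {a11},
+   {a20, a22}, stage 1 counts 2, 2 and 1 entries per column; the exclusive scan turns
+   this into the row pointer {0, 2, 4, 5} of B, and stage 2 then writes the rows of B
+   as {a00, a20}, {a01, a11}, {a22}.
+*/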
+
+
+
+/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ILU using OpenMP (cf. Algorithm 2 in paper) */
+template<typename NumericT>
+void ilu_chow_patel_sweep(compressed_matrix<NumericT> & L,
+ vector<NumericT> const & aij_L,
+ compressed_matrix<NumericT> & U_trans,
+ vector<NumericT> const & aij_U_trans)
+{
+ unsigned int const *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1());
+ unsigned int const *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2());
+ NumericT *L_elements = detail::extract_raw_pointer<NumericT>(L.handle());
+
+ NumericT const *aij_L_ptr = detail::extract_raw_pointer<NumericT>(aij_L.handle());
+
+ unsigned int const *U_row_buffer = detail::extract_raw_pointer<unsigned int>(U_trans.handle1());
+ unsigned int const *U_col_buffer = detail::extract_raw_pointer<unsigned int>(U_trans.handle2());
+ NumericT *U_elements = detail::extract_raw_pointer<NumericT>(U_trans.handle());
+
+ NumericT const *aij_U_trans_ptr = detail::extract_raw_pointer<NumericT>(aij_U_trans.handle());
+
+ // temporary workspace
+ NumericT *L_backup = new NumericT[L.nnz()];
+ NumericT *U_backup = new NumericT[U_trans.nnz()];
+
+ // backup:
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (L.nnz() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(L.nnz()); ++i)
+ L_backup[i] = L_elements[i];
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (U_trans.nnz() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(U_trans.nnz()); ++i)
+ U_backup[i] = U_elements[i];
+
+ // sweep
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (L.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(L.size1()); ++row)
+ {
+ //
+ // update L:
+ //
+ unsigned int row_L_start = L_row_buffer[row];
+ unsigned int row_L_end = L_row_buffer[row + 1];
+
+ for (unsigned int j = row_L_start; j < row_L_end; ++j)
+ {
+ unsigned int col = L_col_buffer[j];
+
+ if (col == row)
+ continue;
+
+ unsigned int row_U_start = U_row_buffer[col];
+ unsigned int row_U_end = U_row_buffer[col + 1];
+
+ // compute \sum_{k=1}^{j-1} l_ik u_kj
+ unsigned int index_U = row_U_start;
+ unsigned int col_U = (index_U < row_U_end) ? U_col_buffer[index_U] : static_cast<unsigned int>(U_trans.size2());
+ NumericT sum = 0;
+ for (unsigned int k = row_L_start; k < j; ++k)
+ {
+ unsigned int col_L = L_col_buffer[k];
+
+ // find element in U
+ while (col_U < col_L)
+ {
+ ++index_U;
+ col_U = U_col_buffer[index_U];
+ }
+
+ if (col_U == col_L)
+ sum += L_backup[k] * U_backup[index_U];
+ }
+
+ // update l_ij:
+ assert(U_col_buffer[row_U_end - 1] == col && bool("Accessing U element which is not a diagonal element!"));
+ L_elements[j] = (aij_L_ptr[j] - sum) / U_backup[row_U_end - 1]; // diagonal element is last entry in U
+ }
+
+
+ //
+ // update U:
+ //
+ unsigned int row_U_start = U_row_buffer[row];
+ unsigned int row_U_end = U_row_buffer[row + 1];
+ for (unsigned int j = row_U_start; j < row_U_end; ++j)
+ {
+ unsigned int col = U_col_buffer[j];
+
+ row_L_start = L_row_buffer[col];
+ row_L_end = L_row_buffer[col + 1];
+
+ // compute \sum_{k=1}^{j-1} l_ik u_kj
+ unsigned int index_L = row_L_start;
+ unsigned int col_L = (index_L < row_L_end) ? L_col_buffer[index_L] : static_cast<unsigned int>(L.size1());
+ NumericT sum = 0;
+ for (unsigned int k = row_U_start; k < j; ++k)
+ {
+ unsigned int col_U = U_col_buffer[k];
+
+ // find element in L
+ while (col_L < col_U)
+ {
+ ++index_L;
+ col_L = L_col_buffer[index_L];
+ }
+
+ if (col_U == col_L)
+ sum += L_backup[index_L] * U_backup[k];
+ }
+
+ // update u_ij:
+ U_elements[j] = aij_U_trans_ptr[j] - sum;
+ }
+ }
+
+ delete[] L_backup;
+ delete[] U_backup;
+}
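+
+/* Usage sketch (illustrative): the nonlinear sweep is usually applied a small, fixed
+   number of times (e.g. three) after L, U^T and the corresponding a_ij vectors have
+   been set up via extract_LU(), ilu_scale() and ilu_transpose().
+
+     for (unsigned int sweep = 0; sweep < 3; ++sweep)
+       viennacl::linalg::host_based::ilu_chow_patel_sweep(L, aij_L, U_trans, aij_U_trans);
+*/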
+
+
+template<typename NumericT>
+void ilu_form_neumann_matrix(compressed_matrix<NumericT> & R,
+ vector<NumericT> & diag_R)
+{
+ unsigned int *R_row_buffer = detail::extract_raw_pointer<unsigned int>(R.handle1());
+ unsigned int *R_col_buffer = detail::extract_raw_pointer<unsigned int>(R.handle2());
+ NumericT *R_elements = detail::extract_raw_pointer<NumericT>(R.handle());
+
+ NumericT *diag_R_ptr = detail::extract_raw_pointer<NumericT>(diag_R.handle());
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (R.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(R.size1()); ++row)
+ {
+ unsigned int col_begin = R_row_buffer[row];
+ unsigned int col_end = R_row_buffer[row+1];
+
+ // part 1: extract diagonal entry
+ NumericT diag = 0;
+ for (unsigned int j = col_begin; j < col_end; ++j)
+ {
+ unsigned int col = R_col_buffer[j];
+ if (col == row)
+ {
+ diag = R_elements[j];
+ R_elements[j] = 0; // (I - D^{-1}R)
+ break;
+ }
+ }
+ diag_R_ptr[row] = diag;
+
+ assert((diag > 0 || diag < 0) && bool("Zero diagonal detected!"));
+
+ // part2: scale
+ for (unsigned int j = col_begin; j < col_end; ++j)
+ R_elements[j] /= -diag;
+ }
+
+ //std::cout << "diag_R: " << diag_R << std::endl;
+}
+
+} //namespace host_based
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/amg/amg_base.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/amg/amg_base.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/amg/amg_base.hpp
new file mode 100644
index 0000000..8361308
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/amg/amg_base.hpp
@@ -0,0 +1,208 @@
+#ifndef VIENNACL_LINALG_DETAIL_AMG_AMG_BASE_HPP_
+#define VIENNACL_LINALG_DETAIL_AMG_AMG_BASE_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file amg_base.hpp
+ @brief Helper classes and functions for the AMG preconditioner. Experimental.
+
+ AMG code contributed by Markus Wagner
+*/
+
+#include <cmath>
+#include <set>
+#include <list>
+#include <stdexcept>
+#include <algorithm>
+
+#include <map>
+#ifdef VIENNACL_WITH_OPENMP
+#include <omp.h>
+#endif
+
+#include "viennacl/context.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief Enumeration of coarsening methods for algebraic multigrid. */
+enum amg_coarsening_method
+{
+ AMG_COARSENING_METHOD_ONEPASS = 1,
+ AMG_COARSENING_METHOD_AGGREGATION,
+ AMG_COARSENING_METHOD_MIS2_AGGREGATION
+};
+
+/** @brief Enumeration of interpolation methods for algebraic multigrid. */
+enum amg_interpolation_method
+{
+ AMG_INTERPOLATION_METHOD_DIRECT = 1,
+ AMG_INTERPOLATION_METHOD_AGGREGATION,
+ AMG_INTERPOLATION_METHOD_SMOOTHED_AGGREGATION
+};
+
+
+/** @brief A tag for algebraic multigrid (AMG). Used to transport information from the user to the implementation.
+*/
+class amg_tag
+{
+public:
+ /** @brief The constructor, setting default values for the various parameters.
+ *
+ * Default coarsening routine: Aggregation based on maximal independent sets of distance 2 (MIS-2)
+ * Default interpolation routine: Smoothed aggregation
+ * Default threshold for strong connections: 0.1 (customizations are recommended!)
+ * Default weight for Jacobi smoother: 1.0
+ * Default number of pre-smooth operations: 2
+ * Default number of post-smooth operations: 2
+ * Default number of coarse levels: 0 (this indicates that as many coarse levels as needed are constructed until the cutoff is reached)
+ * Default coarse grid size for direct solver (coarsening cutoff): 50
+ */
+ amg_tag()
+ : coarsening_method_(AMG_COARSENING_METHOD_MIS2_AGGREGATION), interpolation_method_(AMG_INTERPOLATION_METHOD_AGGREGATION),
+ strong_connection_threshold_(0.1), jacobi_weight_(1.0),
+ presmooth_steps_(2), postsmooth_steps_(2),
+ coarse_levels_(0), coarse_cutoff_(50) {}
+
+ // Getter-/Setter-Functions
+ /** @brief Sets the strategy used for constructing coarse grids */
+ void set_coarsening_method(amg_coarsening_method s) { coarsening_method_ = s; }
+ /** @brief Returns the current coarsening strategy */
+ amg_coarsening_method get_coarsening_method() const { return coarsening_method_; }
+
+ /** @brief Sets the interpolation method to the provided method */
+ void set_interpolation_method(amg_interpolation_method interpol) { interpolation_method_ = interpol; }
+ /** @brief Returns the current interpolation method */
+ amg_interpolation_method get_interpolation_method() const { return interpolation_method_; }
+
+ /** @brief Sets the strong connection threshold. Customizations by the user essential for best results!
+ *
+ * With classical interpolation, a connection is considered strong if |a_ij| >= threshold * max_k(|a_ik|)
+ * Strength of connection currently ignored for aggregation-based coarsening (to be added in the future).
+ */
+ void set_strong_connection_threshold(double threshold) { if (threshold > 0) strong_connection_threshold_ = threshold; }
+ /** @brief Returns the strong connection threshold parameter.
+ *
+ * @see set_strong_connection_threshold() for an explanation of the threshold parameter
+ */
+ double get_strong_connection_threshold() const { return strong_connection_threshold_; }
+
+ /** @brief Sets the weight (damping) for the Jacobi smoother.
+ *
+ * The optimal value depends on the problem at hand. Values of 0.67 or 1.0 are usually a good starting point for further experiments.
+ */
+ void set_jacobi_weight(double w) { if (w > 0) jacobi_weight_ = w; }
+ /** @brief Returns the Jacobi smoother weight (damping). */
+ double get_jacobi_weight() const { return jacobi_weight_; }
+
+ /** @brief Sets the number of smoother applications on the fine level before restriction to the coarser level. */
+ void set_presmooth_steps(vcl_size_t steps) { presmooth_steps_ = steps; }
+ /** @brief Returns the number of smoother applications on the fine level before restriction to the coarser level. */
+ vcl_size_t get_presmooth_steps() const { return presmooth_steps_; }
+
+ /** @brief Sets the number of smoother applications on the coarse level before interpolation to the finer level. */
+ void set_postsmooth_steps(vcl_size_t steps) { postsmooth_steps_ = steps; }
+ /** @brief Returns the number of smoother applications on the coarse level before interpolation to the finer level. */
+ vcl_size_t get_postsmooth_steps() const { return postsmooth_steps_; }
+
+ /** @brief Sets the number of coarse levels. If set to zero, then coarse levels are constructed until the cutoff size is reached. */
+ void set_coarse_levels(vcl_size_t levels) { coarse_levels_ = levels; }
+ /** @brief Returns the number of coarse levels. If zero, then coarse levels are constructed until the cutoff size is reached. */
+ vcl_size_t get_coarse_levels() const { return coarse_levels_; }
+
+ /** @brief Sets the coarse grid size for which the recursive multigrid scheme is stopped and a direct solver is used. */
+ void set_coarsening_cutoff(vcl_size_t size) { coarse_cutoff_ = size; }
+ /** @brief Returns the coarse grid size for which the recursive multigrid scheme is stopped and a direct solver is used. */
+ vcl_size_t get_coarsening_cutoff() const { return coarse_cutoff_; }
+
+ /** @brief Sets the ViennaCL context for the setup stage. Set this to a host context if you want to run the setup on the host.
+ *
+ * Set the ViennaCL context for the solver application via set_target_context().
+ * Target and setup context can be different.
+ */
+ void set_setup_context(viennacl::context ctx) { setup_ctx_ = ctx; }
+ /** @brief Returns the ViennaCL context for the preconditioner setup. */
+ viennacl::context const & get_setup_context() const { return setup_ctx_; }
+
+ /** @brief Sets the ViennaCL context for the solver cycle stage (i.e. preconditioner applications).
+ *
+ * Since the cycle stage easily benefits from accelerators, you usually want to set this to a CUDA or OpenCL-enabled context.
+ */
+ void set_target_context(viennacl::context ctx) { target_ctx_ = ctx; }
+ /** @brief Returns the ViennaCL context for the solver cycle stage (i.e. preconditioner applications). */
+ viennacl::context const & get_target_context() const { return target_ctx_; }
+
+private:
+ amg_coarsening_method coarsening_method_;
+ amg_interpolation_method interpolation_method_;
+ double strong_connection_threshold_, jacobi_weight_;
+ vcl_size_t presmooth_steps_, postsmooth_steps_, coarse_levels_, coarse_cutoff_;
+ viennacl::context setup_ctx_, target_ctx_;
+};
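+
+/* Usage sketch (illustrative; the parameter values below are arbitrary example choices,
+   not recommendations):
+
+     viennacl::linalg::amg_tag tag;
+     tag.set_coarsening_method(viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION);
+     tag.set_interpolation_method(viennacl::linalg::AMG_INTERPOLATION_METHOD_SMOOTHED_AGGREGATION);
+     tag.set_strong_connection_threshold(0.25);
+     tag.set_jacobi_weight(0.67);
+     tag.set_presmooth_steps(1);
+     tag.set_postsmooth_steps(1);
+     tag.set_coarsening_cutoff(100);
+*/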
+
+
+namespace detail
+{
+namespace amg
+{
+
+
+ struct amg_level_context
+ {
+ void resize(vcl_size_t num_points, vcl_size_t max_nnz)
+ {
+ influence_jumper_.resize(num_points + 1, false);
+ influence_ids_.resize(max_nnz, false);
+ influence_values_.resize(num_points, false);
+ point_types_.resize(num_points, false);
+ coarse_id_.resize(num_points, false);
+ }
+
+ void switch_context(viennacl::context ctx)
+ {
+ influence_jumper_.switch_memory_context(ctx);
+ influence_ids_.switch_memory_context(ctx);
+ influence_values_.switch_memory_context(ctx);
+ point_types_.switch_memory_context(ctx);
+ coarse_id_.switch_memory_context(ctx);
+ }
+
+ enum
+ {
+ POINT_TYPE_UNDECIDED = 0,
+ POINT_TYPE_COARSE,
+ POINT_TYPE_FINE
+ } amg_point_types;
+
+ viennacl::vector<unsigned int> influence_jumper_; // similar to row_buffer for CSR matrices
+ viennacl::vector<unsigned int> influence_ids_; // IDs of influencing points
+ viennacl::vector<unsigned int> influence_values_; // Influence measure for each point
+ viennacl::vector<unsigned int> point_types_; // 0: undecided, 1: coarse point, 2: fine point (cf. enum above)
+ viennacl::vector<unsigned int> coarse_id_; // coarse ID used on the next level. Only valid for coarse points. Fine points may (ab)use their entry for something else.
+ unsigned int num_coarse_;
+ };
+
+
+} //namespace amg
+} //namespace detail
+} //namespace linalg
+} //namespace viennacl
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_kernel_calls.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_kernel_calls.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_kernel_calls.hpp
new file mode 100644
index 0000000..8308f77
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_kernel_calls.hpp
@@ -0,0 +1,191 @@
+#ifndef VIENNACL_LINALG_DETAIL_BISECT_KERNEL_CALLS_HPP_
+#define VIENNACL_LINALG_DETAIL_BISECT_KERNEL_CALLS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/bisect/bisect_kernel_calls.hpp
+ @brief Kernel calls for the bisection algorithm
+
+ Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+ the creation of derivative works is allowed by including the following statement:
+ "This software contains source code provided by NVIDIA Corporation."
+*/
+
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+
+#include "viennacl/linalg/detail/bisect/structs.hpp"
+#ifdef VIENNACL_WITH_OPENCL
+ #include "viennacl/linalg/opencl/bisect_kernel_calls.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+ #include "viennacl/linalg/cuda/bisect_kernel_calls.hpp"
+#endif
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+ template<typename NumericT>
+ void bisectSmall(const InputData<NumericT> &input, ResultDataSmall<NumericT> &result,
+ const unsigned int mat_size,
+ const NumericT lg, const NumericT ug,
+ const NumericT precision)
+ {
+ switch (viennacl::traits::handle(input.g_a).get_active_handle_id())
+ {
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::bisectSmall(input, result,
+ mat_size,
+ lg,ug,
+ precision);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::bisectSmall(input, result,
+ mat_size,
+ lg,ug,
+ precision);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+ }
+
+
+
+
+ template<typename NumericT>
+ void bisectLarge(const InputData<NumericT> &input, ResultDataLarge<NumericT> &result,
+ const unsigned int mat_size,
+ const NumericT lg, const NumericT ug,
+ const NumericT precision)
+ {
+ switch (viennacl::traits::handle(input.g_a).get_active_handle_id())
+ {
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::bisectLarge(input, result,
+ mat_size,
+ lg,ug,
+ precision);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::bisectLarge(input, result,
+ mat_size,
+ lg,ug,
+ precision);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+ }
+
+
+
+
+
+ template<typename NumericT>
+ void bisectLarge_OneIntervals(const InputData<NumericT> &input, ResultDataLarge<NumericT> &result,
+ const unsigned int mat_size,
+ const NumericT precision)
+ {
+ switch (viennacl::traits::handle(input.g_a).get_active_handle_id())
+ {
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::bisectLargeOneIntervals(input, result,
+ mat_size,
+ precision);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::bisectLarge_OneIntervals(input, result,
+ mat_size,
+ precision);
+
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+ }
+
+
+
+
+ template<typename NumericT>
+ void bisectLarge_MultIntervals(const InputData<NumericT> &input, ResultDataLarge<NumericT> &result,
+ const unsigned int mat_size,
+ const NumericT precision)
+ {
+ switch (viennacl::traits::handle(input.g_a).get_active_handle_id())
+ {
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::bisectLargeMultIntervals(input, result,
+ mat_size,
+ precision);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::bisectLarge_MultIntervals(input, result,
+ mat_size,
+ precision);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+ }
+} // namespace detail
+} // namespace linalg
+} //namespace viennacl
+
+
+#endif
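All four wrappers in this file follow the same pattern: inspect the active memory handle of the input data and forward to the OpenCL or CUDA backend. A stripped-down sketch of that dispatch idiom (not part of the patch; the kernel launches are hypothetical placeholders, and it assumes an OpenCL or CUDA backend is enabled):

    #include "viennacl/vector.hpp"
    #include "viennacl/traits/handle.hpp"

    // Sketch of the backend-dispatch idiom used by bisectSmall()/bisectLarge() above.
    template<typename NumericT>
    void dispatch_sketch(viennacl::vector<NumericT> const & x)
    {
      switch (viennacl::traits::handle(x).get_active_handle_id())
      {
    #ifdef VIENNACL_WITH_OPENCL
        case viennacl::OPENCL_MEMORY: /* launch OpenCL kernel here (placeholder) */ break;
    #endif
    #ifdef VIENNACL_WITH_CUDA
        case viennacl::CUDA_MEMORY:   /* launch CUDA kernel here (placeholder) */   break;
    #endif
        case viennacl::MEMORY_NOT_INITIALIZED:
          throw viennacl::memory_exception("not initialised!");
        default:
          throw viennacl::memory_exception("not implemented");
      }
    }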
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_large.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_large.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_large.hpp
new file mode 100755
index 0000000..337858f
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_large.hpp
@@ -0,0 +1,142 @@
+#ifndef VIENNACL_LINALG_DETAIL_BISECT_BISECT_LARGE_HPP_
+#define VIENNACL_LINALG_DETAIL_BISECT_BISECT_LARGE_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/detail/bisect/bisect_large.hpp
+ @brief Computation of eigenvalues of a large symmetric, tridiagonal matrix
+
+ Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+ the creation of derivative works is allowed by including the following statement:
+ "This software contains source code provided by NVIDIA Corporation."
+*/
+
+
+// includes, system
+#include <iostream>
+#include <iomanip>
+#include <stdlib.h>
+#include <stdio.h>
+
+// includes, project
+#include "viennacl/linalg/detail/bisect/config.hpp"
+#include "viennacl/linalg/detail/bisect/structs.hpp"
+#include "viennacl/linalg/detail/bisect/bisect_kernel_calls.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+
+////////////////////////////////////////////////////////////////////////////////
+//! Run the kernels to compute the eigenvalues for large matrices
+//! @param input handles to input data
+//! @param result handles to result data
+//! @param mat_size matrix size
+//! @param precision desired precision of eigenvalues
+//! @param lg lower limit of Gerschgorin interval
+//! @param ug upper limit of Gerschgorin interval
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+void
+computeEigenvaluesLargeMatrix(InputData<NumericT> &input, ResultDataLarge<NumericT> &result,
+ const unsigned int mat_size,
+ const NumericT lg, const NumericT ug, const NumericT precision)
+{
+
+
+  // First kernel call: decide which intervals are handled by bisectLarge_OneIntervals
+  // and which by bisectLarge_MultIntervals
+ viennacl::linalg::detail::bisectLarge(input, result, mat_size, lg, ug, precision);
+
+ // compute eigenvalues for intervals that contained only one eigenvalue
+ // after the first processing step
+ viennacl::linalg::detail::bisectLarge_OneIntervals(input, result, mat_size, precision);
+
+ // process intervals that contained more than one eigenvalue after
+ // the first processing step
+ viennacl::linalg::detail::bisectLarge_MultIntervals(input, result, mat_size, precision);
+
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Process the result, i.e. obtain the result from the device and perform
+//! simple sanity checks
+//! @param result handles to result data
+//! @param mat_size matrix size
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+bool
+processResultDataLargeMatrix(ResultDataLarge<NumericT> &result,
+ const unsigned int mat_size)
+{
+ bool bCompareResult = true;
+ // copy data from intervals that contained more than one eigenvalue after
+ // the first processing step
+ std::vector<NumericT> lambda_mult(mat_size);
+ viennacl::copy(result.g_lambda_mult, lambda_mult);
+
+ std::vector<unsigned int> pos_mult(mat_size);
+ viennacl::copy(result.g_pos_mult, pos_mult);
+
+ std::vector<unsigned int> blocks_mult_sum(mat_size);
+ viennacl::copy(result.g_blocks_mult_sum, blocks_mult_sum);
+
+ unsigned int num_one_intervals = result.g_num_one;
+ unsigned int sum_blocks_mult = mat_size - num_one_intervals;
+
+
+ // copy data for intervals that contained one eigenvalue after the first
+ // processing step
+ std::vector<NumericT> left_one(mat_size);
+ std::vector<NumericT> right_one(mat_size);
+ std::vector<unsigned int> pos_one(mat_size);
+
+ viennacl::copy(result.g_left_one, left_one);
+ viennacl::copy(result.g_right_one, right_one);
+ viennacl::copy(result.g_pos_one, pos_one);
+
+
+ // singleton intervals generated in the second step
+ for (unsigned int i = 0; i < sum_blocks_mult; ++i)
+ {
+ if (pos_mult[i] != 0)
+ result.std_eigenvalues[pos_mult[i] - 1] = lambda_mult[i];
+
+ else
+ {
+ throw memory_exception("Invalid array index! Are there more than 256 equal eigenvalues?");
+ }
+ }
+
+ // singleton intervals generated in the first step
+ unsigned int index = 0;
+
+ for (unsigned int i = 0; i < num_one_intervals; ++i, ++index)
+ {
+ result.std_eigenvalues[pos_one[i] - 1] = left_one[i];
+ }
+ return bCompareResult;
+}
+} // namespace detail
+} // namespace linalg
+} // namespace viennacl
+#endif // VIENNACL_LINALG_DETAIL_BISECT_BISECT_LARGE_HPP_
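Taken together, the three stages above are driven from host code roughly as follows. This is a sketch only, not part of the patch: the matrix data and precision are placeholders, it assumes an OpenCL or CUDA backend is active, and it uses computeGerschgorin from gerschgorin.hpp further below with lg/ug seeded so its min/max folding works.

    #include <cfloat>
    #include <vector>
    #include "viennacl/linalg/detail/bisect/structs.hpp"
    #include "viennacl/linalg/detail/bisect/gerschgorin.hpp"
    #include "viennacl/linalg/detail/bisect/bisect_large.hpp"

    std::vector<float> eigenvalues_large_sketch()
    {
      const unsigned int n = 1024;                        // placeholder matrix size
      std::vector<float> diag(n, 2.0f), super(n, -1.0f);  // simple tridiagonal test matrix
      super[0] = 0.0f;                                    // first superdiagonal entry is unused

      float lg =  FLT_MAX;                                // seed for the min() folding
      float ug = -FLT_MAX;                                // seed for the max() folding
      viennacl::linalg::detail::computeGerschgorin(diag, super, n, lg, ug);

      viennacl::linalg::detail::InputData<float>       input(diag, super, n);
      viennacl::linalg::detail::ResultDataLarge<float> result(n);

      viennacl::linalg::detail::computeEigenvaluesLargeMatrix(input, result, n, lg, ug, 1e-5f);
      viennacl::linalg::detail::processResultDataLargeMatrix(result, n);

      return result.std_eigenvalues;                      // eigenvalues, indexed by spectral position
    }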
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_small.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_small.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_small.hpp
new file mode 100755
index 0000000..144640b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/bisect_small.hpp
@@ -0,0 +1,96 @@
+#ifndef VIENNACL_LINALG_DETAIL_BISECT_SMALL_HPP_
+#define VIENNACL_LINALG_DETAIL_BISECT_SMALL_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/detail/bisect/bisect_small.hpp
+ @brief Computation of eigenvalues of a small symmetric, tridiagonal matrix
+
+ Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+ the creation of derivative works is allowed by including the following statement:
+ "This software contains source code provided by NVIDIA Corporation."
+*/
+
+// includes, system
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <float.h>
+
+// includes, project
+
+#include "viennacl/linalg/detail/bisect/structs.hpp"
+
+// includes, kernels
+#include "viennacl/linalg/detail/bisect/bisect_kernel_calls.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+////////////////////////////////////////////////////////////////////////////////
+//! Determine eigenvalues for matrices smaller than VIENNACL_BISECT_MAX_SMALL_MATRIX
+//! @param input handles to input data of kernel
+//! @param result handles to result of kernel
+//! @param mat_size matrix size
+//! @param lg lower limit of Gerschgorin interval
+//! @param ug upper limit of Gerschgorin interval
+//! @param precision desired precision of eigenvalues
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+void
+computeEigenvaluesSmallMatrix(const InputData<NumericT> &input, ResultDataSmall<NumericT> &result,
+ const unsigned int mat_size,
+ const NumericT lg, const NumericT ug,
+ const NumericT precision)
+{
+ viennacl::linalg::detail::bisectSmall( input, result, mat_size, lg, ug, precision);
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+//! Process the result obtained on the device, i.e. transfer it to the host and
+//! perform basic sanity checks
+//! @param result handles to result data
+//! @param mat_size matrix size
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+void
+processResultSmallMatrix(ResultDataSmall<NumericT> &result,
+ const unsigned int mat_size)
+{
+ // copy data back to host
+ std::vector<NumericT> left(mat_size);
+ std::vector<unsigned int> left_count(mat_size);
+
+ viennacl::copy(result.vcl_g_left, left);
+ viennacl::copy(result.vcl_g_left_count, left_count);
+
+ for (unsigned int i = 0; i < mat_size; ++i)
+ {
+ result.std_eigenvalues[left_count[i]] = left[i];
+ }
+}
+} // namespace detail
+} // namespace linalg
+} // namespace viennacl
+#endif
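The small-matrix path is analogous but single-stage; a host-side sketch (not part of the patch, placeholder data and Gerschgorin bounds, and again assuming an OpenCL or CUDA backend is active):

    #include <vector>
    #include "viennacl/linalg/detail/bisect/structs.hpp"
    #include "viennacl/linalg/detail/bisect/bisect_small.hpp"

    std::vector<float> eigenvalues_small_sketch()
    {
      const unsigned int n = 32;                          // must not exceed VIENNACL_BISECT_MAX_SMALL_MATRIX
      std::vector<float> diag(n, 2.0f), super(n, -1.0f);
      super[0] = 0.0f;                                    // first superdiagonal entry is unused

      viennacl::linalg::detail::InputData<float>       input(diag, super, n);
      viennacl::linalg::detail::ResultDataSmall<float> result(n);

      viennacl::linalg::detail::computeEigenvaluesSmallMatrix(input, result, n,
                                                              0.0f, 4.0f,   // Gerschgorin bounds (placeholders)
                                                              1e-5f);       // target precision
      viennacl::linalg::detail::processResultSmallMatrix(result, n);

      return result.std_eigenvalues;
    }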
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/config.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/config.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/config.hpp
new file mode 100755
index 0000000..3afa509
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/config.hpp
@@ -0,0 +1,44 @@
+#ifndef VIENNACL_LINALG_DETAIL_BISECT_CONFIG_HPP_
+#define VIENNACL_LINALG_DETAIL_BISECT_CONFIG_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+
+/** @file viennacl/linalg/detail/bisect/config.hpp
+ * @brief Global configuration parameters
+ *
+ * Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+ * the creation of derivative works is allowed by including the following statement:
+ * "This software contains source code provided by NVIDIA Corporation."
+ * */
+
+// should be a power of two
+#define VIENNACL_BISECT_MAX_THREADS_BLOCK 256
+
+#ifdef VIENNACL_WITH_OPENCL
+# define VIENNACL_BISECT_MAX_SMALL_MATRIX 256
+# define VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX 256
+#else // if CUDA is used
+# define VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX 512 // change to 256 if errors occur
+# define VIENNACL_BISECT_MAX_SMALL_MATRIX 512 // change to 256 if errors occur
+#endif
+
+#define VIENNACL_BISECT_MIN_ABS_INTERVAL 5.0e-37
+
+#endif
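VIENNACL_BISECT_MAX_SMALL_MATRIX is the constant that separates the two code paths above. A branching sketch (not part of the patch; whether the comparison is inclusive is an assumption made here for illustration):

    #include <vector>
    #include "viennacl/linalg/detail/bisect/config.hpp"
    #include "viennacl/linalg/detail/bisect/structs.hpp"
    #include "viennacl/linalg/detail/bisect/bisect_small.hpp"
    #include "viennacl/linalg/detail/bisect/bisect_large.hpp"

    template<typename NumericT>
    std::vector<NumericT> bisect_path_sketch(viennacl::linalg::detail::InputData<NumericT> & input,
                                             unsigned int mat_size,
                                             NumericT lg, NumericT ug, NumericT precision)
    {
      if (mat_size <= VIENNACL_BISECT_MAX_SMALL_MATRIX)
      {
        // single-kernel path for small matrices
        viennacl::linalg::detail::ResultDataSmall<NumericT> result(mat_size);
        viennacl::linalg::detail::computeEigenvaluesSmallMatrix(input, result, mat_size, lg, ug, precision);
        viennacl::linalg::detail::processResultSmallMatrix(result, mat_size);
        return result.std_eigenvalues;
      }
      // multi-kernel path for large matrices
      viennacl::linalg::detail::ResultDataLarge<NumericT> result(mat_size);
      viennacl::linalg::detail::computeEigenvaluesLargeMatrix(input, result, mat_size, lg, ug, precision);
      viennacl::linalg::detail::processResultDataLargeMatrix(result, mat_size);
      return result.std_eigenvalues;
    }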
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/gerschgorin.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/gerschgorin.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/gerschgorin.hpp
new file mode 100755
index 0000000..53cd863
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/gerschgorin.hpp
@@ -0,0 +1,94 @@
+#ifndef VIENNACL_LINALG_DETAIL_BISECT_GERSCHGORIN_HPP_
+#define VIENNACL_LINALG_DETAIL_BISECT_GERSCHGORIN_HPP_
+
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/detail/bisect/gerschgorin.hpp
+ @brief Computation of Gerschgorin interval for symmetric, tridiagonal matrix
+
+ Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+ the creation of derivative works is allowed by including the following statement:
+ "This software contains source code provided by NVIDIA Corporation."
+*/
+
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+#include <cfloat>
+
+#include "viennacl/linalg/detail/bisect/util.hpp"
+#include "viennacl/vector.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+ ////////////////////////////////////////////////////////////////////////////////
+ //! Compute Gerschgorin interval for symmetric, tridiagonal matrix
+ //! @param d diagonal elements
+ //! @param s superdiagonal elements
+ //! @param n size of matrix
+ //! @param lg lower limit of Gerschgorin interval
+ //! @param ug upper limit of Gerschgorin interval
+ ////////////////////////////////////////////////////////////////////////////////
+ template<typename NumericT>
+ void
+ computeGerschgorin(std::vector<NumericT> & d, std::vector<NumericT> & s, unsigned int n, NumericT &lg, NumericT &ug)
+ {
+ // compute bounds
+ for (unsigned int i = 1; i < (n - 1); ++i)
+ {
+
+ // sum over the absolute values of all elements of row i
+ NumericT sum_abs_ni = fabsf(s[i]) + fabsf(s[i + 1]);
+
+ lg = min(lg, d[i] - sum_abs_ni);
+ ug = max(ug, d[i] + sum_abs_ni);
+ }
+
+ // first and last row, only one superdiagonal element
+
+ // first row
+ lg = min(lg, d[0] - fabsf(s[1]));
+ ug = max(ug, d[0] + fabsf(s[1]));
+
+ // last row
+ lg = min(lg, d[n-1] - fabsf(s[n-1]));
+ ug = max(ug, d[n-1] + fabsf(s[n-1]));
+
+ // increase interval to avoid side effects of fp arithmetic
+ NumericT bnorm = max(fabsf(ug), fabsf(lg));
+
+    // these values depend on the implementation of floating-point arithmetic
+    // employed in the following
+ NumericT psi_0 = 11 * FLT_EPSILON * bnorm;
+ NumericT psi_n = 11 * FLT_EPSILON * bnorm;
+
+ lg = lg - bnorm * 2 * static_cast<NumericT>(n) * FLT_EPSILON - psi_0;
+ ug = ug + bnorm * 2 * static_cast<NumericT>(n) * FLT_EPSILON + psi_n;
+
+ ug = max(lg, ug);
+ }
+} // namespace detail
+} // namespace linalg
+} // namespace viennacl
+#endif // VIENNACL_LINALG_DETAIL_BISECT_GERSCHGORIN_HPP_
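As a concrete check of the formula: for the 3x3 tridiagonal matrix with diagonal (2, 2, 2) and superdiagonal (-1, -1), every Gerschgorin disc lies inside [2 - 2, 2 + 2] = [0, 4], and the routine then widens this interval slightly to absorb floating-point effects. A minimal sketch (not part of the patch; lg/ug must be seeded so the min/max folding works):

    #include <cfloat>
    #include <vector>
    #include "viennacl/linalg/detail/bisect/gerschgorin.hpp"

    void gerschgorin_sketch()
    {
      std::vector<float> d(3, 2.0f);            // diagonal
      std::vector<float> s(3, -1.0f);           // superdiagonal; s[0] is unused
      s[0] = 0.0f;

      float lg =  FLT_MAX;                      // seed so the min() folding works
      float ug = -FLT_MAX;                      // seed so the max() folding works
      viennacl::linalg::detail::computeGerschgorin(d, s, 3, lg, ug);
      // lg is now slightly below 0, ug slightly above 4
    }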
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/structs.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/structs.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/structs.hpp
new file mode 100755
index 0000000..1943da3
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/structs.hpp
@@ -0,0 +1,182 @@
+#ifndef VIENNACL_LINALG_DETAIL_BISECT_STRUCTS_HPP_
+#define VIENNACL_LINALG_DETAIL_BISECT_STRUCTS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/detail/bisect/structs.hpp
+ @brief Helper structures to simplify variable handling
+
+ Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+ the creation of derivative works is allowed by including the following statement:
+ "This software contains source code provided by NVIDIA Corporation."
+*/
+
+
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <float.h>
+#include <assert.h>
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+
+/////////////////////////////////////////////////////////////////////////////////
+//! Stores the input matrix (diagonal and superdiagonal), on both host and device
+/////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+struct InputData
+{
+ //! host side representation of diagonal
+ std::vector<NumericT> std_a;
+  //! host side representation of superdiagonal
+ std::vector<NumericT> std_b;
+ //! device side representation of diagonal
+ viennacl::vector<NumericT> g_a;
+  //! device side representation of superdiagonal
+ viennacl::vector<NumericT> g_b;
+
+ /** @brief Initialize the input data to the algorithm
+ *
+ * @param diagonal vector with the diagonal elements
+ * @param superdiagonal vector with the superdiagonal elements
+ * @param sz size of the matrix
+ */
+ InputData(std::vector<NumericT> diagonal, std::vector<NumericT> superdiagonal, const unsigned int sz) :
+ std_a(sz), std_b(sz), g_a(sz), g_b(sz)
+ {
+ std_a = diagonal;
+ std_b = superdiagonal;
+
+ viennacl::copy(std_b, g_b);
+ viennacl::copy(std_a, g_a);
+ }
+
+ InputData(viennacl::vector<NumericT> diagonal, viennacl::vector<NumericT> superdiagonal, const unsigned int sz) :
+ std_a(sz), std_b(sz), g_a(sz), g_b(sz)
+ {
+ g_a = diagonal;
+ g_b = superdiagonal;
+
+ viennacl::copy(g_a, std_a);
+ viennacl::copy(g_b, std_b);
+ }
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////
+//! Stores the result data for small matrices
+/////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+struct ResultDataSmall
+{
+ //! eigenvalues (host side)
+ std::vector<NumericT> std_eigenvalues;
+ //! left interval limits at the end of the computation
+ viennacl::vector<NumericT> vcl_g_left;
+ //! right interval limits at the end of the computation
+ viennacl::vector<NumericT> vcl_g_right;
+ //! number of eigenvalues smaller than the left interval limit
+ viennacl::vector<unsigned int> vcl_g_left_count;
+ //! number of eigenvalues bigger than the right interval limit
+ viennacl::vector<unsigned int> vcl_g_right_count;
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ //! Initialize variables and memory for the result for small matrices
+ ////////////////////////////////////////////////////////////////////////////////
+ ResultDataSmall(const unsigned int mat_size) :
+ std_eigenvalues(mat_size), vcl_g_left(mat_size), vcl_g_right(mat_size), vcl_g_left_count(mat_size), vcl_g_right_count(mat_size) {}
+};
+
+
+
+
+
+/////////////////////////////////////////////////////////////////////////////////
+//! Stores the result data for large matrices
+/////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+struct ResultDataLarge
+{
+//! eigenvalues
+ std::vector<NumericT> std_eigenvalues;
+
+ //! number of intervals containing one eigenvalue after the first step
+ viennacl::scalar<unsigned int> g_num_one;
+
+  //! number of (thread) blocks of intervals containing multiple eigenvalues after the first step
+ viennacl::scalar<unsigned int> g_num_blocks_mult;
+
+ //! left interval limits of intervals containing one eigenvalue after the first iteration step
+ viennacl::vector<NumericT> g_left_one;
+
+ //! right interval limits of intervals containing one eigenvalue after the first iteration step
+ viennacl::vector<NumericT> g_right_one;
+
+  //! interval indices (position in the sorted list of eigenvalues) of intervals containing one eigenvalue after the first iteration step
+ viennacl::vector<unsigned int> g_pos_one;
+
+ //! left interval limits of intervals containing multiple eigenvalues after the first iteration step
+ viennacl::vector<NumericT> g_left_mult;
+ //! right interval limits of intervals containing multiple eigenvalues after the first iteration step
+ viennacl::vector<NumericT> g_right_mult;
+
+ //! number of eigenvalues less than the left limit of the eigenvalue intervals containing multiple eigenvalues
+ viennacl::vector<unsigned int> g_left_count_mult;
+
+ //! number of eigenvalues less than the right limit of the eigenvalue intervals containing multiple eigenvalues
+ viennacl::vector<unsigned int> g_right_count_mult;
+ //! start addresses in g_left_mult etc. of blocks of intervals containing more than one eigenvalue after the first step
+ viennacl::vector<unsigned int> g_blocks_mult;
+
+ //! accumulated number of intervals in g_left_mult etc. of blocks of intervals containing more than one eigenvalue after the first step
+ viennacl::vector<unsigned int> g_blocks_mult_sum;
+
+ //! eigenvalues that have been generated in the second step from intervals that still contained multiple eigenvalues after the first step
+ viennacl::vector<NumericT> g_lambda_mult;
+
+ //! eigenvalue index of intervals that have been generated in the second processing step
+ viennacl::vector<unsigned int> g_pos_mult;
+
+ /** @brief Initialize variables and memory for result
+ *
+ * @param mat_size size of the matrix
+ */
+ ResultDataLarge(unsigned int mat_size) :
+ std_eigenvalues(mat_size), g_num_one(0), g_num_blocks_mult(0),
+ g_left_one(mat_size), g_right_one(mat_size), g_pos_one(mat_size),
+ g_left_mult(mat_size), g_right_mult(mat_size), g_left_count_mult(mat_size), g_right_count_mult(mat_size),
+ g_blocks_mult(mat_size), g_blocks_mult_sum(mat_size), g_lambda_mult(mat_size), g_pos_mult(mat_size) {}
+
+};
+} // namespace detail
+} // namespace linalg
+} // namespace viennacl
+#endif // #ifndef VIENNACL_LINALG_DETAIL_BISECT_STRUCTS_HPP_
+
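InputData can be populated either from host std::vectors (which are copied to the device) or from viennacl::vectors already on the device (which are mirrored back to the host); both constructors keep the two representations in sync. A short sketch of both routes (not part of the patch):

    #include <vector>
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/detail/bisect/structs.hpp"

    void input_data_sketch()
    {
      const unsigned int n = 8;

      // 1) from host data: std_a/std_b are filled, g_a/g_b are copied to the device
      std::vector<float> d(n, 2.0f), s(n, -1.0f);
      viennacl::linalg::detail::InputData<float> from_host(d, s, n);

      // 2) from device data: g_a/g_b are assigned, std_a/std_b are copied back to the host
      viennacl::vector<float> vd(n), vs(n);
      viennacl::copy(d, vd);
      viennacl::copy(s, vs);
      viennacl::linalg::detail::InputData<float> from_device(vd, vs, n);
    }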
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/util.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/util.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/util.hpp
new file mode 100755
index 0000000..883d202
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/bisect/util.hpp
@@ -0,0 +1,106 @@
+#ifndef VIENNACL_LINALG_DETAIL_BISECT_UTIL_HPP_
+#define VIENNACL_LINALG_DETAIL_BISECT_UTIL_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/detail/bisect/util.hpp
+ @brief Utility functions
+
+ Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+ the creation of derivative works is allowed by including the following statement:
+ "This software contains source code provided by NVIDIA Corporation."
+*/
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+
+////////////////////////////////////////////////////////////////////////////////
+//! Minimum
+////////////////////////////////////////////////////////////////////////////////
+template<class T>
+#ifdef __CUDACC__
+__host__ __device__
+#endif
+T
+min(const T &lhs, const T &rhs)
+{
+
+ return (lhs < rhs) ? lhs : rhs;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Maximum
+////////////////////////////////////////////////////////////////////////////////
+template<class T>
+#ifdef __CUDACC__
+__host__ __device__
+#endif
+T
+max(const T &lhs, const T &rhs)
+{
+
+ return (lhs < rhs) ? rhs : lhs;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Sign of number (float)
+////////////////////////////////////////////////////////////////////////////////
+#ifdef __CUDACC__
+__host__ __device__
+#endif
+inline float
+sign_f(const float &val)
+{
+ return (val < 0.0f) ? -1.0f : 1.0f;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Sign of number (double)
+////////////////////////////////////////////////////////////////////////////////
+#ifdef __CUDACC__
+__host__ __device__
+#endif
+inline double
+sign_d(const double &val)
+{
+ return (val < 0.0) ? -1.0 : 1.0;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//! Get the number of blocks that are required to process \a num_threads with
+//! \a num_threads_block threads per block
+///////////////////////////////////////////////////////////////////////////////
+extern "C"
+inline
+unsigned int
+getNumBlocksLinear(const unsigned int num_threads,
+ const unsigned int num_threads_block)
+{
+ const unsigned int block_rem =
+ ((num_threads % num_threads_block) != 0) ? 1 : 0;
+ return (num_threads / num_threads_block) + block_rem;
+}
+} // namespace detail
+} // namespace linalg
+} // namespace viennacl
+#endif // #ifndef VIENNACL_LINALG_DETAIL_BISECT_UTIL_HPP_
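getNumBlocksLinear simply rounds the thread count up to whole blocks; for example, 1000 threads at 256 threads per block need ceil(1000/256) = 4 blocks. A tiny sketch (not part of the patch):

    #include <cassert>
    #include "viennacl/linalg/detail/bisect/util.hpp"

    void util_sketch()
    {
      using viennacl::linalg::detail::getNumBlocksLinear;
      assert(getNumBlocksLinear(1000, 256) == 4);  // 3 full blocks + 1 remainder block
      assert(getNumBlocksLinear(1024, 256) == 4);  // exact multiple, no extra block
    }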
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/block_ilu.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/block_ilu.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/block_ilu.hpp
new file mode 100644
index 0000000..1540e2d
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/block_ilu.hpp
@@ -0,0 +1,617 @@
+#ifndef VIENNACL_LINALG_DETAIL_BLOCK_ILU_HPP_
+#define VIENNACL_LINALG_DETAIL_BLOCK_ILU_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/ilu/block_ilu.hpp
+ @brief Implementations of incomplete block factorization preconditioners
+*/
+
+#include <vector>
+#include <cmath>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/detail/ilu/common.hpp"
+#include "viennacl/linalg/detail/ilu/ilu0.hpp"
+#include "viennacl/linalg/detail/ilu/ilut.hpp"
+
+#include <map>
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+ /** @brief Helper range class for representing a subvector of a larger buffer. */
+ template<typename VectorT, typename NumericT, typename SizeT = vcl_size_t>
+ class ilu_vector_range
+ {
+ public:
+ ilu_vector_range(VectorT & v,
+ SizeT start_index,
+ SizeT vec_size
+ ) : vec_(v), start_(start_index), size_(vec_size) {}
+
+ NumericT & operator()(SizeT index)
+ {
+ assert(index < size_ && bool("Index out of bounds!"));
+ return vec_[start_ + index];
+ }
+
+ NumericT & operator[](SizeT index)
+ {
+ assert(index < size_ && bool("Index out of bounds!"));
+ return vec_[start_ + index];
+ }
+
+ SizeT size() const { return size_; }
+
+ private:
+ VectorT & vec_;
+ SizeT start_;
+ SizeT size_;
+ };
+
+ /** @brief Extracts a diagonal block from a larger system matrix
+ *
+ * @param A The full matrix
+   * @param diagonal_block_A   The output matrix to which the extracted block is written
+ * @param start_index First row- and column-index of the block
+ * @param stop_index First row- and column-index beyond the block
+ */
+ template<typename NumericT>
+ void extract_block_matrix(viennacl::compressed_matrix<NumericT> const & A,
+ viennacl::compressed_matrix<NumericT> & diagonal_block_A,
+ vcl_size_t start_index,
+ vcl_size_t stop_index
+ )
+ {
+ assert( (A.handle1().get_active_handle_id() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILU0") );
+ assert( (A.handle2().get_active_handle_id() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILU0") );
+ assert( (A.handle().get_active_handle_id() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILU0") );
+
+ NumericT const * A_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
+ unsigned int const * A_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+ unsigned int const * A_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+
+ NumericT * output_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(diagonal_block_A.handle());
+ unsigned int * output_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(diagonal_block_A.handle1());
+ unsigned int * output_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(diagonal_block_A.handle2());
+
+ vcl_size_t output_counter = 0;
+ for (vcl_size_t row = start_index; row < stop_index; ++row)
+ {
+ unsigned int buffer_col_start = A_row_buffer[row];
+ unsigned int buffer_col_end = A_row_buffer[row+1];
+
+ output_row_buffer[row - start_index] = static_cast<unsigned int>(output_counter);
+
+ for (unsigned int buf_index = buffer_col_start; buf_index < buffer_col_end; ++buf_index)
+ {
+ unsigned int col = A_col_buffer[buf_index];
+ if (col < start_index)
+ continue;
+
+ if (col >= static_cast<unsigned int>(stop_index))
+ continue;
+
+ output_col_buffer[output_counter] = static_cast<unsigned int>(col - start_index);
+ output_elements[output_counter] = A_elements[buf_index];
+ ++output_counter;
+ }
+ output_row_buffer[row - start_index + 1] = static_cast<unsigned int>(output_counter);
+ }
+ }
+
+} // namespace detail
+
+
+
+/** @brief A block ILU preconditioner class, can be supplied to solve()-routines
+ *
+ * @tparam MatrixType Type of the system matrix
+ * @tparam ILUTag      Type of the tag identifying the ILU preconditioner to be used on each block.
+*/
+template<typename MatrixT, typename ILUTag>
+class block_ilu_precond
+{
+typedef typename MatrixT::value_type ScalarType;
+
+public:
+ typedef std::vector<std::pair<vcl_size_t, vcl_size_t> > index_vector_type; //the pair refers to index range [a, b) of each block
+
+
+ block_ilu_precond(MatrixT const & mat,
+ ILUTag const & tag,
+ vcl_size_t num_blocks = 8
+ ) : tag_(tag), L_blocks(num_blocks), U_blocks(num_blocks)
+ {
+ // Set up vector of block indices:
+ block_indices_.resize(num_blocks);
+ for (vcl_size_t i=0; i<num_blocks; ++i)
+ {
+ vcl_size_t start_index = ( i * mat.size1()) / num_blocks;
+ vcl_size_t stop_index = ((i+1) * mat.size1()) / num_blocks;
+
+ block_indices_[i] = std::pair<vcl_size_t, vcl_size_t>(start_index, stop_index);
+ }
+
+ //initialize preconditioner:
+ //std::cout << "Start CPU precond" << std::endl;
+ init(mat);
+ //std::cout << "End CPU precond" << std::endl;
+ }
+
+ block_ilu_precond(MatrixT const & mat,
+ ILUTag const & tag,
+ index_vector_type const & block_boundaries
+ ) : tag_(tag), block_indices_(block_boundaries), L_blocks(block_boundaries.size()), U_blocks(block_boundaries.size())
+ {
+ //initialize preconditioner:
+ //std::cout << "Start CPU precond" << std::endl;
+ init(mat);
+ //std::cout << "End CPU precond" << std::endl;
+ }
+
+
+ template<typename VectorT>
+ void apply(VectorT & vec) const
+ {
+ for (vcl_size_t i=0; i<block_indices_.size(); ++i)
+ apply_dispatch(vec, i, ILUTag());
+ }
+
+private:
+ void init(MatrixT const & A)
+ {
+ viennacl::context host_context(viennacl::MAIN_MEMORY);
+ viennacl::compressed_matrix<ScalarType> mat(host_context);
+
+ viennacl::copy(A, mat);
+
+ unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(mat.handle1());
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i2=0; i2<static_cast<long>(block_indices_.size()); ++i2)
+ {
+ vcl_size_t i = static_cast<vcl_size_t>(i2);
+ // Step 1: Extract blocks
+ vcl_size_t block_size = block_indices_[i].second - block_indices_[i].first;
+ vcl_size_t block_nnz = row_buffer[block_indices_[i].second] - row_buffer[block_indices_[i].first];
+ viennacl::compressed_matrix<ScalarType> mat_block(block_size, block_size, block_nnz, host_context);
+
+ detail::extract_block_matrix(mat, mat_block, block_indices_[i].first, block_indices_[i].second);
+
+ // Step 2: Precondition blocks:
+ viennacl::switch_memory_context(L_blocks[i], host_context);
+ viennacl::switch_memory_context(U_blocks[i], host_context);
+ init_dispatch(mat_block, L_blocks[i], U_blocks[i], tag_);
+ }
+
+ }
+
+ void init_dispatch(viennacl::compressed_matrix<ScalarType> const & mat_block,
+ viennacl::compressed_matrix<ScalarType> & L,
+ viennacl::compressed_matrix<ScalarType> & U,
+ viennacl::linalg::ilu0_tag)
+ {
+ (void)U;
+ L = mat_block;
+ viennacl::linalg::precondition(L, tag_);
+ }
+
+ void init_dispatch(viennacl::compressed_matrix<ScalarType> const & mat_block,
+ viennacl::compressed_matrix<ScalarType> & L,
+ viennacl::compressed_matrix<ScalarType> & U,
+ viennacl::linalg::ilut_tag)
+ {
+ L.resize(mat_block.size1(), mat_block.size2());
+ U.resize(mat_block.size1(), mat_block.size2());
+ viennacl::linalg::precondition(mat_block, L, U, tag_);
+ }
+
+ template<typename VectorT>
+ void apply_dispatch(VectorT & vec, vcl_size_t i, viennacl::linalg::ilu0_tag) const
+ {
+ detail::ilu_vector_range<VectorT, ScalarType> vec_range(vec, block_indices_[i].first, L_blocks[i].size2());
+
+ unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L_blocks[i].handle1());
+ unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L_blocks[i].handle2());
+ ScalarType const * elements = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(L_blocks[i].handle());
+
+ viennacl::linalg::host_based::detail::csr_inplace_solve<ScalarType>(row_buffer, col_buffer, elements, vec_range, L_blocks[i].size2(), unit_lower_tag());
+ viennacl::linalg::host_based::detail::csr_inplace_solve<ScalarType>(row_buffer, col_buffer, elements, vec_range, L_blocks[i].size2(), upper_tag());
+ }
+
+ template<typename VectorT>
+ void apply_dispatch(VectorT & vec, vcl_size_t i, viennacl::linalg::ilut_tag) const
+ {
+ detail::ilu_vector_range<VectorT, ScalarType> vec_range(vec, block_indices_[i].first, L_blocks[i].size2());
+
+ {
+ unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L_blocks[i].handle1());
+ unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L_blocks[i].handle2());
+ ScalarType const * elements = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(L_blocks[i].handle());
+
+ viennacl::linalg::host_based::detail::csr_inplace_solve<ScalarType>(row_buffer, col_buffer, elements, vec_range, L_blocks[i].size2(), unit_lower_tag());
+ }
+
+ {
+ unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U_blocks[i].handle1());
+ unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U_blocks[i].handle2());
+ ScalarType const * elements = viennacl::linalg::host_based::detail::extract_raw_pointer<ScalarType>(U_blocks[i].handle());
+
+ viennacl::linalg::host_based::detail::csr_inplace_solve<ScalarType>(row_buffer, col_buffer, elements, vec_range, U_blocks[i].size2(), upper_tag());
+ }
+ }
+
+ ILUTag tag_;
+ index_vector_type block_indices_;
+ std::vector< viennacl::compressed_matrix<ScalarType> > L_blocks;
+ std::vector< viennacl::compressed_matrix<ScalarType> > U_blocks;
+};
+
+
+
+
+
+/** @brief Block ILU preconditioner class, can be supplied to solve()-routines.
+*
+* Specialization for compressed_matrix
+*/
+template<typename NumericT, unsigned int AlignmentV, typename ILUTagT>
+class block_ilu_precond< compressed_matrix<NumericT, AlignmentV>, ILUTagT>
+{
+ typedef compressed_matrix<NumericT, AlignmentV> MatrixType;
+
+public:
+ typedef std::vector<std::pair<vcl_size_t, vcl_size_t> > index_vector_type; //the pair refers to index range [a, b) of each block
+
+
+ block_ilu_precond(MatrixType const & mat,
+ ILUTagT const & tag,
+ vcl_size_t num_blocks = 8
+ ) : tag_(tag),
+ block_indices_(num_blocks),
+ gpu_block_indices_(),
+ gpu_L_trans_(0, 0, viennacl::context(viennacl::MAIN_MEMORY)),
+ gpu_U_trans_(0, 0, viennacl::context(viennacl::MAIN_MEMORY)),
+ gpu_D_(mat.size1(), viennacl::context(viennacl::MAIN_MEMORY)),
+ L_blocks_(num_blocks),
+ U_blocks_(num_blocks)
+ {
+ // Set up vector of block indices:
+ block_indices_.resize(num_blocks);
+ for (vcl_size_t i=0; i<num_blocks; ++i)
+ {
+ vcl_size_t start_index = ( i * mat.size1()) / num_blocks;
+ vcl_size_t stop_index = ((i+1) * mat.size1()) / num_blocks;
+
+ block_indices_[i] = std::pair<vcl_size_t, vcl_size_t>(start_index, stop_index);
+ }
+
+ //initialize preconditioner:
+ //std::cout << "Start CPU precond" << std::endl;
+ init(mat);
+ //std::cout << "End CPU precond" << std::endl;
+ }
+
+ block_ilu_precond(MatrixType const & mat,
+ ILUTagT const & tag,
+ index_vector_type const & block_boundaries
+ ) : tag_(tag),
+ block_indices_(block_boundaries),
+ gpu_block_indices_(),
+ gpu_L_trans_(0, 0, viennacl::context(viennacl::MAIN_MEMORY)),
+ gpu_U_trans_(0, 0, viennacl::context(viennacl::MAIN_MEMORY)),
+ gpu_D_(mat.size1(), viennacl::context(viennacl::MAIN_MEMORY)),
+ L_blocks_(block_boundaries.size()),
+ U_blocks_(block_boundaries.size())
+ {
+ //initialize preconditioner:
+ //std::cout << "Start CPU precond" << std::endl;
+ init(mat);
+ //std::cout << "End CPU precond" << std::endl;
+ }
+
+
+ void apply(vector<NumericT> & vec) const
+ {
+ viennacl::linalg::detail::block_inplace_solve(trans(gpu_L_trans_), gpu_block_indices_, block_indices_.size(), gpu_D_,
+ vec,
+ viennacl::linalg::unit_lower_tag());
+
+ viennacl::linalg::detail::block_inplace_solve(trans(gpu_U_trans_), gpu_block_indices_, block_indices_.size(), gpu_D_,
+ vec,
+ viennacl::linalg::upper_tag());
+
+ //apply_cpu(vec);
+ }
+
+
+private:
+
+ void init(MatrixType const & A)
+ {
+ viennacl::context host_context(viennacl::MAIN_MEMORY);
+ viennacl::compressed_matrix<NumericT> mat(host_context);
+
+ mat = A;
+
+ unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(mat.handle1());
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i=0; i<static_cast<long>(block_indices_.size()); ++i)
+ {
+ // Step 1: Extract blocks
+ vcl_size_t block_size = block_indices_[static_cast<vcl_size_t>(i)].second - block_indices_[static_cast<vcl_size_t>(i)].first;
+ vcl_size_t block_nnz = row_buffer[block_indices_[static_cast<vcl_size_t>(i)].second] - row_buffer[block_indices_[static_cast<vcl_size_t>(i)].first];
+ viennacl::compressed_matrix<NumericT> mat_block(block_size, block_size, block_nnz, host_context);
+
+ detail::extract_block_matrix(mat, mat_block, block_indices_[static_cast<vcl_size_t>(i)].first, block_indices_[static_cast<vcl_size_t>(i)].second);
+
+ // Step 2: Precondition blocks:
+ viennacl::switch_memory_context(L_blocks_[static_cast<vcl_size_t>(i)], host_context);
+ viennacl::switch_memory_context(U_blocks_[static_cast<vcl_size_t>(i)], host_context);
+ init_dispatch(mat_block, L_blocks_[static_cast<vcl_size_t>(i)], U_blocks_[static_cast<vcl_size_t>(i)], tag_);
+ }
+
+ /*
+ * copy resulting preconditioner back to GPU:
+ */
+ viennacl::backend::typesafe_host_array<unsigned int> block_indices_uint(gpu_block_indices_, 2 * block_indices_.size());
+ for (vcl_size_t i=0; i<block_indices_.size(); ++i)
+ {
+ block_indices_uint.set(2*i, block_indices_[i].first);
+ block_indices_uint.set(2*i + 1, block_indices_[i].second);
+ }
+
+ viennacl::backend::memory_create(gpu_block_indices_, block_indices_uint.raw_size(), viennacl::traits::context(A), block_indices_uint.get());
+
+ blocks_to_device(A);
+
+ }
+
+  // Copy the computed preconditioner blocks over to the target device
+ void blocks_to_device(MatrixType const & A)
+ {
+ gpu_L_trans_.resize(A.size1(), A.size2());
+ gpu_U_trans_.resize(A.size1(), A.size2());
+ gpu_D_.resize(A.size1());
+
+ unsigned int * L_trans_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(gpu_L_trans_.handle1());
+ unsigned int * U_trans_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(gpu_U_trans_.handle1());
+
+ //
+ // Count elements per row
+ //
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long block_index2 = 0; block_index2 < static_cast<long>(L_blocks_.size()); ++block_index2)
+ {
+ vcl_size_t block_index = vcl_size_t(block_index2);
+
+ unsigned int block_start = static_cast<unsigned int>(block_indices_[block_index].first);
+ unsigned int block_stop = static_cast<unsigned int>(block_indices_[block_index].second);
+
+ unsigned int const * L_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L_blocks_[block_index].handle1());
+ unsigned int const * L_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L_blocks_[block_index].handle2());
+
+ // zero row array of L:
+ std::fill(L_trans_row_buffer + block_start,
+ L_trans_row_buffer + block_stop,
+ static_cast<unsigned int>(0));
+
+ // count number of elements per row:
+ for (vcl_size_t row = 0; row < L_blocks_[block_index].size1(); ++row)
+ {
+ unsigned int col_start = L_row_buffer[row];
+ unsigned int col_end = L_row_buffer[row+1];
+
+ for (unsigned int j = col_start; j < col_end; ++j)
+ {
+ unsigned int col = L_col_buffer[j];
+ if (col < static_cast<unsigned int>(row))
+ L_trans_row_buffer[col + block_start] += 1;
+ }
+ }
+
+ ////// same for U
+
+ unsigned int const * U_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U_blocks_[block_index].handle1());
+ unsigned int const * U_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U_blocks_[block_index].handle2());
+
+ // zero row array of U:
+ std::fill(U_trans_row_buffer + block_start,
+ U_trans_row_buffer + block_stop,
+ static_cast<unsigned int>(0));
+
+ // count number of elements per row:
+ for (vcl_size_t row = 0; row < U_blocks_[block_index].size1(); ++row)
+ {
+ unsigned int col_start = U_row_buffer[row];
+ unsigned int col_end = U_row_buffer[row+1];
+
+ for (unsigned int j = col_start; j < col_end; ++j)
+ {
+ unsigned int col = U_col_buffer[j];
+ if (col > row)
+ U_trans_row_buffer[col + block_start] += 1;
+ }
+ }
+ }
+
+
+ //
+ // Exclusive scan on row buffer (feel free to add parallelization here)
+ //
+ unsigned int current_value = 0;
+ for (vcl_size_t i=0; i<gpu_L_trans_.size1(); ++i)
+ {
+ unsigned int tmp = L_trans_row_buffer[i];
+ L_trans_row_buffer[i] = current_value;
+ current_value += tmp;
+ }
+ gpu_L_trans_.reserve(current_value);
+
+ current_value = 0;
+ for (vcl_size_t i=0; i<gpu_U_trans_.size1(); ++i)
+ {
+ unsigned int tmp = U_trans_row_buffer[i];
+ U_trans_row_buffer[i] = current_value;
+ current_value += tmp;
+ }
+ gpu_U_trans_.reserve(current_value);
+
+
+ //
+ // Fill with data
+ //
+ unsigned int * L_trans_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(gpu_L_trans_.handle2());
+ NumericT * L_trans_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(gpu_L_trans_.handle());
+
+ unsigned int * U_trans_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(gpu_U_trans_.handle2());
+ NumericT * U_trans_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(gpu_U_trans_.handle());
+
+ NumericT * D_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(gpu_D_.handle());
+
+ std::vector<unsigned int> offset_L(gpu_L_trans_.size1());
+ std::vector<unsigned int> offset_U(gpu_U_trans_.size1());
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long block_index2 = 0; block_index2 < static_cast<long>(L_blocks_.size()); ++block_index2)
+ {
+ vcl_size_t block_index = vcl_size_t(block_index2);
+ unsigned int block_start = static_cast<unsigned int>(block_indices_[block_index].first);
+
+ unsigned int const * L_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L_blocks_[block_index].handle1());
+ unsigned int const * L_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L_blocks_[block_index].handle2());
+ NumericT const * L_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT >(L_blocks_[block_index].handle());
+
+
+ // write L_trans:
+ for (vcl_size_t row = 0; row < L_blocks_[block_index].size1(); ++row)
+ {
+ unsigned int col_start = L_row_buffer[row];
+ unsigned int col_end = L_row_buffer[row+1];
+
+ for (unsigned int j = col_start; j < col_end; ++j)
+ {
+ unsigned int col = L_col_buffer[j];
+ if (col < row)
+ {
+ unsigned int row_trans = col + block_start;
+ unsigned int k = L_trans_row_buffer[row_trans] + offset_L[row_trans];
+ offset_L[row_trans] += 1;
+
+ L_trans_col_buffer[k] = static_cast<unsigned int>(row) + block_start;
+ L_trans_elements[k] = L_elements[j];
+ }
+ }
+ }
+
+ unsigned int const * U_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U_blocks_[block_index].handle1());
+ unsigned int const * U_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U_blocks_[block_index].handle2());
+ NumericT const * U_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT >(U_blocks_[block_index].handle());
+
+ // write U_trans and D:
+ for (vcl_size_t row = 0; row < U_blocks_[block_index].size1(); ++row)
+ {
+ unsigned int col_start = U_row_buffer[row];
+ unsigned int col_end = U_row_buffer[row+1];
+
+ for (unsigned int j = col_start; j < col_end; ++j)
+ {
+ unsigned int row_trans = U_col_buffer[j] + block_start;
+ unsigned int k = U_trans_row_buffer[row_trans] + offset_U[row_trans];
+
+ if (row_trans == row + block_start) // entry for D
+ {
+ D_elements[row_trans] = U_elements[j];
+ }
+ else if (row_trans > row + block_start) //entry for U
+ {
+ offset_U[row_trans] += 1;
+
+ U_trans_col_buffer[k] = static_cast<unsigned int>(row) + block_start;
+ U_trans_elements[k] = U_elements[j];
+ }
+ }
+ }
+
+ }
+
+ //
+ // Send to destination device:
+ //
+ viennacl::switch_memory_context(gpu_L_trans_, viennacl::traits::context(A));
+ viennacl::switch_memory_context(gpu_U_trans_, viennacl::traits::context(A));
+ viennacl::switch_memory_context(gpu_D_, viennacl::traits::context(A));
+ }
+
+ void init_dispatch(viennacl::compressed_matrix<NumericT> const & mat_block,
+ viennacl::compressed_matrix<NumericT> & L,
+ viennacl::compressed_matrix<NumericT> & U,
+ viennacl::linalg::ilu0_tag)
+ {
+ L = mat_block;
+ viennacl::linalg::precondition(L, tag_);
+ U = L; // fairly poor workaround...
+ }
+
+ void init_dispatch(viennacl::compressed_matrix<NumericT> const & mat_block,
+ viennacl::compressed_matrix<NumericT> & L,
+ viennacl::compressed_matrix<NumericT> & U,
+ viennacl::linalg::ilut_tag)
+ {
+ L.resize(mat_block.size1(), mat_block.size2());
+ U.resize(mat_block.size1(), mat_block.size2());
+ viennacl::linalg::precondition(mat_block, L, U, tag_);
+ }
+
+
+ ILUTagT tag_;
+ index_vector_type block_indices_;
+ viennacl::backend::mem_handle gpu_block_indices_;
+ viennacl::compressed_matrix<NumericT> gpu_L_trans_;
+ viennacl::compressed_matrix<NumericT> gpu_U_trans_;
+ viennacl::vector<NumericT> gpu_D_;
+
+ std::vector<MatrixType> L_blocks_;
+ std::vector<MatrixType> U_blocks_;
+};
+
+
+}
+}
+
+
+
+
+#endif
+
+
+
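For context, this preconditioner is typically constructed once from the assembled system matrix and then handed to an iterative solver. A sketch along the lines of typical ViennaCL usage (not part of the patch; the solver choice, block count, and data are illustrative, and the block count 8 matches the constructor default):

    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/ilu.hpp"        // pulls in block_ilu.hpp
    #include "viennacl/linalg/bicgstab.hpp"

    viennacl::vector<double> solve_with_block_ilu0(viennacl::compressed_matrix<double> const & A,
                                                   viennacl::vector<double> const & rhs)
    {
      // block ILU0 preconditioner acting on 8 diagonal blocks of A
      viennacl::linalg::block_ilu_precond<viennacl::compressed_matrix<double>,
                                          viennacl::linalg::ilu0_tag> precond(A, viennacl::linalg::ilu0_tag(), 8);

      // preconditioned BiCGStab solve
      return viennacl::linalg::solve(A, rhs, viennacl::linalg::bicgstab_tag(), precond);
    }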
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/chow_patel_ilu.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/chow_patel_ilu.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/chow_patel_ilu.hpp
new file mode 100644
index 0000000..7628cdb
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/chow_patel_ilu.hpp
@@ -0,0 +1,316 @@
+#ifndef VIENNACL_LINALG_DETAIL_CHOW_PATEL_ILU_HPP_
+#define VIENNACL_LINALG_DETAIL_CHOW_PATEL_ILU_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/ilu/chow_patel_ilu.hpp
+ @brief Implementations of incomplete factorization preconditioners with fine-grained parallelism.
+
+ Based on "Fine-Grained Parallel Incomplete LU Factorization" by Chow and Patel, SIAM J. Sci. Comput., vol. 37, no. 2, pp. C169-C193
+*/
+
+#include <vector>
+#include <cmath>
+#include <iostream>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/detail/ilu/common.hpp"
+#include "viennacl/linalg/ilu_operations.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/backend/memory.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief A tag for incomplete LU and incomplete Cholesky factorization with static pattern (Parallel-ILU0, Parallel ICC0)
+*/
+class chow_patel_tag
+{
+public:
+ /** @brief Constructor allowing to set the number of sweeps and Jacobi iterations.
+ *
+ * @param num_sweeps Number of sweeps in setup phase
+ * @param num_jacobi_iters Number of Jacobi iterations for each triangular 'solve' when applying the preconditioner to a vector
+ */
+ chow_patel_tag(vcl_size_t num_sweeps = 3, vcl_size_t num_jacobi_iters = 2) : sweeps_(num_sweeps), jacobi_iters_(num_jacobi_iters) {}
+
+ /** @brief Returns the number of sweeps (i.e. number of nonlinear iterations) in the solver setup stage */
+ vcl_size_t sweeps() const { return sweeps_; }
+ /** @brief Sets the number of sweeps (i.e. number of nonlinear iterations) in the solver setup stage */
+ void sweeps(vcl_size_t num) { sweeps_ = num; }
+
+  /** @brief Returns the number of Jacobi iterations (i.e. applications of x_{k+1} = (I - D^{-1}R)x_k + D^{-1} b) used for each of the triangular solves y = U^{-1} x and z = L^{-1} y in each preconditioner application. */
+ vcl_size_t jacobi_iters() const { return jacobi_iters_; }
+ /** @brief Sets the number of Jacobi iterations for each triangular 'solve' when applying the preconditioner to a vector. */
+ void jacobi_iters(vcl_size_t num) { jacobi_iters_ = num; }
+
+private:
+ vcl_size_t sweeps_;
+ vcl_size_t jacobi_iters_;
+};
+
+namespace detail
+{
+ /** @brief Implementation of the parallel ICC0 factorization, Algorithm 3 in Chow-Patel paper.
+ *
+ * Rather than dealing with a column-major upper triangular matrix U, we use the lower-triangular matrix L such that A is approximately given by LL^T.
+ * The advantage is that L is readily available in row-major format.
+ */
+ template<typename NumericT>
+ void precondition(viennacl::compressed_matrix<NumericT> const & A,
+ viennacl::compressed_matrix<NumericT> & L,
+ viennacl::vector<NumericT> & diag_L,
+ viennacl::compressed_matrix<NumericT> & L_trans,
+ chow_patel_tag const & tag)
+ {
+ // make sure L and U have correct dimensions:
+ L.resize(A.size1(), A.size2(), false);
+
+ // initialize L and U from values in A:
+ viennacl::linalg::extract_L(A, L);
+
+ // diagonally scale values from A in L:
+ viennacl::linalg::icc_scale(A, L);
+
+ viennacl::vector<NumericT> aij_L(L.nnz(), viennacl::traits::context(A));
+ viennacl::backend::memory_copy(L.handle(), aij_L.handle(), 0, 0, sizeof(NumericT) * L.nnz());
+
+ // run sweeps:
+ for (vcl_size_t i=0; i<tag.sweeps(); ++i)
+ viennacl::linalg::icc_chow_patel_sweep(L, aij_L);
+
+ // transpose L to obtain L_trans:
+ viennacl::linalg::ilu_transpose(L, L_trans);
+
+ // form (I - D_L^{-1}L) and (I - D_U^{-1} U), with U := L_trans
+ viennacl::linalg::ilu_form_neumann_matrix(L, diag_L);
+ viennacl::linalg::ilu_form_neumann_matrix(L_trans, diag_L);
+ }
+
+
+ /** @brief Implementation of the parallel ILU0 factorization, Algorithm 2 in Chow-Patel paper. */
+ template<typename NumericT>
+ void precondition(viennacl::compressed_matrix<NumericT> const & A,
+ viennacl::compressed_matrix<NumericT> & L,
+ viennacl::vector<NumericT> & diag_L,
+ viennacl::compressed_matrix<NumericT> & U,
+ viennacl::vector<NumericT> & diag_U,
+ chow_patel_tag const & tag)
+ {
+ // make sure L and U have correct dimensions:
+ L.resize(A.size1(), A.size2(), false);
+ U.resize(A.size1(), A.size2(), false);
+
+ // initialize L and U from values in A:
+ viennacl::linalg::extract_LU(A, L, U);
+
+ // diagonally scale values from A in L and U:
+ viennacl::linalg::ilu_scale(A, L, U);
+
+ // transpose storage layout of U from CSR to CSC via transposition
+ viennacl::compressed_matrix<NumericT> U_trans;
+ viennacl::linalg::ilu_transpose(U, U_trans);
+
+ // keep entries of a_ij for the sweeps
+ viennacl::vector<NumericT> aij_L (L.nnz(), viennacl::traits::context(A));
+ viennacl::vector<NumericT> aij_U_trans(U_trans.nnz(), viennacl::traits::context(A));
+
+ viennacl::backend::memory_copy( L.handle(), aij_L.handle(), 0, 0, sizeof(NumericT) * L.nnz());
+ viennacl::backend::memory_copy(U_trans.handle(), aij_U_trans.handle(), 0, 0, sizeof(NumericT) * U_trans.nnz());
+
+ // run sweeps:
+ for (vcl_size_t i=0; i<tag.sweeps(); ++i)
+ viennacl::linalg::ilu_chow_patel_sweep(L, aij_L, U_trans, aij_U_trans);
+
+ // transpose U_trans back:
+ viennacl::linalg::ilu_transpose(U_trans, U);
+
+ // form (I - D_L^{-1}L) and (I - D_U^{-1} U)
+ viennacl::linalg::ilu_form_neumann_matrix(L, diag_L);
+ viennacl::linalg::ilu_form_neumann_matrix(U, diag_U);
+ }
+
+}
+
+
+
+
+/** @brief Parallel Chow-Patel ICC preconditioner class, can be supplied to solve()-routines
+*/
+template<typename MatrixT>
+class chow_patel_icc_precond
+{
+ // only works with compressed_matrix!
+ typedef typename MatrixT::CHOW_PATEL_ICC_ONLY_WORKS_WITH_COMPRESSED_MATRIX error_type;
+};
+
+
+/** @brief Parallel Chow-Patel ICC preconditioner class, can be supplied to solve()-routines.
+*
+* Specialization for compressed_matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+class chow_patel_icc_precond< viennacl::compressed_matrix<NumericT, AlignmentV> >
+{
+
+public:
+ chow_patel_icc_precond(viennacl::compressed_matrix<NumericT, AlignmentV> const & A, chow_patel_tag const & tag)
+ : tag_(tag),
+ L_(0, 0, 0, viennacl::traits::context(A)),
+ diag_L_(A.size1(), viennacl::traits::context(A)),
+ L_trans_(0, 0, 0, viennacl::traits::context(A)),
+ x_k_(A.size1(), viennacl::traits::context(A)),
+ b_(A.size1(), viennacl::traits::context(A))
+ {
+ viennacl::linalg::detail::precondition(A, L_, diag_L_, L_trans_, tag_);
+ }
+
+ /** @brief Preconditioner application: LL^Tx = b, computed via Ly = b, L^Tx = y using Jacobi iterations.
+ *
+ * L contains (I - D_L^{-1}L), L_trans contains (I - D_L^{-1}L^T) where D denotes the respective diagonal matrix
+ */
+ template<typename VectorT>
+ void apply(VectorT & vec) const
+ {
+ //
+ // y = L^{-1} b through Jacobi iteration y_{k+1} = (I - D^{-1}L)y_k + D^{-1}b
+ //
+ b_ = viennacl::linalg::element_div(vec, diag_L_);
+ x_k_ = b_;
+ for (unsigned int i=0; i<tag_.jacobi_iters(); ++i)
+ {
+ vec = viennacl::linalg::prod(L_, x_k_);
+ x_k_ = vec + b_;
+ }
+
+ //
+ // x = U^{-1} y through Jacobi iteration x_{k+1} = (I - D^{-1}L^T)x_k + D^{-1}y
+ //
+ b_ = viennacl::linalg::element_div(x_k_, diag_L_);
+ x_k_ = b_; // x_1 if x_0 \equiv 0
+ for (unsigned int i=0; i<tag_.jacobi_iters(); ++i)
+ {
+ vec = viennacl::linalg::prod(L_trans_, x_k_);
+ x_k_ = vec + b_;
+ }
+
+ // return result:
+ vec = x_k_;
+ }
+
+private:
+ chow_patel_tag tag_;
+ viennacl::compressed_matrix<NumericT> L_;
+ viennacl::vector<NumericT> diag_L_;
+ viennacl::compressed_matrix<NumericT> L_trans_;
+
+ mutable viennacl::vector<NumericT> x_k_;
+ mutable viennacl::vector<NumericT> b_;
+};
+
+
+
+
+
+
+/** @brief Parallel Chow-Patel ILU preconditioner class, can be supplied to solve()-routines
+*/
+template<typename MatrixT>
+class chow_patel_ilu_precond
+{
+ // only works with compressed_matrix!
+ typedef typename MatrixT::CHOW_PATEL_ILU_ONLY_WORKS_WITH_COMPRESSED_MATRIX error_type;
+};
+
+
+/** @brief Parallel Chow-Patel ILU preconditioner class, can be supplied to solve()-routines.
+*
+* Specialization for compressed_matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+class chow_patel_ilu_precond< viennacl::compressed_matrix<NumericT, AlignmentV> >
+{
+
+public:
+ chow_patel_ilu_precond(viennacl::compressed_matrix<NumericT, AlignmentV> const & A, chow_patel_tag const & tag)
+ : tag_(tag),
+ L_(0, 0, 0, viennacl::traits::context(A)),
+ diag_L_(A.size1(), viennacl::traits::context(A)),
+ U_(0, 0, 0, viennacl::traits::context(A)),
+ diag_U_(A.size1(), viennacl::traits::context(A)),
+ x_k_(A.size1(), viennacl::traits::context(A)),
+ b_(A.size1(), viennacl::traits::context(A))
+ {
+ viennacl::linalg::detail::precondition(A, L_, diag_L_, U_, diag_U_, tag_);
+ }
+
+ /** @brief Preconditioner application: LUx = b, computed via Ly = b, Ux = y using Jacobi iterations.
+ *
+ * L_ contains (I - D_L^{-1}L), U_ contains (I - D_U^{-1}U) where D denotes the respective diagonal matrix
+ */
+ template<typename VectorT>
+ void apply(VectorT & vec) const
+ {
+ //
+ // y = L^{-1} b through Jacobi iteration y_{k+1} = (I - D^{-1}L)y_k + D^{-1}b
+ //
+ b_ = viennacl::linalg::element_div(vec, diag_L_);
+ x_k_ = b_;
+ for (unsigned int i=0; i<tag_.jacobi_iters(); ++i)
+ {
+ vec = viennacl::linalg::prod(L_, x_k_);
+ x_k_ = vec + b_;
+ }
+
+ //
+ // x = U^{-1} y through Jacobi iteration x_{k+1} = (I - D^{-1}U)x_k + D^{-1}y
+ //
+ b_ = viennacl::linalg::element_div(x_k_, diag_U_);
+ x_k_ = b_; // x_1 if x_0 \equiv 0
+ for (unsigned int i=0; i<tag_.jacobi_iters(); ++i)
+ {
+ vec = viennacl::linalg::prod(U_, x_k_);
+ x_k_ = vec + b_;
+ }
+
+ // return result:
+ vec = x_k_;
+ }
+
+private:
+ chow_patel_tag tag_;
+ viennacl::compressed_matrix<NumericT> L_;
+ viennacl::vector<NumericT> diag_L_;
+ viennacl::compressed_matrix<NumericT> U_;
+ viennacl::vector<NumericT> diag_U_;
+
+ mutable viennacl::vector<NumericT> x_k_;
+ mutable viennacl::vector<NumericT> b_;
+};
+
+
+} // namespace linalg
+} // namespace viennacl
+
+
+#endif
+
+
+
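For readers of this patch, the following is a minimal host-side usage sketch of the Chow-Patel preconditioners defined above (illustration only, not part of the committed sources). It assumes that chow_patel_tag is default-constructible and that ViennaCL's usual solve(A, rhs, solver_tag, preconditioner) overload is available; the function name and the numeric values are placeholders.

    #include "viennacl/compressed_matrix.hpp"
    #include "viennacl/vector.hpp"
    #include "viennacl/linalg/bicgstab.hpp"
    // plus the Chow-Patel ILU header introduced by this commit (path not shown in this excerpt)

    viennacl::vector<double> solve_with_chow_patel_ilu(viennacl::compressed_matrix<double> const & A,
                                                       viennacl::vector<double> const & rhs)
    {
      // configure the factorization: sweeps() controls the nonlinear setup iterations,
      // jacobi_iters() the truncated triangular solves used in apply()
      viennacl::linalg::chow_patel_tag tag;
      tag.sweeps(3);
      tag.jacobi_iters(2);

      // build the preconditioner (runs the parallel ILU0 factorization above)
      viennacl::linalg::chow_patel_ilu_precond< viennacl::compressed_matrix<double> > precond(A, tag);

      // hand it to an iterative solver in the usual ViennaCL fashion
      return viennacl::linalg::solve(A, rhs, viennacl::linalg::bicgstab_tag(), precond);
    }
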
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/qr.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/qr.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/qr.hpp
new file mode 100644
index 0000000..907eb57
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/qr.hpp
@@ -0,0 +1,497 @@
+#ifndef VIENNACL_LINALG_DETAIL_SPAI_QR_HPP
+#define VIENNACL_LINALG_DETAIL_SPAI_QR_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/spai/qr.hpp
+ @brief Implementation of a simultaneous QR factorization of multiple matrices. Experimental.
+
+ SPAI code contributed by Nikolay Lukash
+*/
+
+#include <utility>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <algorithm>
+#include <vector>
+#include <math.h>
+#include <cmath>
+#include <sstream>
+#include "viennacl/ocl/backend.hpp"
+#include "boost/numeric/ublas/vector.hpp"
+#include "boost/numeric/ublas/matrix.hpp"
+#include "boost/numeric/ublas/matrix_proxy.hpp"
+#include "boost/numeric/ublas/storage.hpp"
+#include "boost/numeric/ublas/io.hpp"
+#include "boost/numeric/ublas/matrix_expression.hpp"
+#include "boost/numeric/ublas/detail/matrix_assign.hpp"
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+
+#include "viennacl/linalg/detail/spai/block_matrix.hpp"
+#include "viennacl/linalg/detail/spai/block_vector.hpp"
+#include "viennacl/linalg/opencl/kernels/spai.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace spai
+{
+
+//********** DEBUG FUNCTIONS *****************//
+template< typename T, typename InputIteratorT>
+void Print(std::ostream & ostr, InputIteratorT it_begin, InputIteratorT it_end)
+{
+ //std::ostream_iterator<int> it_os(ostr, delimiter);
+ std::string delimiters = " ";
+ std::copy(it_begin, it_end, std::ostream_iterator<T>(ostr, delimiters.c_str()));
+ ostr << std::endl;
+}
+
+template<typename VectorT, typename MatrixT>
+void write_to_block(VectorT & con_A_I_J,
+ unsigned int start_ind,
+ std::vector<unsigned int> const & I,
+ std::vector<unsigned int> const & J,
+ MatrixT& m)
+{
+ m.resize(I.size(), J.size(), false);
+ for (vcl_size_t i = 0; i < J.size(); ++i)
+ for (vcl_size_t j = 0; j < I.size(); ++j)
+ m(j,i) = con_A_I_J[start_ind + i*I.size() + j];
+}
+
+template<typename VectorT>
+void print_continious_matrix(VectorT & con_A_I_J,
+ std::vector<cl_uint> & blocks_ind,
+ std::vector<std::vector<unsigned int> > const & g_I,
+ std::vector<std::vector<unsigned int> > const & g_J)
+{
+ typedef typename VectorT::value_type NumericType;
+
+ std::vector<boost::numeric::ublas::matrix<NumericType> > com_A_I_J(g_I.size());
+ for (vcl_size_t i = 0; i < g_I.size(); ++i)
+ {
+ write_to_block(con_A_I_J, blocks_ind[i], g_I[i], g_J[i], com_A_I_J[i]);
+ std::cout << com_A_I_J[i] << std::endl;
+ }
+}
+
+template<typename VectorT>
+void print_continious_vector(VectorT & con_v,
+ std::vector<cl_uint> & block_ind,
+ std::vector<std::vector<unsigned int> > const & g_J)
+{
+ typedef typename VectorT::value_type NumericType;
+
+ std::vector<boost::numeric::ublas::vector<NumericType> > com_v(g_J.size());
+ //Print<ScalarType>(std::cout, con_v.begin(), con_v.end());
+ for (vcl_size_t i = 0; i < g_J.size(); ++i)
+ {
+ com_v[i].resize(g_J[i].size());
+ for (vcl_size_t j = 0; j < g_J[i].size(); ++j)
+ com_v[i](j) = con_v[block_ind[i] + j];
+ std::cout << com_v[i] << std::endl;
+ }
+}
+
+
+///**************************************** BLOCK FUNCTIONS ************************************//
+
+/** @brief Computes the total number of elements, the start indices and the matrix dimensions for all blocks
+ *
+ * @param g_I container of row indices
+ * @param g_J container of column indices
+ * @param sz total number of elements over all blocks
+ * @param blocks_ind start index of each block within the contiguous element array
+ * @param matrix_dims matrix dimensions (rows, columns) for each block
+ */
+inline void compute_blocks_size(std::vector<std::vector<unsigned int> > const & g_I,
+ std::vector<std::vector<unsigned int> > const & g_J,
+ unsigned int& sz,
+ std::vector<cl_uint> & blocks_ind,
+ std::vector<cl_uint> & matrix_dims)
+{
+ sz = 0;
+ for (vcl_size_t i = 0; i < g_I.size(); ++i)
+ {
+ sz += static_cast<unsigned int>(g_I[i].size()*g_J[i].size());
+ matrix_dims[2*i] = static_cast<cl_uint>(g_I[i].size());
+ matrix_dims[2*i + 1] = static_cast<cl_uint>(g_J[i].size());
+ blocks_ind[i+1] = blocks_ind[i] + static_cast<cl_uint>(g_I[i].size()*g_J[i].size());
+ }
+}
+
+/** @brief Computes size of particular container of index set
+ *
+ * @param inds container of index sets
+ * @param size output size
+ */
+template<typename SizeT>
+void get_size(std::vector<std::vector<SizeT> > const & inds,
+ SizeT & size)
+{
+ size = 0;
+ for (vcl_size_t i = 0; i < inds.size(); ++i)
+ size += static_cast<unsigned int>(inds[i].size());
+}
+
+/** @brief Initializes start indices of particular index set
+ *
+ * @param inds container of index sets
+ * @param start_inds output index set
+ */
+template<typename SizeT>
+void init_start_inds(std::vector<std::vector<SizeT> > const & inds,
+ std::vector<cl_uint>& start_inds)
+{
+ for (vcl_size_t i = 0; i < inds.size(); ++i)
+ start_inds[i+1] = start_inds[i] + static_cast<cl_uint>(inds[i].size());
+}
+
+
+//************************************* QR FUNCTIONS ***************************************//
+
+/** @brief Dot product of a particular column of matrix A with itself, starting at a certain index beg_ind
+ *
+ * @param A init matrix
+ * @param beg_ind starting index
+ * @param res result of dot product
+ */
+template<typename MatrixT, typename NumericT>
+void dot_prod(MatrixT const & A,
+ unsigned int beg_ind,
+ NumericT & res)
+{
+ res = NumericT(0);
+ for (vcl_size_t i = beg_ind; i < A.size1(); ++i)
+ res += A(i, beg_ind-1)*A(i, beg_ind-1);
+}
+
+/** @brief Dot prod of particular matrix column with arbitrary vector: A(:, col_ind)
+ *
+ * @param A init matrix
+ * @param v input vector
+ * @param col_ind starting column index
+ * @param start_ind starting index inside column
+ * @param res result of dot product
+ */
+template<typename MatrixT, typename VectorT, typename NumericT>
+void custom_inner_prod(MatrixT const & A,
+ VectorT const & v,
+ unsigned int col_ind,
+ unsigned int start_ind,
+ NumericT & res)
+{
+ res = static_cast<NumericT>(0);
+ for (unsigned int i = start_ind; i < static_cast<unsigned int>(A.size1()); ++i)
+ res += A(i, col_ind)*v(i);
+}
+
+/** @brief Copying part of matrix column
+ *
+ * @param A init matrix
+ * @param v output vector
+ * @param beg_ind start index for copying
+ */
+template<typename MatrixT, typename VectorT>
+void copy_vector(MatrixT const & A,
+ VectorT & v,
+ unsigned int beg_ind)
+{
+ for (unsigned int i = beg_ind; i < static_cast<unsigned int>(A.size1()); ++i)
+ v(i) = A( i, beg_ind-1);
+}
+
+
+//householder reflection c.f. Gene H. Golub, Charles F. Van Loan "Matrix Computations" 3rd edition p.210
+/** @brief Computation of Householder vector, householder reflection c.f. Gene H. Golub, Charles F. Van Loan "Matrix Computations" 3rd edition p.210
+ *
+ * @param A init matrix
+ * @param j start index for computations
+ * @param v output Householder vector
+ * @param b beta
+ */
+template<typename MatrixT, typename VectorT, typename NumericT>
+void householder_vector(MatrixT const & A,
+ unsigned int j,
+ VectorT & v,
+ NumericT & b)
+{
+ NumericT sg;
+
+ dot_prod(A, j+1, sg);
+ copy_vector(A, v, j+1);
+ NumericT mu;
+ v(j) = static_cast<NumericT>(1.0);
+ if (!sg)
+ b = 0;
+ else
+ {
+ mu = std::sqrt(A(j,j)*A(j, j) + sg);
+ if (A(j, j) <= 0)
+ v(j) = A(j, j) - mu;
+ else
+ v(j) = -sg/(A(j, j) + mu);
+
+ b = 2*(v(j)*v(j))/(sg + v(j)*v(j));
+ v = v/v(j);
+ }
+}
+
+
+/** @brief Inplace application of Householder vector to a matrix A
+ *
+ * @param A init matrix
+ * @param iter_cnt current iteration
+ * @param v Householder vector
+ * @param b beta
+ */
+template<typename MatrixT, typename VectorT, typename NumericT>
+void apply_householder_reflection(MatrixT & A,
+ unsigned int iter_cnt,
+ VectorT & v,
+ NumericT b)
+{
+ //update every column of matrix A
+ NumericT in_prod_res;
+
+ for (unsigned int i = iter_cnt; i < static_cast<unsigned int>(A.size2()); ++i)
+ {
+ //update each column in a fashion: ai = ai - b*v*(v'*ai)
+ custom_inner_prod(A, v, i, iter_cnt, in_prod_res);
+ for (unsigned int j = iter_cnt; j < static_cast<unsigned int>(A.size1()); ++j)
+ A(j, i) -= b*in_prod_res*v(j);
+ }
+}
+
+/** @brief Stores the Householder vector v in column (ind-1) of A, starting at row ind
+ *
+ * @param A input matrix
+ * @param ind row index from which storage starts; the target column is ind-1
+ * @param v vector that should be stored
+ */
+template<typename MatrixT, typename VectorT>
+void store_householder_vector(MatrixT & A,
+ unsigned int ind,
+ VectorT & v)
+{
+ for (unsigned int i = ind; i < static_cast<unsigned int>(A.size1()); ++i)
+ A(i, ind-1) = v(i);
+}
+
+
+//QR algorithm
+/** @brief Inplace QR factorization via Householder reflections c.f. Gene H. Golub, Charles F. Van Loan "Matrix Computations" 3rd edition p.224
+ *
+ * @param R input matrix
+ * @param b_v vector of betas
+ */
+template<typename MatrixT, typename VectorT>
+void single_qr(MatrixT & R, VectorT & b_v)
+{
+ typedef typename MatrixT::value_type NumericType;
+
+ if ((R.size1() > 0) && (R.size2() > 0))
+ {
+ VectorT v = static_cast<VectorT>(boost::numeric::ublas::zero_vector<NumericType>(R.size1()));
+ b_v = static_cast<VectorT>(boost::numeric::ublas::zero_vector<NumericType>(R.size2()));
+
+ for (unsigned int i = 0; i < static_cast<unsigned int>(R.size2()); ++i)
+ {
+ householder_vector(R, i, v, b_v[i]);
+ apply_householder_reflection(R, i, v, b_v[i]);
+ if (i < R.size1())
+ store_householder_vector(R, i+1, v);
+ }
+ }
+}
+
+//********************** HELP FUNCTIONS FOR GPU-based QR factorization *************************//
+
+/** @brief Getting max size of rows/columns from container of index set
+ *
+ * @param inds container of index set
+ * @param max_size max size that corresponds to that container
+ */
+template<typename SizeT>
+void get_max_block_size(std::vector<std::vector<SizeT> > const & inds,
+ SizeT & max_size)
+{
+ max_size = 0;
+ for (vcl_size_t i = 0; i < inds.size(); ++i)
+ if (inds[i].size() > max_size)
+ max_size = static_cast<SizeT>(inds[i].size());
+}
+
+/** @brief Dot product of column(A, ind) with v over rows ind..end, where the entry at row ind is implicitly taken as 1 (Householder storage)
+ *
+ * @param A input matrix
+ * @param v input vector
+ * @param ind index
+ * @param res result value
+ */
+template<typename MatrixT, typename VectorT, typename NumericT>
+void custom_dot_prod(MatrixT const & A,
+ VectorT const & v,
+ unsigned int ind,
+ NumericT & res)
+{
+ res = static_cast<NumericT>(0);
+ for (unsigned int j = ind; j < A.size1(); ++j)
+ {
+ if (j == ind)
+ res += v(j);
+ else
+ res += A(j, ind)*v(j);
+ }
+}
+
+/** @brief Applies Q^T to a vector y, where Q is stored implicitly via the Householder vectors in R and the vector of betas b_v
+ *
+ * @param R input matrix holding the Householder vectors in its lower part
+ * @param b_v vector of betas
+ * @param y input/output vector, overwritten with Q^T * y
+ */
+template<typename MatrixT, typename VectorT>
+void apply_q_trans_vec(MatrixT const & R,
+ VectorT const & b_v,
+ VectorT & y)
+{
+ typedef typename MatrixT::value_type NumericT;
+
+ NumericT inn_prod = NumericT(0);
+ for (vcl_size_t i = 0; i < R.size2(); ++i)
+ {
+ custom_dot_prod(R, y, static_cast<unsigned int>(i), inn_prod);
+ for (vcl_size_t j = i; j < R.size1(); ++j)
+ {
+ if (i == j)
+ y(j) -= b_v(i)*inn_prod;
+ else
+ y(j) -= b_v(i)*inn_prod*R(j,i);
+ }
+ }
+}
+
+/** @brief Multiplication Q^T * A, where Q is stored implicitly in the lower part of R together with the vector of betas b_v
+ *
+ * @param R input matrix
+ * @param b_v vector of betas
+ * @param A output matrix
+ */
+template<typename MatrixT, typename VectorT>
+void apply_q_trans_mat(MatrixT const & R,
+ VectorT const & b_v,
+ MatrixT & A)
+{
+ VectorT tmp_v;
+ for (vcl_size_t i = 0; i < A.size2(); ++i)
+ {
+ tmp_v = static_cast<VectorT>(column(A,i));
+ apply_q_trans_vec(R, b_v, tmp_v);
+ column(A,i) = tmp_v;
+ }
+}
+
+//parallel QR for GPU
+/** @brief Inplace QR factorization via Householder reflections c.f. Gene H. Golub, Charles F. Van Loan "Matrix Computations" 3rd edition p.224 performed on GPU
+ *
+ * @param g_I container of row indices
+ * @param g_J container of column indices
+ * @param g_A_I_J_vcl contiguous matrices, GPU memory is used
+ * @param g_bv_vcl contiguous beta vectors, GPU memory is used
+ * @param g_is_update container of indicators that show active blocks
+ * @param ctx Optional context in which the auxiliary data is created (one out of multiple OpenCL contexts, CUDA, host)
+ */
+template<typename NumericT>
+void block_qr(std::vector<std::vector<unsigned int> > & g_I,
+ std::vector<std::vector<unsigned int> > & g_J,
+ block_matrix & g_A_I_J_vcl,
+ block_vector & g_bv_vcl,
+ std::vector<cl_uint> & g_is_update,
+ viennacl::context ctx)
+{
+ viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
+
+ //typedef typename MatrixType::value_type ScalarType;
+ unsigned int bv_size = 0;
+ unsigned int v_size = 0;
+ //set up arguments for GPU
+ //find maximum size of rows/columns
+ unsigned int local_r_n = 0;
+ unsigned int local_c_n = 0;
+ //find max size for blocks
+ get_max_block_size(g_I, local_r_n);
+ get_max_block_size(g_J, local_c_n);
+ //get size
+ get_size(g_J, bv_size);
+ get_size(g_I, v_size);
+ //get start indices
+ std::vector<cl_uint> start_bv_inds(g_I.size() + 1, 0);
+ std::vector<cl_uint> start_v_inds(g_I.size() + 1, 0);
+ init_start_inds(g_J, start_bv_inds);
+ init_start_inds(g_I, start_v_inds);
+ //init arrays
+ std::vector<NumericT> b_v(bv_size, NumericT(0));
+ std::vector<NumericT> v(v_size, NumericT(0));
+ //call qr program
+ block_vector v_vcl;
+
+ g_bv_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+ static_cast<unsigned int>(sizeof(NumericT)*bv_size),
+ &(b_v[0]));
+
+ v_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+ static_cast<unsigned int>(sizeof(NumericT)*v_size),
+ &(v[0]));
+ //the same as j_start_inds
+ g_bv_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+ static_cast<unsigned int>(sizeof(cl_uint)*g_I.size()),
+ &(start_bv_inds[0]));
+
+ v_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+ static_cast<unsigned int>(sizeof(cl_uint)*g_I.size()),
+ &(start_v_inds[0]));
+ viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+ static_cast<unsigned int>(sizeof(cl_uint)*g_is_update.size()),
+ &(g_is_update[0]));
+ //local memory
+ //viennacl::ocl::enqueue(k(vcl_vec, size, viennacl::ocl::local_mem(sizeof(SCALARTYPE) * k.local_work_size()), temp));
+ viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx);
+ viennacl::ocl::kernel & qr_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_qr");
+
+ qr_kernel.local_work_size(0, local_c_n);
+ qr_kernel.global_work_size(0, local_c_n*256);
+ viennacl::ocl::enqueue(qr_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle1(), g_bv_vcl.handle(),
+ v_vcl.handle(), g_A_I_J_vcl.handle2(),
+ g_bv_vcl.handle1(), v_vcl.handle1(), g_is_update_vcl,
+ viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(NumericT)*(local_r_n*local_c_n))),
+ static_cast<cl_uint>(g_I.size())));
+
+}
+}
+}
+}
+}
+#endif
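To make the host-side QR path above concrete, here is a hedged sketch (illustration only, not part of the committed file): single_qr() overwrites its argument with R in the upper triangle and the Householder vectors below the diagonal, apply_q_trans_vec() then applies Q^T to the right-hand side, and an ordinary back substitution on R yields the least-squares solution. The helper name and the choice of uBLAS types are assumptions made for the example; note that qr.hpp also pulls in ViennaCL's OpenCL backend.

    #include <cstddef>
    #include <boost/numeric/ublas/matrix.hpp>
    #include <boost/numeric/ublas/vector.hpp>
    #include "viennacl/linalg/detail/spai/qr.hpp"

    typedef boost::numeric::ublas::matrix<double> DenseMatrix;
    typedef boost::numeric::ublas::vector<double> DenseVector;

    // Solves min ||A*x - y|| for a small dense block with full column rank.
    // A and y are taken by value on purpose: both are modified in place.
    DenseVector least_squares_sketch(DenseMatrix A, DenseVector y)
    {
      namespace spai = viennacl::linalg::detail::spai;

      DenseVector betas;                     // beta coefficients, filled by single_qr()
      spai::single_qr(A, betas);             // A now holds R and the Householder vectors
      spai::apply_q_trans_vec(A, betas, y);  // y <- Q^T * y

      // back substitution on the upper-triangular part of A: R * x = y
      DenseVector x(A.size2());
      for (long i = static_cast<long>(A.size2()) - 1; i >= 0; --i)
      {
        std::size_t row = static_cast<std::size_t>(i);
        double sum = y(row);
        for (std::size_t j = row + 1; j < A.size2(); ++j)
          sum -= A(row, j) * x(j);
        x(row) = sum / A(row, row);
      }
      return x;
    }
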
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/small_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/small_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/small_matrix.hpp
new file mode 100644
index 0000000..3cfdbb3
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/small_matrix.hpp
@@ -0,0 +1,113 @@
+#ifndef VIENNACL_LINALG_DETAIL_SPAI_SMALL_MATRIX_HPP
+#define VIENNACL_LINALG_DETAIL_SPAI_SMALL_MATRIX_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/spai/small_matrix.hpp
+ @brief Implementation of routines for small matrices (helper for SPAI). Experimental.
+
+ SPAI code contributed by Nikolay Lukash
+*/
+
+#include <utility>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <algorithm>
+#include <vector>
+#include <math.h>
+#include <map>
+#include "boost/numeric/ublas/vector.hpp"
+#include "boost/numeric/ublas/matrix.hpp"
+#include "boost/numeric/ublas/matrix_proxy.hpp"
+#include "boost/numeric/ublas/vector_proxy.hpp"
+#include "boost/numeric/ublas/storage.hpp"
+#include "boost/numeric/ublas/io.hpp"
+#include "boost/numeric/ublas/lu.hpp"
+#include "boost/numeric/ublas/triangular.hpp"
+#include "boost/numeric/ublas/matrix_expression.hpp"
+#include "boost/numeric/ublas/detail/matrix_assign.hpp"
+
+#include "viennacl/forwards.h"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace spai
+{
+
+//
+// Constructs an orthonormal sparse matrix M (with M^T M = Id). It is composed of elementary 2x2 rotation matrices with suitable renumbering.
+//
+template<typename MatrixT>
+void make_rotation_matrix(MatrixT & mat,
+ vcl_size_t new_size,
+ vcl_size_t off_diagonal_distance = 4)
+{
+ mat.resize(new_size, new_size, false);
+ mat.clear();
+
+ double val = 1.0 / std::sqrt(2.0);
+
+ for (vcl_size_t i=0; i<new_size; ++i)
+ mat(i,i) = val;
+
+ for (vcl_size_t i=off_diagonal_distance; i<new_size; ++i)
+ {
+ mat(i-off_diagonal_distance, i) = val;
+ mat(i, i-off_diagonal_distance) = -val;
+ }
+
+}
+
+
+//calculates the matrix determinant
+template<typename MatrixT>
+double determinant(boost::numeric::ublas::matrix_expression<MatrixT> const & mat_r)
+{
+ double det = 1.0;
+
+ MatrixT mLu(mat_r());
+ boost::numeric::ublas::permutation_matrix<vcl_size_t> pivots(mat_r().size1());
+
+ int is_singular = static_cast<int>(lu_factorize(mLu, pivots));
+
+ if (!is_singular)
+ {
+ for (vcl_size_t i=0; i < pivots.size(); ++i)
+ {
+ if (pivots(i) != i)
+ det *= -1.0;
+
+ det *= mLu(i,i);
+ }
+ }
+ else
+ det = 0.0;
+
+ return det;
+}
+
+}
+}
+}
+}
+#endif
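A short illustration of the determinant() helper above (not part of the committed file): it LU-factorizes a copy of the input with partial pivoting and accumulates the diagonal entries, flipping the sign for every row swap, so a 2x2 example reproduces the familiar a*d - b*c. The function name below is a placeholder.

    #include <boost/numeric/ublas/matrix.hpp>
    #include "viennacl/linalg/detail/spai/small_matrix.hpp"

    double determinant_2x2_sketch()
    {
      boost::numeric::ublas::matrix<double> m(2, 2);
      m(0, 0) = 3.0;  m(0, 1) = 1.0;
      m(1, 0) = 2.0;  m(1, 1) = 4.0;

      // expected value: 3*4 - 1*2 = 10
      return viennacl::linalg::detail::spai::determinant(m);
    }
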
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai-dynamic.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai-dynamic.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai-dynamic.hpp
new file mode 100644
index 0000000..bac0b9e
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai-dynamic.hpp
@@ -0,0 +1,687 @@
+#ifndef VIENNACL_LINALG_DETAIL_SPAI_SPAI_DYNAMIC_HPP
+#define VIENNACL_LINALG_DETAIL_SPAI_SPAI_DYNAMIC_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/spai/spai-dynamic.hpp
+ @brief Implementation of a dynamic SPAI. Provides the routines for automatic pattern updates. Experimental.
+
+ SPAI code contributed by Nikolay Lukash
+*/
+
+#include <utility>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <algorithm>
+#include <vector>
+#include <math.h>
+#include <map>
+//#include "block_matrix.hpp"
+//#include "block_vector.hpp"
+//#include "benchmark-utils.hpp"
+#include "boost/numeric/ublas/vector.hpp"
+#include "boost/numeric/ublas/matrix.hpp"
+#include "boost/numeric/ublas/matrix_proxy.hpp"
+#include "boost/numeric/ublas/vector_proxy.hpp"
+#include "boost/numeric/ublas/storage.hpp"
+#include "boost/numeric/ublas/io.hpp"
+#include "boost/numeric/ublas/lu.hpp"
+#include "boost/numeric/ublas/triangular.hpp"
+#include "boost/numeric/ublas/matrix_expression.hpp"
+// ViennaCL includes
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+#include "viennacl/linalg/matrix_operations.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/linalg/cg.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/ilu.hpp"
+#include "viennacl/ocl/backend.hpp"
+
+#include "viennacl/linalg/detail/spai/block_matrix.hpp"
+#include "viennacl/linalg/detail/spai/block_vector.hpp"
+#include "viennacl/linalg/detail/spai/qr.hpp"
+#include "viennacl/linalg/detail/spai/spai-static.hpp"
+#include "viennacl/linalg/detail/spai/spai.hpp"
+#include "viennacl/linalg/detail/spai/spai_tag.hpp"
+#include "viennacl/linalg/opencl/kernels/spai.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace spai
+{
+
+/** @brief Helper functor for comparing std::pair<> based on the second member. */
+struct CompareSecond
+{
+ template<typename T1, typename T2>
+ bool operator()(std::pair<T1, T2> const & left, std::pair<T1, T2> const & right)
+ {
+ return static_cast<double>(left.second) > static_cast<double>(right.second);
+ }
+};
+
+
+/** @brief Composition of the new matrix R that is used in solving the least-squares problem
+ *
+ * @param A matrix Q'*A(I, \\tilde J), where \\tilde J - set of new column indices
+ * @param R_n matrix A_Iu_J_u after QR factorization
+ * @param R previously composed matrix R
+ */
+template<typename MatrixT>
+void composeNewR(MatrixT const & A,
+ MatrixT const & R_n,
+ MatrixT & R)
+{
+ typedef typename MatrixT::value_type NumericType;
+
+ vcl_size_t row_n = R_n.size1() - (A.size1() - R.size2());
+ MatrixT C = boost::numeric::ublas::zero_matrix<NumericType>(R.size1() + row_n, R.size2() + A.size2());
+
+ //write original R to new Composite R
+ boost::numeric::ublas::project(C, boost::numeric::ublas::range(0,R.size1()), boost::numeric::ublas::range(0, R.size2())) += R;
+ //write upper part of Q'*A_I_\hatJ: all columns and a number of rows equal to R.size2()
+ boost::numeric::ublas::project(C, boost::numeric::ublas::range(0, R.size2()), boost::numeric::ublas::range(R.size2(),
+ R.size2() + A.size2())) +=
+ boost::numeric::ublas::project(A, boost::numeric::ublas::range(0, R.size2()), boost::numeric::ublas::range(0, A.size2()));
+
+ //adding decomposed(QR) block to Composite R
+ if (R_n.size1() > 0 && R_n.size2() > 0)
+ boost::numeric::ublas::project(C,
+ boost::numeric::ublas::range(R.size2(), R.size1() + row_n),
+ boost::numeric::ublas::range(R.size2(), R.size2() + A.size2())) += R_n;
+ R = C;
+}
+
+/** @brief Composition of new vector of coefficients beta from QR factorizations (necessary for Q recovery)
+ *
+ * @param v_n new vector from last QR factorization
+ * @param v composition of previous vectors from QR factorizations
+ */
+template<typename VectorT>
+void composeNewVector(VectorT const & v_n,
+ VectorT & v)
+{
+ typedef typename VectorT::value_type NumericType;
+
+ VectorT w = boost::numeric::ublas::zero_vector<NumericType>(v.size() + v_n.size());
+ boost::numeric::ublas::project(w, boost::numeric::ublas::range(0, v.size())) += v;
+ boost::numeric::ublas::project(w, boost::numeric::ublas::range(v.size(), v.size() + v_n.size())) += v_n;
+ v = w;
+}
+
+/** @brief Computation of Euclidean norm for sparse vector
+ *
+ * @param v initial sparse vector
+ * @param norm scalar that represents Euclidean norm
+ */
+template<typename SparseVectorT, typename NumericT>
+void sparse_norm_2(SparseVectorT const & v,
+ NumericT & norm)
+{
+ for (typename SparseVectorT::const_iterator vec_it = v.begin(); vec_it != v.end(); ++vec_it)
+ norm += (vec_it->second)*(vec_it->second);
+
+ norm = std::sqrt(norm);
+}
+
+/** @brief Dot product of two sparse vectors
+ *
+ * @param v1 initial sparse vector
+ * @param v2 initial sparse vector
+ * @param res_v scalar that represents dot product result
+ */
+template<typename SparseVectorT, typename NumericT>
+void sparse_inner_prod(SparseVectorT const & v1,
+ SparseVectorT const & v2,
+ NumericT & res_v)
+{
+ typename SparseVectorT::const_iterator v_it1 = v1.begin();
+ typename SparseVectorT::const_iterator v_it2 = v2.begin();
+
+ while ((v_it1 != v1.end())&&(v_it2 != v2.end()))
+ {
+ if (v_it1->first == v_it2->first)
+ {
+ res_v += (v_it1->second)*(v_it2->second);
+ ++v_it1;
+ ++v_it2;
+ }
+ else if (v_it1->first < v_it2->first)
+ ++v_it1;
+ else
+ ++v_it2;
+ }
+}
+
+/** @brief Building a new set of column indices J_u, cf. Kallischko dissertation p.31
+ *
+ * @param A_v_c vectorized column-wise initial matrix
+ * @param res residual vector
+ * @param J set of column indices
+ * @param J_u set of new column indices
+ * @param tag SPAI tag with parameters
+ */
+template<typename SparseVectorT, typename NumericT>
+bool buildAugmentedIndexSet(std::vector<SparseVectorT> const & A_v_c,
+ SparseVectorT const & res,
+ std::vector<unsigned int> & J,
+ std::vector<unsigned int> & J_u,
+ spai_tag const & tag)
+{
+ std::vector<std::pair<unsigned int, NumericT> > p;
+ vcl_size_t cur_size = 0;
+ NumericT inprod, norm2;
+
+ for (typename SparseVectorT::const_iterator res_it = res.begin(); res_it != res.end(); ++res_it)
+ {
+ if (!isInIndexSet(J, res_it->first) && (std::fabs(res_it->second) > tag.getResidualThreshold()))
+ {
+ inprod = norm2 = 0;
+ sparse_inner_prod(res, A_v_c[res_it->first], inprod);
+ sparse_norm_2(A_v_c[res_it->first], norm2);
+ p.push_back(std::pair<unsigned int, NumericT>(res_it->first, (inprod*inprod)/(norm2*norm2)));
+ }
+ }
+
+ std::sort(p.begin(), p.end(), CompareSecond());
+ while ((cur_size < J.size()) && (p.size() > 0))
+ {
+ J_u.push_back(p[0].first);
+ p.erase(p.begin());
+ cur_size++;
+ }
+ p.clear();
+ return (cur_size > 0);
+}
+
+/** @brief Builds a set of new row indices I_n that augments the current set of row indices I, cf. Kallischko dissertation p.32
+ *
+ * @param A_v_c vectorized column-wise initial matrix
+ * @param I set of previously determined row indices
+ * @param J_n set of new column indices
+ * @param I_n set of new row indices
+ */
+template<typename SparseVectorT>
+void buildNewRowSet(std::vector<SparseVectorT> const & A_v_c,
+ std::vector<unsigned int> const & I,
+ std::vector<unsigned int> const & J_n,
+ std::vector<unsigned int> & I_n)
+{
+ for (vcl_size_t i = 0; i < J_n.size(); ++i)
+ {
+ for (typename SparseVectorT::const_iterator col_it = A_v_c[J_n[i]].begin(); col_it!=A_v_c[J_n[i]].end(); ++col_it)
+ {
+ if (!isInIndexSet(I, col_it->first) && !isInIndexSet(I_n, col_it->first))
+ I_n.push_back(col_it->first);
+ }
+ }
+}
+
+/** @brief Composition of new block for QR factorization cf. Kallischko dissertation p.82, figure 4.7
+ *
+ * @param A_I_J previously composed block
+ * @param A_I_J_u matrix Q'*A(I, \\tilde J), where \\tilde J - set of new column indices
+ * @param A_I_u_J_u is composition of lower part A(I, \\tilde J) and A(\\tilde I, \\tilde J) - new block for QR decomposition
+ */
+template<typename MatrixT>
+void QRBlockComposition(MatrixT const & A_I_J,
+ MatrixT const & A_I_J_u,
+ MatrixT & A_I_u_J_u)
+{
+ typedef typename MatrixT::value_type NumericType;
+
+ vcl_size_t row_n1 = A_I_J_u.size1() - A_I_J.size2();
+ vcl_size_t row_n2 = A_I_u_J_u.size1();
+ vcl_size_t row_n = row_n1 + row_n2;
+ vcl_size_t col_n = A_I_J_u.size2();
+
+ MatrixT C = boost::numeric::ublas::zero_matrix<NumericType>(row_n, col_n);
+ boost::numeric::ublas::project(C,
+ boost::numeric::ublas::range(0, row_n1),
+ boost::numeric::ublas::range(0, col_n))
+ += boost::numeric::ublas::project(A_I_J_u,
+ boost::numeric::ublas::range(A_I_J.size2(), A_I_J_u.size1()),
+ boost::numeric::ublas::range(0, col_n));
+
+ boost::numeric::ublas::project(C,
+ boost::numeric::ublas::range(row_n1, row_n1 + row_n2),
+ boost::numeric::ublas::range(0, col_n)) += A_I_u_J_u;
+ A_I_u_J_u = C;
+}
+
+/** @brief CPU-based dynamic update for SPAI preconditioner
+ *
+ * @param A initial sparse matrix
+ * @param A_v_c vectorized column-wise initial matrix
+ * @param g_res container of residuals for all columns
+ * @param g_is_update container with indicators showing which blocks should be modified
+ * @param g_I container of row index sets for all columns
+ * @param g_J container of column index sets for all columns
+ * @param g_b_v container of vectors of beta for Q recovery (cf. Golub Van Loan "Matrix Computations", 3rd edition p.211)
+ * @param g_A_I_J container of block matrices from previous update
+ * @param tag SPAI configuration tag
+ */
+template<typename SparseMatrixT,
+ typename SparseVectorT,
+ typename DenseMatrixT,
+ typename VectorT>
+void block_update(SparseMatrixT const & A,
+ std::vector<SparseVectorT> const & A_v_c,
+ std::vector<SparseVectorT> & g_res,
+ std::vector<bool> & g_is_update,
+ std::vector<std::vector<unsigned int> >& g_I,
+ std::vector<std::vector<unsigned int> >& g_J,
+ std::vector<VectorT> & g_b_v,
+ std::vector<DenseMatrixT> & g_A_I_J,
+ spai_tag const & tag)
+{
+ typedef typename DenseMatrixT::value_type NumericType;
+
+
+ std::vector<std::vector<unsigned int> > g_J_u(g_J.size()); // set of new column indices
+ std::vector<std::vector<unsigned int> > g_I_u(g_J.size()); // set of new row indices
+ std::vector<DenseMatrixT> g_A_I_J_u(g_J.size()); // matrix A(I, \tilde J), cf. Kallischko p.31-32
+ std::vector<DenseMatrixT> g_A_I_u_J_u(g_J.size()); // matrix A(\tilde I, \tilde J), cf. Kallischko
+ std::vector<VectorT> g_b_v_u(g_J.size()); // new vector of beta coefficients from QR factorization
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i = 0; i < static_cast<long>(g_J.size()); ++i)
+ {
+ if (g_is_update[static_cast<vcl_size_t>(i)])
+ {
+ if (buildAugmentedIndexSet<SparseVectorT, NumericType>(A_v_c, g_res[static_cast<vcl_size_t>(i)], g_J[static_cast<vcl_size_t>(i)], g_J_u[static_cast<vcl_size_t>(i)], tag))
+ {
+ //initialize matrix A_I_\hatJ
+ initProjectSubMatrix(A, g_J_u[static_cast<vcl_size_t>(i)], g_I[static_cast<vcl_size_t>(i)], g_A_I_J_u[static_cast<vcl_size_t>(i)]);
+ //multiplication of Q'*A_I_\hatJ
+ apply_q_trans_mat(g_A_I_J[static_cast<vcl_size_t>(i)], g_b_v[static_cast<vcl_size_t>(i)], g_A_I_J_u[static_cast<vcl_size_t>(i)]);
+ //building new rows index set \hatI
+ buildNewRowSet(A_v_c, g_I[static_cast<vcl_size_t>(i)], g_J_u[static_cast<vcl_size_t>(i)], g_I_u[static_cast<vcl_size_t>(i)]);
+ initProjectSubMatrix(A, g_J_u[static_cast<vcl_size_t>(i)], g_I_u[static_cast<vcl_size_t>(i)], g_A_I_u_J_u[static_cast<vcl_size_t>(i)]);
+ //composition of block for new QR factorization
+ QRBlockComposition(g_A_I_J[static_cast<vcl_size_t>(i)], g_A_I_J_u[static_cast<vcl_size_t>(i)], g_A_I_u_J_u[static_cast<vcl_size_t>(i)]);
+ //QR factorization
+ single_qr(g_A_I_u_J_u[static_cast<vcl_size_t>(i)], g_b_v_u[static_cast<vcl_size_t>(i)]);
+ //composition of new R and new vector b_v
+ composeNewR(g_A_I_J_u[static_cast<vcl_size_t>(i)], g_A_I_u_J_u[static_cast<vcl_size_t>(i)], g_A_I_J[static_cast<vcl_size_t>(i)]);
+ composeNewVector(g_b_v_u[static_cast<vcl_size_t>(i)], g_b_v[static_cast<vcl_size_t>(i)]);
+ //composition of new sets: I and J
+ g_J[static_cast<vcl_size_t>(i)].insert(g_J[static_cast<vcl_size_t>(i)].end(), g_J_u[static_cast<vcl_size_t>(i)].begin(), g_J_u[static_cast<vcl_size_t>(i)].end());
+ g_I[static_cast<vcl_size_t>(i)].insert(g_I[static_cast<vcl_size_t>(i)].end(), g_I_u[static_cast<vcl_size_t>(i)].begin(), g_I_u[static_cast<vcl_size_t>(i)].end());
+ }
+ else
+ {
+ g_is_update[static_cast<vcl_size_t>(i)] = false;
+ }
+ }
+ }
+}
+
+
+/**************************************************** GPU SPAI Update ****************************************************************/
+
+
+//performs Q'*A(I, \tilde J) on GPU
+/** @brief Performs multiplication Q'*A(I, \\tilde J) on GPU
+ *
+ * @param g_J_u container of sets of new column indices
+ * @param g_I container of row indices
+ * @param g_A_I_J_vcl block matrix composed from previous blocks, they are blocks of R
+ * @param g_bv_vcl block of beta vectors
+ * @param g_A_I_J_u_vcl block of matrices A(I, \\tilde J)
+ * @param g_is_update indicators that show whether a certain block should be processed
+ * @param ctx Optional context in which the auxiliary data is created (one out of multiple OpenCL contexts, CUDA, host)
+ */
+template<typename NumericT>
+void block_q_multiplication(std::vector<std::vector<unsigned int> > const & g_J_u,
+ std::vector<std::vector<unsigned int> > const & g_I,
+ block_matrix & g_A_I_J_vcl,
+ block_vector & g_bv_vcl,
+ block_matrix & g_A_I_J_u_vcl,
+ std::vector<cl_uint> & g_is_update,
+ viennacl::context ctx)
+{
+ viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
+ unsigned int local_r_n = 0;
+ unsigned int local_c_n = 0;
+ unsigned int sz_blocks = 0;
+
+ get_max_block_size(g_I, local_r_n);
+ get_max_block_size(g_J_u, local_c_n);
+
+ //for debug
+ std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
+ std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
+ compute_blocks_size(g_I, g_J_u, sz_blocks, blocks_ind, matrix_dims);
+ //std::vector<ScalarType> con_A_I_J(sz_blocks, static_cast<ScalarType>(0));
+
+ viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+ static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
+ &(g_is_update[0]));
+ viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx);
+ viennacl::ocl::kernel& block_q_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_q_mult");
+
+ block_q_kernel.local_work_size(0, local_c_n);
+ block_q_kernel.global_work_size(0, 128*local_c_n);
+ viennacl::ocl::enqueue(block_q_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(),
+ g_bv_vcl.handle(),
+ g_bv_vcl.handle1(), g_A_I_J_vcl.handle1(), g_A_I_J_u_vcl.handle1(), g_is_update_vcl,
+ viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(NumericT)*(local_r_n*local_c_n))),
+ static_cast<cl_uint>(g_I.size())));
+}
+
+/** @brief Assembly of container of index row sets: I_q, row indices for new "QR block"
+ *
+ * @param g_I container of row indices
+ * @param g_J container of column indices
+ * @param g_I_u container of new row indices
+ * @param g_I_q container of row indices for new QR blocks
+ */
+template<typename SizeT>
+void assemble_qr_row_inds(std::vector<std::vector<SizeT> > const & g_I,
+ std::vector<std::vector<SizeT> > const & g_J,
+ std::vector<std::vector<SizeT> > const & g_I_u,
+ std::vector<std::vector<SizeT> > & g_I_q)
+{
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i = 0; i < static_cast<long>(g_I.size()); ++i)
+ {
+ for (vcl_size_t j = g_J[static_cast<vcl_size_t>(i)].size(); j < g_I[static_cast<vcl_size_t>(i)].size(); ++j)
+ g_I_q[static_cast<vcl_size_t>(i)].push_back(g_I[static_cast<vcl_size_t>(i)][j]);
+
+ for (vcl_size_t j = 0; j < g_I_u[static_cast<vcl_size_t>(i)].size(); ++j)
+ g_I_q[static_cast<vcl_size_t>(i)].push_back(g_I_u[static_cast<vcl_size_t>(i)][j]);
+ }
+}
+
+/** @brief Performs assembly for new QR block
+ *
+ * @param g_J container of column indices
+ * @param g_I container of row indices
+ * @param g_J_u container of new column indices
+ * @param g_I_u container of new row indices
+ * @param g_I_q container of row indices for new QR blocks
+ * @param g_A_I_J_u_vcl blocks of Q'*A(I, \\tilde J)
+ * @param matrix_dimensions array with matrix dimensions for all blocks
+ * @param g_A_I_u_J_u_vcl blocks A(\\tilde I, \\tilde J)
+ * @param g_is_update container with update indicators
+ * @param is_empty_block indicator whether all previous blocks A(\\tilde I, \\tilde J) are empty; if they are, a kernel with a smaller number of arguments is used
+ * @param ctx Optional context in which the matrix is created (one out of multiple OpenCL contexts, CUDA, host)
+*/
+template<typename NumericT>
+void assemble_qr_block(std::vector<std::vector<unsigned int> > const & g_J,
+ std::vector<std::vector<unsigned int> > const& g_I,
+ std::vector<std::vector<unsigned int> > const& g_J_u,
+ std::vector<std::vector<unsigned int> > const& g_I_u,
+ std::vector<std::vector<unsigned int> >& g_I_q,
+ block_matrix & g_A_I_J_u_vcl,
+ viennacl::ocl::handle<cl_mem> & matrix_dimensions,
+ block_matrix & g_A_I_u_J_u_vcl,
+ std::vector<cl_uint> & g_is_update,
+ bool is_empty_block,
+ viennacl::context ctx)
+{
+ viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
+
+ //std::vector<std::vector<unsigned int> > g_I_q(g_I.size());
+ assemble_qr_row_inds(g_I, g_J, g_I_u, g_I_q);
+ unsigned int sz_blocks;
+ std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
+ std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
+
+ compute_blocks_size(g_I_q, g_J_u, sz_blocks, blocks_ind, matrix_dims);
+
+ std::vector<NumericT> con_A_I_J_q(sz_blocks, static_cast<NumericT>(0));
+
+ block_matrix g_A_I_J_q_vcl;
+ //need to allocate memory for QR block
+ g_A_I_J_q_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+ static_cast<unsigned int>(sizeof(NumericT)*sz_blocks),
+ &(con_A_I_J_q[0]));
+ g_A_I_J_q_vcl.handle().context(opencl_ctx);
+
+ g_A_I_J_q_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+ static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size())),
+ &(matrix_dims[0]));
+ g_A_I_J_q_vcl.handle1().context(opencl_ctx);
+
+ g_A_I_J_q_vcl.handle2() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+ static_cast<unsigned int>(sizeof(cl_uint)*static_cast<unsigned int>(g_I.size() + 1)),
+ &(blocks_ind[0]));
+ g_A_I_J_q_vcl.handle2().context(opencl_ctx);
+
+ viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+ static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
+ &(g_is_update[0]));
+
+ viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx);
+ if (!is_empty_block)
+ {
+ viennacl::ocl::kernel& qr_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_qr_assembly");
+ qr_assembly_kernel.local_work_size(0, 1);
+ qr_assembly_kernel.global_work_size(0, 256);
+ viennacl::ocl::enqueue(qr_assembly_kernel(matrix_dimensions,
+ g_A_I_J_u_vcl.handle(),
+ g_A_I_J_u_vcl.handle2(),
+ g_A_I_J_u_vcl.handle1(),
+ g_A_I_u_J_u_vcl.handle(),
+ g_A_I_u_J_u_vcl.handle2(),
+ g_A_I_u_J_u_vcl.handle1(),
+ g_A_I_J_q_vcl.handle(),
+ g_A_I_J_q_vcl.handle2(),
+ g_A_I_J_q_vcl.handle1(),
+ g_is_update_vcl,
+ static_cast<unsigned int>(g_I.size())));
+ }
+ else
+ {
+ viennacl::ocl::kernel& qr_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_qr_assembly_1");
+ qr_assembly_kernel.local_work_size(0, 1);
+ qr_assembly_kernel.global_work_size(0, 256);
+ viennacl::ocl::enqueue(qr_assembly_kernel(matrix_dimensions, g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(),
+ g_A_I_J_u_vcl.handle1(),
+ g_A_I_J_q_vcl.handle(),
+ g_A_I_J_q_vcl.handle2(), g_A_I_J_q_vcl.handle1(),
+ g_is_update_vcl,
+ static_cast<unsigned int>(g_I.size())));
+ }
+ g_A_I_u_J_u_vcl.handle() = g_A_I_J_q_vcl.handle();
+ g_A_I_u_J_u_vcl.handle1() = g_A_I_J_q_vcl.handle1();
+ g_A_I_u_J_u_vcl.handle2() = g_A_I_J_q_vcl.handle2();
+}
+
+/** @brief Performs assembly for new R matrix on GPU
+ *
+ * @param g_I container of row indices
+ * @param g_J container of column indices
+ * @param g_A_I_J_vcl container of block matrices from previous update
+ * @param g_A_I_J_u_vcl container of block matrices Q'*A(I, \\tilde J)
+ * @param g_A_I_u_J_u_vcl container of block matrices QR factored on current iteration
+ * @param g_bv_vcl block of beta vectors from previous iteration
+ * @param g_bv_vcl_u block of updated beta vectors obtained from the most recent QR factorization
+ * @param g_is_update container with indicators showing which blocks should be modified
+ * @param ctx Optional context in which the auxiliary data is created (one out of multiple OpenCL contexts, CUDA, host)
+ */
+template<typename NumericT>
+void assemble_r(std::vector<std::vector<unsigned int> > & g_I,
+ std::vector<std::vector<unsigned int> > & g_J,
+ block_matrix & g_A_I_J_vcl,
+ block_matrix & g_A_I_J_u_vcl,
+ block_matrix & g_A_I_u_J_u_vcl,
+ block_vector & g_bv_vcl,
+ block_vector & g_bv_vcl_u,
+ std::vector<cl_uint> & g_is_update,
+ viennacl::context ctx)
+{
+ viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
+ std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
+ std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
+ std::vector<cl_uint> start_bv_r_inds(g_I.size() + 1, 0);
+ unsigned int sz_blocks, bv_size;
+
+ compute_blocks_size(g_I, g_J, sz_blocks, blocks_ind, matrix_dims);
+ get_size(g_J, bv_size);
+ init_start_inds(g_J, start_bv_r_inds);
+
+ std::vector<NumericT> con_A_I_J_r(sz_blocks, static_cast<NumericT>(0));
+ std::vector<NumericT> b_v_r(bv_size, static_cast<NumericT>(0));
+
+ block_matrix g_A_I_J_r_vcl;
+ block_vector g_bv_r_vcl;
+ g_A_I_J_r_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+ static_cast<unsigned int>(sizeof(NumericT)*sz_blocks),
+ &(con_A_I_J_r[0]));
+ g_A_I_J_r_vcl.handle().context(opencl_ctx);
+
+ g_A_I_J_r_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+ static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size())),
+ &(matrix_dims[0]));
+ g_A_I_J_r_vcl.handle1().context(opencl_ctx);
+
+ g_A_I_J_r_vcl.handle2() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+ static_cast<unsigned int>(sizeof(cl_uint)*static_cast<unsigned int>(g_I.size() + 1)),
+ &(blocks_ind[0]));
+ g_A_I_J_r_vcl.handle2().context(opencl_ctx);
+
+ g_bv_r_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+ static_cast<unsigned int>(sizeof(NumericT)*bv_size),
+ &(b_v_r[0]));
+ g_bv_r_vcl.handle().context(opencl_ctx);
+
+ g_bv_r_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+ static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
+ &(start_bv_r_inds[0]));
+ g_bv_r_vcl.handle().context(opencl_ctx);
+
+ viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+ static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
+ &(g_is_update[0]));
+ viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx);
+ viennacl::ocl::kernel& r_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_r_assembly");
+ r_assembly_kernel.local_work_size(0, 1);
+ r_assembly_kernel.global_work_size(0, 256);
+
+ viennacl::ocl::enqueue(r_assembly_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_A_I_J_vcl.handle1(),
+ g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(), g_A_I_J_u_vcl.handle1(),
+ g_A_I_u_J_u_vcl.handle(), g_A_I_u_J_u_vcl.handle2(), g_A_I_u_J_u_vcl.handle1(),
+ g_A_I_J_r_vcl.handle(), g_A_I_J_r_vcl.handle2(), g_A_I_J_r_vcl.handle1(),
+ g_is_update_vcl, static_cast<cl_uint>(g_I.size())));
+
+ viennacl::ocl::kernel & bv_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_bv_assembly");
+ bv_assembly_kernel.local_work_size(0, 1);
+ bv_assembly_kernel.global_work_size(0, 256);
+ viennacl::ocl::enqueue(bv_assembly_kernel(g_bv_vcl.handle(), g_bv_vcl.handle1(), g_A_I_J_vcl.handle1(), g_bv_vcl_u.handle(),
+ g_bv_vcl_u.handle1(), g_A_I_J_u_vcl.handle1(),
+ g_bv_r_vcl.handle(), g_bv_r_vcl.handle1(), g_A_I_J_r_vcl.handle1(), g_is_update_vcl,
+ static_cast<cl_uint>(g_I.size())));
+ g_bv_vcl.handle() = g_bv_r_vcl.handle();
+ g_bv_vcl.handle1() = g_bv_r_vcl.handle1();
+
+ g_A_I_J_vcl.handle() = g_A_I_J_r_vcl.handle();
+ g_A_I_J_vcl.handle2() = g_A_I_J_r_vcl.handle2();
+ g_A_I_J_vcl.handle1() = g_A_I_J_r_vcl.handle1();
+}
+
+/** @brief GPU-based block update
+ *
+ * @param A sparse matrix
+ * @param A_v_c vectorized column-wise initial matrix
+ * @param g_is_update container with indicators showing which blocks should be modified
+ * @param g_res container of residuals for all columns
+ * @param g_J container of column index sets for all columns
+ * @param g_I container of row index sets for all columns
+ * @param g_A_I_J_vcl container of block matrices from previous update
+ * @param g_bv_vcl block of beta vectors from previous iteration
+ * @param tag SPAI configuration tag
+ */
+template<typename NumericT, unsigned int AlignmentV, typename SparseVectorT>
+void block_update(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
+ std::vector<SparseVectorT> const & A_v_c,
+ std::vector<cl_uint> & g_is_update,
+ std::vector<SparseVectorT> & g_res,
+ std::vector<std::vector<unsigned int> > & g_J,
+ std::vector<std::vector<unsigned int> > & g_I,
+ block_matrix & g_A_I_J_vcl,
+ block_vector & g_bv_vcl,
+ spai_tag const & tag)
+{
+ viennacl::context ctx = viennacl::traits::context(A);
+ //updated index set for columns
+ std::vector<std::vector<unsigned int> > g_J_u(g_J.size());
+ //updated index set for rows
+ std::vector<std::vector<unsigned int> > g_I_u(g_J.size());
+ //mixed index set of old and updated indices for rows
+ std::vector<std::vector<unsigned int> > g_I_q(g_J.size());
+ //GPU memory for A_I_\hatJ
+ block_matrix g_A_I_J_u_vcl;
+ //GPU memory for A_\hatI_\hatJ
+ block_matrix g_A_I_u_J_u_vcl;
+ bool is_empty_block;
+ //GPU memory for new b_v
+ block_vector g_bv_u_vcl;
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i = 0; i < static_cast<long>(g_J.size()); ++i)
+ {
+ if (g_is_update[static_cast<vcl_size_t>(i)])
+ {
+ if (buildAugmentedIndexSet<SparseVectorT, NumericT>(A_v_c, g_res[static_cast<vcl_size_t>(i)], g_J[static_cast<vcl_size_t>(i)], g_J_u[static_cast<vcl_size_t>(i)], tag))
+ buildNewRowSet(A_v_c, g_I[static_cast<vcl_size_t>(i)], g_J_u[static_cast<vcl_size_t>(i)], g_I_u[static_cast<vcl_size_t>(i)]);
+ }
+ }
+ //assemble new A_I_J_u blocks on GPU and multiply them with Q'
+ block_assembly(A, g_J_u, g_I, g_A_I_J_u_vcl, g_is_update, is_empty_block);
+ //matrix A_I_J_u is now ready
+ block_q_multiplication<NumericT>(g_J_u, g_I, g_A_I_J_vcl, g_bv_vcl, g_A_I_J_u_vcl, g_is_update, ctx);
+ //assemble A_\hatI_\hatJ
+ block_assembly(A, g_J_u, g_I_u, g_A_I_u_J_u_vcl, g_is_update, is_empty_block);
+ assemble_qr_block<NumericT>(g_J, g_I, g_J_u, g_I_u, g_I_q, g_A_I_J_u_vcl, g_A_I_J_vcl.handle1(),
+ g_A_I_u_J_u_vcl, g_is_update, is_empty_block, ctx);
+
+ block_qr<NumericT>(g_I_q, g_J_u, g_A_I_u_J_u_vcl, g_bv_u_vcl, g_is_update, ctx);
+ //concatenation of new and old indices
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for
+#endif
+ for (long i = 0; i < static_cast<long>(g_J.size()); ++i)
+ {
+ g_J[static_cast<vcl_size_t>(i)].insert(g_J[static_cast<vcl_size_t>(i)].end(), g_J_u[static_cast<vcl_size_t>(i)].begin(), g_J_u[static_cast<vcl_size_t>(i)].end());
+ g_I[static_cast<vcl_size_t>(i)].insert(g_I[static_cast<vcl_size_t>(i)].end(), g_I_u[static_cast<vcl_size_t>(i)].begin(), g_I_u[static_cast<vcl_size_t>(i)].end());
+ }
+ assemble_r<NumericT>(g_I, g_J, g_A_I_J_vcl, g_A_I_J_u_vcl, g_A_I_u_J_u_vcl, g_bv_vcl, g_bv_u_vcl, g_is_update, ctx);
+}
+
+}
+}
+}
+}
+#endif
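The column-selection criterion in buildAugmentedIndexSet() above weights every candidate column a_j by (r^T a_j)^2 / ||a_j||^2, where r is the current residual. The following stand-alone sketch (illustration only, not part of the committed file) recomputes that weight with the sparse helpers; std::map is used as the sparse vector type since the helpers only require (index, value) iteration, and the function name is a placeholder.

    #include <map>
    #include "viennacl/linalg/detail/spai/spai-dynamic.hpp"

    typedef std::map<unsigned int, double> SparseVec;

    double candidate_weight_sketch(SparseVec const & residual, SparseVec const & column)
    {
      namespace spai = viennacl::linalg::detail::spai;

      double inprod = 0.0;   // r^T a_j, accumulated by sparse_inner_prod()
      double norm2  = 0.0;   // ||a_j||, accumulated and square-rooted by sparse_norm_2()
      spai::sparse_inner_prod(residual, column, inprod);
      spai::sparse_norm_2(column, norm2);

      return (inprod * inprod) / (norm2 * norm2);
    }
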
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai-static.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai-static.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai-static.hpp
new file mode 100644
index 0000000..0fd11146
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai-static.hpp
@@ -0,0 +1,192 @@
+#ifndef VIENNACL_LINALG_DETAIL_SPAI_SPAI_STATIC_HPP
+#define VIENNACL_LINALG_DETAIL_SPAI_SPAI_STATIC_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/spai/spai-static.hpp
+ @brief Implementation of a static SPAI. Experimental.
+
+ SPAI code contributed by Nikolay Lukash
+*/
+
+#include <utility>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <algorithm>
+#include <vector>
+#include <math.h>
+#include <map>
+//#include "spai-dynamic.hpp"
+#include "boost/numeric/ublas/vector.hpp"
+#include "boost/numeric/ublas/matrix.hpp"
+#include "boost/numeric/ublas/matrix_proxy.hpp"
+#include "boost/numeric/ublas/vector_proxy.hpp"
+#include "boost/numeric/ublas/storage.hpp"
+#include "boost/numeric/ublas/io.hpp"
+#include "boost/numeric/ublas/lu.hpp"
+#include "boost/numeric/ublas/triangular.hpp"
+#include "boost/numeric/ublas/matrix_expression.hpp"
+// ViennaCL includes
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+#include "viennacl/linalg/matrix_operations.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/linalg/cg.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+
+//#include "boost/numeric/ublas/detail/matrix_assign.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace spai
+{
+
+/** @brief Determines if element ind is in set {J}
+ *
+ * @param J current set
+ * @param ind current element
+ */
+template<typename SizeT>
+bool isInIndexSet(std::vector<SizeT> const & J, SizeT ind)
+{
+ return (std::find(J.begin(), J.end(), ind) != J.end());
+}
+
+
+
+/********************************* STATIC SPAI FUNCTIONS******************************************/
+
+/** @brief Projects solution of LS problem onto original column m
+ *
+ * @param m_in solution of LS
+ * @param J set of non-zero columns
+ * @param m original column of M
+ */
+template<typename VectorT, typename SparseVectorT>
+void fanOutVector(VectorT const & m_in, std::vector<unsigned int> const & J, SparseVectorT & m)
+{
+ unsigned int cnt = 0;
+ for (vcl_size_t i = 0; i < J.size(); ++i)
+ m[J[i]] = m_in(cnt++);
+}
+
+/** @brief Solves the linear system R*x = y by backward substitution
+ *
+ * @param R upper triangular matrix
+ * @param y right-hand side vector
+ * @param x solution vector
+ */
+template<typename MatrixT, typename VectorT>
+void backwardSolve(MatrixT const & R, VectorT const & y, VectorT & x)
+{
+ for (long i2 = static_cast<long>(R.size2())-1; i2 >= 0; i2--)
+ {
+ vcl_size_t i = static_cast<vcl_size_t>(i2);
+ x(i) = y(i);
+ for (vcl_size_t j = static_cast<vcl_size_t>(i)+1; j < R.size2(); ++j)
+ x(i) -= R(i,j)*x(j);
+
+ x(i) /= R(i,i);
+ }
+}
+
+/** @brief Projects the unit vector with index ind onto the index set I
+ *
+ * @param I index set of non-zero rows
+ * @param y result vector (1 at the position of ind, 0 elsewhere)
+ * @param ind index of the unit vector
+ */
+template<typename VectorT, typename NumericT>
+void projectI(std::vector<unsigned int> const & I, VectorT & y, unsigned int ind)
+{
+ for (vcl_size_t i = 0; i < I.size(); ++i)
+ {
+ //y.resize(y.size()+1);
+ if (I[i] == ind)
+ y(i) = NumericT(1.0);
+ else
+ y(i) = NumericT(0.0);
+ }
+}
+
+/** @brief Builds index set of projected columns for current column of preconditioner
+ *
+ * @param v current column of preconditioner
+ * @param J output - index set of non-zero columns
+ */
+template<typename SparseVectorT>
+void buildColumnIndexSet(SparseVectorT const & v, std::vector<unsigned int> & J)
+{
+ for (typename SparseVectorT::const_iterator vec_it = v.begin(); vec_it != v.end(); ++vec_it)
+ J.push_back(vec_it->first);
+
+ std::sort(J.begin(), J.end());
+}
+
+/** @brief Initializes the preconditioner with the sparsity pattern of A
+ *
+ * @param A input matrix
+ * @param M output matrix - initialized preconditioner
+ */
+template<typename SparseMatrixT>
+void initPreconditioner(SparseMatrixT const & A, SparseMatrixT & M)
+{
+ typedef typename SparseMatrixT::value_type NumericType;
+
+ M.resize(A.size1(), A.size2(), false);
+ for (typename SparseMatrixT::const_iterator1 row_it = A.begin1(); row_it!= A.end1(); ++row_it)
+ for (typename SparseMatrixT::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
+ M(col_it.index1(),col_it.index2()) = NumericType(1);
+}
+
+/** @brief Row projection for matrix A(:,J) -> A(I,J), building the index set of non-zero rows
+ *
+ * @param A_v_c input matrix, stored as a vector of sparse columns
+ * @param J index set of non-zero columns
+ * @param I output index set of non-zero rows
+ */
+template<typename SparseVectorT>
+void projectRows(std::vector<SparseVectorT> const & A_v_c,
+ std::vector<unsigned int> const & J,
+ std::vector<unsigned int> & I)
+{
+ for (vcl_size_t i = 0; i < J.size(); ++i)
+ {
+ for (typename SparseVectorT::const_iterator col_it = A_v_c[J[i]].begin(); col_it!=A_v_c[J[i]].end(); ++col_it)
+ {
+ if (!isInIndexSet(I, col_it->first))
+ I.push_back(col_it->first);
+ }
+ }
+ std::sort(I.begin(), I.end());
+}
+
+
+} //namespace spai
+} //namespace detail
+} //namespace linalg
+} //namespace viennacl
+
+#endif
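
For illustration, a minimal sketch of how the backward-substitution helper above might be exercised with Boost.uBLAS types (which this header already pulls in); the matrix and right-hand side values are made up for the example:

#include <iostream>

#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/vector.hpp>
#include <boost/numeric/ublas/io.hpp>

#include "viennacl/linalg/detail/spai/spai-static.hpp"

int main()
{
  namespace ublas = boost::numeric::ublas;

  // Upper triangular 3x3 system R*x = y (example values only).
  ublas::matrix<double> R = ublas::zero_matrix<double>(3, 3);
  R(0,0) = 2.0; R(0,1) = 1.0; R(0,2) = 1.0;
  R(1,1) = 3.0; R(1,2) = 1.0;
  R(2,2) = 4.0;

  ublas::vector<double> y(3), x(3);
  y(0) = 4.0; y(1) = 5.0; y(2) = 8.0;

  // Backward substitution as provided by this header.
  viennacl::linalg::detail::spai::backwardSolve(R, y, x);

  std::cout << x << std::endl;   // expected: [3](0.5,1,2)
  return 0;
}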
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/ilu.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/ilu.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/ilu.hpp
new file mode 100644
index 0000000..540ff82
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/ilu.hpp
@@ -0,0 +1,33 @@
+#ifndef VIENNACL_LINALG_ILU_HPP_
+#define VIENNACL_LINALG_ILU_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/ilu.hpp
+ @brief Implementations of incomplete factorization preconditioners. Convenience header file.
+*/
+
+#include "viennacl/linalg/detail/ilu/ilut.hpp"
+#include "viennacl/linalg/detail/ilu/ilu0.hpp"
+#include "viennacl/linalg/detail/ilu/block_ilu.hpp"
+#include "viennacl/linalg/detail/ilu/chow_patel_ilu.hpp"
+
+#endif
+
+
+
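
As a usage sketch for the convenience header above: an ILU0 preconditioner built from a compressed_matrix and handed to a preconditioned BiCGStab solve. The ilu0_precond, ilu0_tag, bicgstab_tag and solve names follow the usual ViennaCL interface; treat the exact setup as an assumption for illustration:

#include "viennacl/compressed_matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/ilu.hpp"
#include "viennacl/linalg/bicgstab.hpp"

// Assumes A and b have already been set up and filled with data.
void solve_with_ilu0(viennacl::compressed_matrix<double> const & A,
                     viennacl::vector<double> const & b,
                     viennacl::vector<double> & x)
{
  // ILU0 preconditioner built from the system matrix.
  viennacl::linalg::ilu0_precond< viennacl::compressed_matrix<double> >
      precond(A, viennacl::linalg::ilu0_tag());

  // Preconditioned BiCGStab solve.
  x = viennacl::linalg::solve(A, b, viennacl::linalg::bicgstab_tag(), precond);
}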
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/ilu_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/ilu_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/ilu_operations.hpp
new file mode 100644
index 0000000..febd347
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/ilu_operations.hpp
@@ -0,0 +1,334 @@
+#ifndef VIENNACL_LINALG_ILU_OPERATIONS_HPP_
+#define VIENNACL_LINALG_ILU_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/ilu_operations.hpp
+ @brief Implementations of specialized routines for the Chow-Patel parallel ILU preconditioner
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/range.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+#include "viennacl/linalg/host_based/ilu_operations.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+ #include "viennacl/linalg/opencl/ilu_operations.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+ #include "viennacl/linalg/cuda/ilu_operations.hpp"
+#endif
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief Extracts the lower triangular part L from A.
+ *
+ * Diagonal of L is stored explicitly in order to enable better code reuse.
+ *
+ */
+template<typename NumericT>
+void extract_L(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & L)
+{
+ switch (viennacl::traits::handle(A).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::extract_L(A, L);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::extract_L(A, L);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::extract_L(A, L);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L accordingly.
+ *
+ * Since A should not be modified (const-correctness), updates are in L.
+ *
+ */
+template<typename NumericT>
+void icc_scale(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & L)
+{
+ switch (viennacl::traits::handle(A).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::icc_scale(A, L);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::icc_scale(A, L);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::icc_scale(A, L);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ICC (cf. Algorithm 3 in paper, but for L rather than U)
+ *
+ * We use a fully synchronous (Jacobi-like) variant, because the asynchronous methods described in the paper are a nightmare to debug
+ * (and particularly troublesome when they fail only intermittently).
+ *
+ * @param L Factor L to be updated for the incomplete Cholesky factorization
+ * @param aij_L Lower triangular portion of the system matrix
+ */
+template<typename NumericT>
+void icc_chow_patel_sweep(compressed_matrix<NumericT> & L,
+ vector<NumericT> & aij_L)
+{
+ switch (viennacl::traits::handle(L).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::icc_chow_patel_sweep(L, aij_L);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::icc_chow_patel_sweep(L, aij_L);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::icc_chow_patel_sweep(L, aij_L);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+
+
+//////////////////////// ILU ////////////////////
+
+/** @brief Extracts the lower triangular part L and the upper triangular part U from A.
+ *
+ * Diagonals of L and U are stored explicitly in order to enable better code reuse.
+ *
+ */
+template<typename NumericT>
+void extract_LU(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & L,
+ compressed_matrix<NumericT> & U)
+{
+ switch (viennacl::traits::handle(A).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::extract_LU(A, L, U);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::extract_LU(A, L, U);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::extract_LU(A, L, U);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L and U accordingly.
+ *
+ * Since A should not be modified (const-correctness), updates are in L and U.
+ *
+ */
+template<typename NumericT>
+void ilu_scale(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & L,
+ compressed_matrix<NumericT> & U)
+{
+ switch (viennacl::traits::handle(A).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::ilu_scale(A, L, U);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::ilu_scale(A, L, U);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::ilu_scale(A, L, U);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+/** @brief Transposition B <- A^T, where the aij-vector is permuted in the same way as the value array in A when assigned to B
+ *
+ * @param A Input matrix to be transposed
+ * @param B Output matrix containing the transposed matrix
+ */
+template<typename NumericT>
+void ilu_transpose(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & B)
+{
+ viennacl::context orig_ctx = viennacl::traits::context(A);
+ viennacl::context cpu_ctx(viennacl::MAIN_MEMORY);
+ (void)orig_ctx;
+ (void)cpu_ctx;
+
+ viennacl::compressed_matrix<NumericT> A_host(0, 0, 0, cpu_ctx);
+ (void)A_host;
+
+ switch (viennacl::traits::handle(A).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::ilu_transpose(A, B);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ A_host = A;
+ B.switch_memory_context(cpu_ctx);
+ viennacl::linalg::host_based::ilu_transpose(A_host, B);
+ B.switch_memory_context(orig_ctx);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ A_host = A;
+ B.switch_memory_context(cpu_ctx);
+ viennacl::linalg::host_based::ilu_transpose(A_host, B);
+ B.switch_memory_context(orig_ctx);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+
+
+/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ILU (cf. Algorithm 2 in paper)
+ *
+ * We use a fully synchronous (Jacobi-like) variant, because the asynchronous methods described in the paper are a nightmare to debug
+ * (and particularly troublesome when they fail only intermittently).
+ *
+ * @param L Lower-triangular matrix L in the LU factorization
+ * @param aij_L Values of the lower-triangular part of A corresponding to the sparsity pattern of L
+ * @param U_trans Upper-triangular matrix U in CSC-storage, which is the same as U^trans in CSR-storage
+ * @param aij_U_trans Upper-triangular matrix from A in CSC-storage, which is the same as U^trans in CSR-storage
+ */
+template<typename NumericT>
+void ilu_chow_patel_sweep(compressed_matrix<NumericT> & L,
+ vector<NumericT> const & aij_L,
+ compressed_matrix<NumericT> & U_trans,
+ vector<NumericT> const & aij_U_trans)
+{
+ switch (viennacl::traits::handle(L).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::ilu_chow_patel_sweep(L, aij_L, U_trans, aij_U_trans);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::ilu_chow_patel_sweep(L, aij_L, U_trans, aij_U_trans);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::ilu_chow_patel_sweep(L, aij_L, U_trans, aij_U_trans);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+/** @brief Prepares the matrix R for the Neumann-series-based approximate triangular solves.
+ *
+ * The diagonal of R is extracted into diag_R.
+ *
+ */
+template<typename NumericT>
+void ilu_form_neumann_matrix(compressed_matrix<NumericT> & R,
+ vector<NumericT> & diag_R)
+{
+ switch (viennacl::traits::handle(R).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::ilu_form_neumann_matrix(R, diag_R);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::ilu_form_neumann_matrix(R, diag_R);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::ilu_form_neumann_matrix(R, diag_R);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
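
The routines in ilu_operations.hpp are low-level building blocks for the Chow-Patel ILU preconditioner. A rough sketch of how they might be composed follows; filling the aij_* value vectors (the entries of the scaled A matching the sparsity patterns of L and U^T) is glossed over here and is handled by the actual preconditioner in detail/ilu/chow_patel_ilu.hpp:

#include <cstddef>

#include "viennacl/compressed_matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/ilu_operations.hpp"

// Sketch of the Chow-Patel ILU setup phase built from the routines above.
template<typename NumericT>
void chow_patel_setup_sketch(viennacl::compressed_matrix<NumericT> const & A,
                             viennacl::compressed_matrix<NumericT> & L,
                             viennacl::compressed_matrix<NumericT> & U_trans,
                             std::size_t num_sweeps)
{
  viennacl::compressed_matrix<NumericT> U;

  // Split A into lower and upper triangular parts (diagonals stored explicitly).
  viennacl::linalg::extract_LU(A, L, U);

  // Symmetric scaling so that D*A*D has unit diagonal; L and U are updated in place.
  viennacl::linalg::ilu_scale(A, L, U);

  // The sweeps operate on U^T stored in CSR (i.e. U in CSC).
  viennacl::linalg::ilu_transpose(U, U_trans);

  // aij_L / aij_U_trans hold the entries of the scaled A matching the patterns of
  // L and U^T; populating them is omitted in this sketch.
  viennacl::vector<NumericT> aij_L(L.nnz());
  viennacl::vector<NumericT> aij_U_trans(U_trans.nnz());

  // A fixed number of synchronous (Jacobi-like) nonlinear relaxation sweeps.
  for (std::size_t i = 0; i < num_sweeps; ++i)
    viennacl::linalg::ilu_chow_patel_sweep(L, aij_L, U_trans, aij_U_trans);
}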
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/inner_prod.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/inner_prod.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/inner_prod.hpp
new file mode 100644
index 0000000..b31a82a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/inner_prod.hpp
@@ -0,0 +1,186 @@
+#ifndef VIENNACL_LINALG_INNER_PROD_HPP_
+#define VIENNACL_LINALG_INNER_PROD_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/inner_prod.hpp
+ @brief Generic interface for the computation of inner products. See viennacl/linalg/vector_operations.hpp for implementations.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/tag_of.hpp"
+#include "viennacl/meta/result_of.hpp"
+
+namespace viennacl
+{
+//
+// generic inner_prod function
+// uses tag dispatch to identify which algorithm
+// should be called
+//
+namespace linalg
+{
+
+#ifdef VIENNACL_WITH_ARMADILLO
+// ----------------------------------------------------
+// Armadillo
+//
+template<typename NumericT>
+NumericT inner_prod(arma::Col<NumericT> const& v1, arma::Col<NumericT> const& v2)
+{
+ return dot(v1, v2);
+}
+#endif
+
+#ifdef VIENNACL_WITH_EIGEN
+// ----------------------------------------------------
+// EIGEN
+//
+template<typename VectorT1, typename VectorT2>
+typename viennacl::enable_if< viennacl::is_eigen< typename viennacl::traits::tag_of< VectorT1 >::type >::value,
+ typename VectorT1::RealScalar>::type
+inner_prod(VectorT1 const & v1, VectorT2 const & v2)
+{
+ //std::cout << "eigen .. " << std::endl;
+ return v1.dot(v2);
+}
+#endif
+
+#ifdef VIENNACL_WITH_MTL4
+// ----------------------------------------------------
+// MTL4
+//
+template<typename VectorT1, typename VectorT2>
+typename viennacl::enable_if< viennacl::is_mtl4< typename viennacl::traits::tag_of< VectorT1 >::type >::value,
+ typename VectorT1::value_type>::type
+inner_prod(VectorT1 const & v1, VectorT2 const & v2)
+{
+ //std::cout << "mtl4 .. " << std::endl;
+ return mtl::dot(v1, v2);
+}
+#endif
+
+#ifdef VIENNACL_WITH_UBLAS
+// ----------------------------------------------------
+// UBLAS
+//
+template<typename VectorT1, typename VectorT2>
+typename viennacl::enable_if< viennacl::is_ublas< typename viennacl::traits::tag_of< VectorT1 >::type >::value,
+ typename VectorT1::value_type>::type
+inner_prod(VectorT1 const & v1, VectorT2 const & v2)
+{
+ //std::cout << "ublas .. " << std::endl;
+ return boost::numeric::ublas::inner_prod(v1, v2);
+}
+#endif
+
+// ----------------------------------------------------
+// STL
+//
+template<typename VectorT1, typename VectorT2>
+typename viennacl::enable_if< viennacl::is_stl< typename viennacl::traits::tag_of< VectorT1 >::type >::value,
+ typename VectorT1::value_type>::type
+inner_prod(VectorT1 const & v1, VectorT2 const & v2)
+{
+ assert(v1.size() == v2.size() && bool("Vector sizes mismatch"));
+ //std::cout << "stl .. " << std::endl;
+ typename VectorT1::value_type result = 0;
+ for (typename VectorT1::size_type i=0; i<v1.size(); ++i)
+ result += v1[i] * v2[i];
+
+ return result;
+}
+
+// ----------------------------------------------------
+// VIENNACL
+//
+template<typename NumericT>
+viennacl::scalar_expression< const vector_base<NumericT>, const vector_base<NumericT>, viennacl::op_inner_prod >
+inner_prod(vector_base<NumericT> const & vector1,
+ vector_base<NumericT> const & vector2)
+{
+ //std::cout << "viennacl .. " << std::endl;
+ return viennacl::scalar_expression< const vector_base<NumericT>,
+ const vector_base<NumericT>,
+ viennacl::op_inner_prod >(vector1, vector2);
+}
+
+
+// expression on lhs:
+template< typename LHS, typename RHS, typename OP, typename NumericT>
+viennacl::scalar_expression< const viennacl::vector_expression<LHS, RHS, OP>,
+ const vector_base<NumericT>,
+ viennacl::op_inner_prod >
+inner_prod(viennacl::vector_expression<LHS, RHS, OP> const & vector1,
+ vector_base<NumericT> const & vector2)
+{
+ //std::cout << "viennacl .. " << std::endl;
+ return viennacl::scalar_expression< const viennacl::vector_expression<LHS, RHS, OP>,
+ const vector_base<NumericT>,
+ viennacl::op_inner_prod >(vector1, vector2);
+}
+
+// expression on rhs:
+template<typename NumericT, typename LHS, typename RHS, typename OP>
+viennacl::scalar_expression< const vector_base<NumericT>,
+ const viennacl::vector_expression<LHS, RHS, OP>,
+ viennacl::op_inner_prod >
+inner_prod(vector_base<NumericT> const & vector1,
+ viennacl::vector_expression<LHS, RHS, OP> const & vector2)
+{
+ //std::cout << "viennacl .. " << std::endl;
+ return viennacl::scalar_expression< const vector_base<NumericT>,
+ const viennacl::vector_expression<LHS, RHS, OP>,
+ viennacl::op_inner_prod >(vector1, vector2);
+}
+
+// expression on lhs and rhs:
+template<typename LHS1, typename RHS1, typename OP1,
+ typename LHS2, typename RHS2, typename OP2>
+viennacl::scalar_expression< const viennacl::vector_expression<LHS1, RHS1, OP1>,
+ const viennacl::vector_expression<LHS2, RHS2, OP2>,
+ viennacl::op_inner_prod >
+inner_prod(viennacl::vector_expression<LHS1, RHS1, OP1> const & vector1,
+ viennacl::vector_expression<LHS2, RHS2, OP2> const & vector2)
+{
+ //std::cout << "viennacl .. " << std::endl;
+ return viennacl::scalar_expression< const viennacl::vector_expression<LHS1, RHS1, OP1>,
+ const viennacl::vector_expression<LHS2, RHS2, OP2>,
+ viennacl::op_inner_prod >(vector1, vector2);
+}
+
+
+// Multiple inner products:
+template<typename NumericT>
+viennacl::vector_expression< const vector_base<NumericT>, const vector_tuple<NumericT>, viennacl::op_inner_prod >
+inner_prod(vector_base<NumericT> const & x,
+ vector_tuple<NumericT> const & y_tuple)
+{
+ return viennacl::vector_expression< const vector_base<NumericT>,
+ const vector_tuple<NumericT>,
+ viennacl::op_inner_prod >(x, y_tuple);
+}
+
+
+} // end namespace linalg
+} // end namespace viennacl
+#endif
+
+
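
A minimal sketch of the generic inner_prod() interface declared above, mixing the STL overload (computed on the host) with the ViennaCL overload (which returns an expression that is evaluated on assignment); vector sizes and values are arbitrary:

#include <iostream>
#include <vector>

#include "viennacl/vector.hpp"
#include "viennacl/linalg/inner_prod.hpp"

int main()
{
  // STL overload: plain dot product evaluated on the host.
  std::vector<double> u(3, 1.0), w(3, 2.0);
  double host_result = viennacl::linalg::inner_prod(u, w);    // 6.0

  // ViennaCL overload: returns a scalar_expression, evaluated when assigned.
  viennacl::vector<double> x = viennacl::scalar_vector<double>(100, 1.0);
  viennacl::vector<double> y = viennacl::scalar_vector<double>(100, 0.5);
  double device_result = viennacl::linalg::inner_prod(x, y);  // 50.0

  std::cout << host_result << " " << device_result << std::endl;
  return 0;
}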
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/iterative_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/iterative_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/iterative_operations.hpp
new file mode 100644
index 0000000..78a813d
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/iterative_operations.hpp
@@ -0,0 +1,425 @@
+#ifndef VIENNACL_LINALG_ITERATIVE_OPERATIONS_HPP_
+#define VIENNACL_LINALG_ITERATIVE_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/iterative_operations.hpp
+ @brief Implementations of specialized routines for the iterative solvers.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/range.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+#include "viennacl/linalg/host_based/iterative_operations.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+ #include "viennacl/linalg/opencl/iterative_operations.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+ #include "viennacl/linalg/cuda/iterative_operations.hpp"
+#endif
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief Performs a joint vector update operation needed for an efficient pipelined CG algorithm.
+ *
+ * This routine computes for vectors 'result', 'p', 'r', 'Ap':
+ * result += alpha * p;
+ * r -= alpha * Ap;
+ * p = r + beta * p;
+ * and runs the parallel reduction stage for computing inner_prod(r,r)
+ */
+template<typename NumericT>
+void pipelined_cg_vector_update(vector_base<NumericT> & result,
+ NumericT alpha,
+ vector_base<NumericT> & p,
+ vector_base<NumericT> & r,
+ vector_base<NumericT> const & Ap,
+ NumericT beta,
+ vector_base<NumericT> & inner_prod_buffer)
+{
+ switch (viennacl::traits::handle(result).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::pipelined_cg_vector_update(result, alpha, p, r, Ap, beta, inner_prod_buffer);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::pipelined_cg_vector_update(result, alpha, p, r, Ap, beta, inner_prod_buffer);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::pipelined_cg_vector_update(result, alpha, p, r, Ap, beta, inner_prod_buffer);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+
+/** @brief Performs the fused matrix-vector product needed for an efficient pipelined CG algorithm.
+ *
+ * This routine computes for a matrix A and vectors 'p' and 'Ap':
+ * Ap = prod(A, p);
+ * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap)
+ */
+template<typename MatrixT, typename NumericT>
+void pipelined_cg_prod(MatrixT const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> & inner_prod_buffer)
+{
+ switch (viennacl::traits::handle(p).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::pipelined_cg_prod(A, p, Ap, inner_prod_buffer);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::pipelined_cg_prod(A, p, Ap, inner_prod_buffer);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::pipelined_cg_prod(A, p, Ap, inner_prod_buffer);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+////////////////////////////////////////////
+
+/** @brief Performs a joint vector update operation needed for an efficient pipelined BiCGStab algorithm.
+ *
+ * This routine computes for vectors 's', 'r', 'Ap':
+ * s = r - alpha * Ap
+ * with alpha obtained from a reduction step on the 0th and the 3rd out of 6 chunks in inner_prod_buffer
+ * and runs the parallel reduction stage for computing inner_prod(s,s)
+ */
+template<typename NumericT>
+void pipelined_bicgstab_update_s(vector_base<NumericT> & s,
+ vector_base<NumericT> & r,
+ vector_base<NumericT> const & Ap,
+ vector_base<NumericT> & inner_prod_buffer,
+ vcl_size_t buffer_chunk_size,
+ vcl_size_t buffer_chunk_offset)
+{
+ switch (viennacl::traits::handle(s).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::pipelined_bicgstab_update_s(s, r, Ap, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::pipelined_bicgstab_update_s(s, r, Ap, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::pipelined_bicgstab_update_s(s, r, Ap, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+/** @brief Performs a joint vector update operation needed for an efficient pipelined BiCGStab algorithm.
+ *
+ * x_{j+1} = x_j + alpha * p_j + omega * s_j
+ * r_{j+1} = s_j - omega * t_j
+ * p_{j+1} = r_{j+1} + beta * (p_j - omega * q_j)
+ * and computes the first reduction stage of r_dot_r0 = <r_{j+1}, r_0^*> for use in the next iteration
+ */
+template<typename NumericT>
+void pipelined_bicgstab_vector_update(vector_base<NumericT> & result, NumericT alpha, vector_base<NumericT> & p, NumericT omega, vector_base<NumericT> const & s,
+ vector_base<NumericT> & residual, vector_base<NumericT> const & As,
+ NumericT beta, vector_base<NumericT> const & Ap,
+ vector_base<NumericT> const & r0star,
+ vector_base<NumericT> & inner_prod_buffer,
+ vcl_size_t buffer_chunk_size)
+{
+ switch (viennacl::traits::handle(s).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::pipelined_bicgstab_vector_update(result, alpha, p, omega, s, residual, As, beta, Ap, r0star, inner_prod_buffer, buffer_chunk_size);
+ break;
+ #ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::pipelined_bicgstab_vector_update(result, alpha, p, omega, s, residual, As, beta, Ap, r0star, inner_prod_buffer, buffer_chunk_size);
+ break;
+ #endif
+ #ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::pipelined_bicgstab_vector_update(result, alpha, p, omega, s, residual, As, beta, Ap, r0star, inner_prod_buffer, buffer_chunk_size);
+ break;
+ #endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+
+/** @brief Performs the fused matrix-vector product needed for an efficient pipelined BiCGStab algorithm.
+ *
+ * This routine computes for a matrix A and vectors 'p' and 'Ap':
+ * Ap = prod(A, p);
+ * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap)
+ */
+template<typename MatrixT, typename NumericT>
+void pipelined_bicgstab_prod(MatrixT const & A,
+ vector_base<NumericT> const & p,
+ vector_base<NumericT> & Ap,
+ vector_base<NumericT> const & r0star,
+ vector_base<NumericT> & inner_prod_buffer,
+ vcl_size_t buffer_chunk_size,
+ vcl_size_t buffer_chunk_offset)
+{
+ switch (viennacl::traits::handle(p).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::pipelined_bicgstab_prod(A, p, Ap, r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::pipelined_bicgstab_prod(A, p, Ap, r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::pipelined_bicgstab_prod(A, p, Ap, r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+////////////////////////////////////////////
+
+/** @brief Performs a vector normalization needed for an efficient pipelined GMRES algorithm.
+ *
+ * This routine computes for vectors 'r', 'v_k':
+ * Second reduction step for ||v_k||
+ * v_k /= ||v_k||
+ * First reduction step for <r, v_k>
+ */
+template <typename T>
+void pipelined_gmres_normalize_vk(vector_base<T> & v_k,
+ vector_base<T> const & residual,
+ vector_base<T> & R_buffer,
+ vcl_size_t offset_in_R,
+ vector_base<T> const & inner_prod_buffer,
+ vector_base<T> & r_dot_vk_buffer,
+ vcl_size_t buffer_chunk_size,
+ vcl_size_t buffer_chunk_offset)
+{
+ switch (viennacl::traits::handle(v_k).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::pipelined_gmres_normalize_vk(v_k, residual, R_buffer, offset_in_R, inner_prod_buffer, r_dot_vk_buffer, buffer_chunk_size, buffer_chunk_offset);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::pipelined_gmres_normalize_vk(v_k, residual, R_buffer, offset_in_R, inner_prod_buffer, r_dot_vk_buffer, buffer_chunk_size, buffer_chunk_offset);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::pipelined_gmres_normalize_vk(v_k, residual, R_buffer, offset_in_R, inner_prod_buffer, r_dot_vk_buffer, buffer_chunk_size, buffer_chunk_offset);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+
+
+/** @brief Computes the first reduction stage for multiple inner products <v_i, v_k>, i=0..k-1
+ *
+ * All vectors v_i are stored column-major in the array 'device_krylov_basis', where each vector has an actual length 'v_k_size', but might be padded to have 'v_k_internal_size'
+ */
+template <typename T>
+void pipelined_gmres_gram_schmidt_stage1(vector_base<T> const & device_krylov_basis,
+ vcl_size_t v_k_size,
+ vcl_size_t v_k_internal_size,
+ vcl_size_t k,
+ vector_base<T> & vi_in_vk_buffer,
+ vcl_size_t buffer_chunk_size)
+{
+ switch (viennacl::traits::handle(device_krylov_basis).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::pipelined_gmres_gram_schmidt_stage1(device_krylov_basis, v_k_size, v_k_internal_size, k, vi_in_vk_buffer, buffer_chunk_size);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::pipelined_gmres_gram_schmidt_stage1(device_krylov_basis, v_k_size, v_k_internal_size, k, vi_in_vk_buffer, buffer_chunk_size);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::pipelined_gmres_gram_schmidt_stage1(device_krylov_basis, v_k_size, v_k_internal_size, k, vi_in_vk_buffer, buffer_chunk_size);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+
+/** @brief Computes the second reduction stage for multiple inner products <v_i, v_k>, i=0..k-1, then updates v_k -= <v_i, v_k> v_i and computes the first reduction stage for ||v_k||
+ *
+ * All vectors v_i are stored column-major in the array 'device_krylov_basis', where each vector has an actual length 'v_k_size', but might be padded to have 'v_k_internal_size'
+ */
+template <typename T>
+void pipelined_gmres_gram_schmidt_stage2(vector_base<T> & device_krylov_basis,
+ vcl_size_t v_k_size,
+ vcl_size_t v_k_internal_size,
+ vcl_size_t k,
+ vector_base<T> const & vi_in_vk_buffer,
+ vector_base<T> & R_buffer,
+ vcl_size_t krylov_dim,
+ vector_base<T> & inner_prod_buffer,
+ vcl_size_t buffer_chunk_size)
+{
+ switch (viennacl::traits::handle(device_krylov_basis).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::pipelined_gmres_gram_schmidt_stage2(device_krylov_basis, v_k_size, v_k_internal_size, k, vi_in_vk_buffer, R_buffer, krylov_dim, inner_prod_buffer, buffer_chunk_size);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::pipelined_gmres_gram_schmidt_stage2(device_krylov_basis, v_k_size, v_k_internal_size, k, vi_in_vk_buffer, R_buffer, krylov_dim, inner_prod_buffer, buffer_chunk_size);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::pipelined_gmres_gram_schmidt_stage2(device_krylov_basis, v_k_size, v_k_internal_size, k, vi_in_vk_buffer, R_buffer, krylov_dim, inner_prod_buffer, buffer_chunk_size);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+
+/** @brief Computes x += eta_0 r + sum_{i=1}^{k-1} eta_i v_{i-1} */
+template <typename T>
+void pipelined_gmres_update_result(vector_base<T> & result,
+ vector_base<T> const & residual,
+ vector_base<T> const & krylov_basis,
+ vcl_size_t v_k_size,
+ vcl_size_t v_k_internal_size,
+ vector_base<T> const & coefficients,
+ vcl_size_t k)
+{
+ switch (viennacl::traits::handle(result).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::pipelined_gmres_update_result(result, residual, krylov_basis, v_k_size, v_k_internal_size, coefficients, k);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::pipelined_gmres_update_result(result, residual, krylov_basis, v_k_size, v_k_internal_size, coefficients, k);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::pipelined_gmres_update_result(result, residual, krylov_basis, v_k_size, v_k_internal_size, coefficients, k);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+/** @brief Performs the fused matrix-vector product needed for an efficient pipelined GMRES algorithm.
+ *
+ * This routine computes for a matrix A and vectors 'p' and 'Ap':
+ * Ap = prod(A, p);
+ * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap)
+ */
+template <typename MatrixType, typename T>
+void pipelined_gmres_prod(MatrixType const & A,
+ vector_base<T> const & p,
+ vector_base<T> & Ap,
+ vector_base<T> & inner_prod_buffer)
+{
+ switch (viennacl::traits::handle(p).get_active_handle_id())
+ {
+ case viennacl::MAIN_MEMORY:
+ viennacl::linalg::host_based::pipelined_gmres_prod(A, p, Ap, inner_prod_buffer);
+ break;
+#ifdef VIENNACL_WITH_OPENCL
+ case viennacl::OPENCL_MEMORY:
+ viennacl::linalg::opencl::pipelined_gmres_prod(A, p, Ap, inner_prod_buffer);
+ break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+ case viennacl::CUDA_MEMORY:
+ viennacl::linalg::cuda::pipelined_gmres_prod(A, p, Ap, inner_prod_buffer);
+ break;
+#endif
+ case viennacl::MEMORY_NOT_INITIALIZED:
+ throw memory_exception("not initialised!");
+ default:
+ throw memory_exception("not implemented");
+ }
+}
+
+
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
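
The fused kernels above are easiest to read in terms of their unfused equivalents. As a reference sketch, this is what a single pipelined_cg_vector_update() call accomplishes, written with ordinary ViennaCL vector operations; the real routine fuses these into one kernel and also folds the partial reduction for inner_prod(r, r) into inner_prod_buffer:

#include "viennacl/vector.hpp"
#include "viennacl/linalg/inner_prod.hpp"

// Unfused reference version of the update performed by pipelined_cg_vector_update().
template<typename NumericT>
NumericT cg_vector_update_reference(viennacl::vector<NumericT> & result,
                                    NumericT alpha,
                                    viennacl::vector<NumericT> & p,
                                    viennacl::vector<NumericT> & r,
                                    viennacl::vector<NumericT> const & Ap,
                                    NumericT beta)
{
  result += alpha * p;     // x_{j+1} = x_j + alpha * p_j
  r      -= alpha * Ap;    // r_{j+1} = r_j - alpha * A * p_j
  p       = r + beta * p;  // p_{j+1} = r_{j+1} + beta * p_j
  NumericT rr = viennacl::linalg::inner_prod(r, r);  // partial sums of this live in inner_prod_buffer
  return rr;
}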
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/jacobi_precond.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/jacobi_precond.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/jacobi_precond.hpp
new file mode 100644
index 0000000..0b16964
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/jacobi_precond.hpp
@@ -0,0 +1,141 @@
+#ifndef VIENNACL_LINALG_JACOBI_PRECOND_HPP_
+#define VIENNACL_LINALG_JACOBI_PRECOND_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/jacobi_precond.hpp
+ @brief Implementation of a simple Jacobi preconditioner
+*/
+
+#include <vector>
+#include <cmath>
+#include "viennacl/forwards.h"
+#include "viennacl/vector.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+#include "viennacl/linalg/row_scaling.hpp"
+
+#include <map>
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief A tag for the Jacobi preconditioner
+*/
+class jacobi_tag {};
+
+
+/** @brief Jacobi preconditioner class, can be supplied to solve()-routines. Generic version for non-ViennaCL matrices.
+*/
+template<typename MatrixT,
+ bool is_viennacl = detail::row_scaling_for_viennacl<MatrixT>::value >
+class jacobi_precond
+{
+ typedef typename MatrixT::value_type NumericType;
+
+ public:
+ jacobi_precond(MatrixT const & mat, jacobi_tag const &) : diag_A_(viennacl::traits::size1(mat))
+ {
+ init(mat);
+ }
+
+ void init(MatrixT const & mat)
+ {
+ diag_A_.resize(viennacl::traits::size1(mat)); //resize without preserving values
+
+ for (typename MatrixT::const_iterator1 row_it = mat.begin1();
+ row_it != mat.end1();
+ ++row_it)
+ {
+ bool diag_found = false;
+ for (typename MatrixT::const_iterator2 col_it = row_it.begin();
+ col_it != row_it.end();
+ ++col_it)
+ {
+ if (col_it.index1() == col_it.index2())
+ {
+ diag_A_[col_it.index1()] = *col_it;
+ diag_found = true;
+ }
+ }
+ if (!diag_found)
+ throw zero_on_diagonal_exception("ViennaCL: Zero in diagonal encountered while setting up Jacobi preconditioner!");
+ }
+ }
+
+
+ /** @brief Applies the preconditioner by dividing each entry of vec by the corresponding diagonal entry of the system matrix. Typically vec holds the residual r = b - Ax. */
+ template<typename VectorT>
+ void apply(VectorT & vec) const
+ {
+ assert(viennacl::traits::size(diag_A_) == viennacl::traits::size(vec) && bool("Size mismatch"));
+ for (vcl_size_t i=0; i<diag_A_.size(); ++i)
+ vec[i] /= diag_A_[i];
+ }
+
+ private:
+ std::vector<NumericType> diag_A_;
+};
+
+
+/** @brief Jacobi preconditioner class, can be supplied to solve()-routines.
+*
+* Specialization for ViennaCL matrix types (e.g. compressed_matrix)
+*/
+template<typename MatrixT>
+class jacobi_precond<MatrixT, true>
+{
+ typedef typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type NumericType;
+
+ public:
+ jacobi_precond(MatrixT const & mat, jacobi_tag const &) : diag_A_(mat.size1(), viennacl::traits::context(mat))
+ {
+ init(mat);
+ }
+
+
+ void init(MatrixT const & mat)
+ {
+ detail::row_info(mat, diag_A_, detail::SPARSE_ROW_DIAGONAL);
+ }
+
+
+ template<unsigned int AlignmentV>
+ void apply(viennacl::vector<NumericType, AlignmentV> & vec) const
+ {
+ assert(viennacl::traits::size(diag_A_) == viennacl::traits::size(vec) && bool("Size mismatch"));
+ vec = element_div(vec, diag_A_);
+ }
+
+ private:
+ viennacl::vector<NumericType> diag_A_;
+};
+
+}
+}
+
+
+
+
+#endif
+
+
+
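
A usage sketch for the Jacobi preconditioner above, paired with a conjugate gradient solve; the cg_tag parameters are placeholder tolerances and the solve call follows the usual ViennaCL solver interface:

#include "viennacl/compressed_matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/jacobi_precond.hpp"
#include "viennacl/linalg/cg.hpp"

// Assumes A (symmetric positive definite) and b have already been filled.
viennacl::vector<double> solve_with_jacobi(viennacl::compressed_matrix<double> const & A,
                                           viennacl::vector<double> const & b)
{
  // Diagonal (Jacobi) preconditioner: stores diag(A) and applies element-wise division.
  viennacl::linalg::jacobi_precond< viennacl::compressed_matrix<double> >
      precond(A, viennacl::linalg::jacobi_tag());

  // Preconditioned conjugate gradient solve (tolerance and iteration cap are placeholders).
  return viennacl::linalg::solve(A, b, viennacl::linalg::cg_tag(1e-8, 300), precond);
}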
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/lanczos.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/lanczos.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/lanczos.hpp
new file mode 100644
index 0000000..ffac471
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/lanczos.hpp
@@ -0,0 +1,515 @@
+#ifndef VIENNACL_LINALG_LANCZOS_HPP_
+#define VIENNACL_LINALG_LANCZOS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/lanczos.hpp
+* @brief Generic interface for the Lanczos algorithm.
+*
+* Contributed by Guenther Mader and Astrid Rupp.
+*/
+
+#include <cmath>
+#include <vector>
+#include "viennacl/vector.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/io/matrix_market.hpp"
+#include "viennacl/linalg/bisect.hpp"
+#include "viennacl/tools/random.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief A tag for the Lanczos algorithm.
+*/
+class lanczos_tag
+{
+public:
+
+ enum
+ {
+ partial_reorthogonalization = 0,
+ full_reorthogonalization,
+ no_reorthogonalization
+ };
+
+ /** @brief The constructor
+ *
+ * @param factor Exponent of epsilon - tolerance for batches of reorthogonalization
+ * @param numeig Number of eigenvalues to be returned
+ * @param met Method for the Lanczos algorithm: 0 for partial reorthogonalization, 1 for full reorthogonalization, and 2 for Lanczos without reorthogonalization
+ * @param krylov Maximum size of the Krylov space
+ */
+
+ lanczos_tag(double factor = 0.75,
+ vcl_size_t numeig = 10,
+ int met = 0,
+ vcl_size_t krylov = 100) : factor_(factor), num_eigenvalues_(numeig), method_(met), krylov_size_(krylov) {}
+
+ /** @brief Sets the number of eigenvalues */
+ void num_eigenvalues(vcl_size_t numeig){ num_eigenvalues_ = numeig; }
+
+ /** @brief Returns the number of eigenvalues */
+ vcl_size_t num_eigenvalues() const { return num_eigenvalues_; }
+
+ /** @brief Sets the exponent of epsilon. Values between 0.6 and 0.9 usually give best results. */
+ void factor(double fct) { factor_ = fct; }
+
+ /** @brief Returns the exponent */
+ double factor() const { return factor_; }
+
+ /** @brief Sets the size of the Krylov space. Must be larger than the number of eigenvalues to compute. */
+ void krylov_size(vcl_size_t max) { krylov_size_ = max; }
+
+ /** @brief Returns the size of the Krylov space */
+ vcl_size_t krylov_size() const { return krylov_size_; }
+
+ /** @brief Sets the reorthogonalization method */
+ void method(int met){ method_ = met; }
+
+ /** @brief Returns the reorthogonalization method */
+ int method() const { return method_; }
+
+
+private:
+ double factor_;
+ vcl_size_t num_eigenvalues_;
+ int method_; // see enum defined above for possible values
+ vcl_size_t krylov_size_;
+};
+
+
+namespace detail
+{
+ /** @brief Inverse iteration for finding an eigenvector of a given eigenvalue.
+ *
+ * beta[0] is ignored and kept only for consistent indexing.
+ */
+ template<typename NumericT>
+ void inverse_iteration(std::vector<NumericT> const & alphas, std::vector<NumericT> const & betas,
+ NumericT & eigenvalue, std::vector<NumericT> & eigenvector)
+ {
+ std::vector<NumericT> alpha_sweeped = alphas;
+ for (vcl_size_t i=0; i<alpha_sweeped.size(); ++i)
+ alpha_sweeped[i] -= eigenvalue;
+ for (vcl_size_t row=1; row < alpha_sweeped.size(); ++row)
+ alpha_sweeped[row] -= betas[row] * betas[row] / alpha_sweeped[row-1];
+
+ // starting guess: ignore last equation
+ eigenvector[alphas.size() - 1] = 1.0;
+
+ for (vcl_size_t iter=0; iter<1; ++iter)
+ {
+ // solve first n-1 equations (A - \lambda I) y = -beta[n]
+ eigenvector[alphas.size() - 1] /= alpha_sweeped[alphas.size() - 1];
+ for (vcl_size_t row2=1; row2 < alphas.size(); ++row2)
+ {
+ vcl_size_t row = alphas.size() - row2 - 1;
+ eigenvector[row] -= eigenvector[row+1] * betas[row+1];
+ eigenvector[row] /= alpha_sweeped[row];
+ }
+
+ // normalize eigenvector:
+ NumericT norm_vector = 0;
+ for (vcl_size_t i=0; i<eigenvector.size(); ++i)
+ norm_vector += eigenvector[i] * eigenvector[i];
+ norm_vector = std::sqrt(norm_vector);
+ for (vcl_size_t i=0; i<eigenvector.size(); ++i)
+ eigenvector[i] /= norm_vector;
+ }
+
+ //eigenvalue = (alphas[0] * eigenvector[0] + betas[1] * eigenvector[1]) / eigenvector[0];
+ }
+
+ /**
+ * @brief Implementation of the Lanczos PRO algorithm (partial reorthogonalization)
+ *
+ * @param A The system matrix
+ * @param r Random start vector
+ * @param eigenvectors_A Dense matrix holding the eigenvectors of A (one eigenvector per column)
+ * @param size Size of the Krylov space
+ * @param tag lanczos_tag with several options for the algorithm
+ * @param compute_eigenvectors Boolean flag. If true, eigenvectors are computed. Otherwise the routine returns after calculating eigenvalues.
+ * @return Returns the eigenvalues (number of eigenvalues equals the size of the Krylov space)
+ */
+
+ template<typename MatrixT, typename DenseMatrixT, typename NumericT>
+ std::vector<NumericT>
+ lanczosPRO (MatrixT const& A, vector_base<NumericT> & r, DenseMatrixT & eigenvectors_A, vcl_size_t size, lanczos_tag const & tag, bool compute_eigenvectors)
+ {
+ // generation of some random numbers, used for lanczos PRO algorithm
+ viennacl::tools::normal_random_numbers<NumericT> get_N;
+
+ std::vector<vcl_size_t> l_bound(size/2), u_bound(size/2);
+ vcl_size_t n = r.size();
+ std::vector<NumericT> w(size), w_old(size); //w_k, w_{k-1}
+
+ NumericT inner_rt;
+ std::vector<NumericT> alphas, betas;
+ viennacl::matrix<NumericT, viennacl::column_major> Q(n, size); //column-major matrix holding the Krylov basis vectors
+
+ bool second_step = false;
+ NumericT eps = std::numeric_limits<NumericT>::epsilon();
+ NumericT squ_eps = std::sqrt(eps);
+ NumericT eta = std::exp(std::log(eps) * tag.factor());
+
+ NumericT beta = viennacl::linalg::norm_2(r);
+
+ r /= beta;
+
+ viennacl::vector_base<NumericT> q_0(Q.handle(), Q.size1(), 0, 1);
+ q_0 = r;
+
+ viennacl::vector<NumericT> u = viennacl::linalg::prod(A, r);
+ NumericT alpha = viennacl::linalg::inner_prod(u, r);
+ alphas.push_back(alpha);
+ w[0] = 1;
+ betas.push_back(beta);
+
+ vcl_size_t batches = 0;
+ for (vcl_size_t i = 1; i < size; i++) // Main loop for setting up the Krylov space
+ {
+ viennacl::vector_base<NumericT> q_iminus1(Q.handle(), Q.size1(), (i-1) * Q.internal_size1(), 1);
+ r = u - alpha * q_iminus1;
+ beta = viennacl::linalg::norm_2(r);
+
+ betas.push_back(beta);
+ r = r / beta;
+
+ //
+ // Update recurrence relation for estimating orthogonality loss
+ //
+ w_old = w;
+ w[0] = (betas[1] * w_old[1] + (alphas[0] - alpha) * w_old[0] - betas[i - 1] * w_old[0]) / beta + eps * 0.3 * get_N() * (betas[1] + beta);
+ for (vcl_size_t j = 1; j < i - 1; j++)
+ w[j] = (betas[j + 1] * w_old[j + 1] + (alphas[j] - alpha) * w_old[j] + betas[j] * w_old[j - 1] - betas[i - 1] * w_old[j]) / beta + eps * 0.3 * get_N() * (betas[j + 1] + beta);
+ w[i-1] = 0.6 * eps * NumericT(n) * get_N() * betas[1] / beta;
+
+ //
+ // Check whether there has been a need for reorthogonalization detected in the previous iteration.
+ // If so, run the reorthogonalization for each batch
+ //
+ if (second_step)
+ {
+ for (vcl_size_t j = 0; j < batches; j++)
+ {
+ for (vcl_size_t k = l_bound[j] + 1; k < u_bound[j] - 1; k++)
+ {
+ viennacl::vector_base<NumericT> q_k(Q.handle(), Q.size1(), k * Q.internal_size1(), 1);
+ inner_rt = viennacl::linalg::inner_prod(r, q_k);
+ r = r - inner_rt * q_k;
+ w[k] = 1.5 * eps * get_N();
+ }
+ }
+ NumericT temp = viennacl::linalg::norm_2(r);
+ r = r / temp;
+ beta = beta * temp;
+ second_step = false;
+ }
+ batches = 0;
+
+ //
+ // Check for semiorthogonality
+ //
+ for (vcl_size_t j = 0; j < i; j++)
+ {
+ if (std::fabs(w[j]) >= squ_eps) // tentative loss of orthonormality, hence reorthonomalize
+ {
+ viennacl::vector_base<NumericT> q_j(Q.handle(), Q.size1(), j * Q.internal_size1(), 1);
+ inner_rt = viennacl::linalg::inner_prod(r, q_j);
+ r = r - inner_rt * q_j;
+ w[j] = 1.5 * eps * get_N();
+ vcl_size_t k = j - 1;
+
+ // orthogonalization with respect to earlier basis vectors
+ while (std::fabs(w[k]) > eta)
+ {
+ viennacl::vector_base<NumericT> q_k(Q.handle(), Q.size1(), k * Q.internal_size1(), 1);
+ inner_rt = viennacl::linalg::inner_prod(r, q_k);
+ r = r - inner_rt * q_k;
+ w[k] = 1.5 * eps * get_N();
+ if (k == 0) break;
+ k--;
+ }
+ l_bound[batches] = k;
+
+ // orthogonalization with respect to later basis vectors
+ k = j + 1;
+ while (k < i && std::fabs(w[k]) > eta)
+ {
+ viennacl::vector_base<NumericT> q_k(Q.handle(), Q.size1(), k * Q.internal_size1(), 1);
+ inner_rt = viennacl::linalg::inner_prod(r, q_k);
+ r = r - inner_rt * q_k;
+ w[k] = 1.5 * eps * get_N();
+ k++;
+ }
+ u_bound[batches] = k - 1;
+ batches++;
+
+ j = k-1; // go to end of current batch
+ }
+ }
+
+ //
+ // Normalize basis vector and reorthogonalize as needed
+ //
+ if (batches > 0)
+ {
+ NumericT temp = viennacl::linalg::norm_2(r);
+ r = r / temp;
+ beta = beta * temp;
+ second_step = true;
+ }
+
+ // store Krylov vector in Q:
+ viennacl::vector_base<NumericT> q_i(Q.handle(), Q.size1(), i * Q.internal_size1(), 1);
+ q_i = r;
+
+ //
+ // determine and store alpha = <r, u> with u = A q_i - beta q_{i-1}
+ //
+ u = viennacl::linalg::prod(A, r);
+ u += (-beta) * q_iminus1;
+ alpha = viennacl::linalg::inner_prod(u, r);
+ alphas.push_back(alpha);
+ }
+
+ //
+ // Step 2: Compute eigenvalues of tridiagonal matrix obtained during Lanczos iterations:
+ //
+ std::vector<NumericT> eigenvalues = bisect(alphas, betas);
+
+ //
+ // Step 3: Compute eigenvectors via inverse iteration. Does not update eigenvalues, so only approximate by nature.
+ //
+ if (compute_eigenvectors)
+ {
+ std::vector<NumericT> eigenvector_tridiag(alphas.size());
+ for (std::size_t i=0; i < tag.num_eigenvalues(); ++i)
+ {
+ // compute eigenvector of tridiagonal matrix via inverse:
+ inverse_iteration(alphas, betas, eigenvalues[eigenvalues.size() - i - 1], eigenvector_tridiag);
+
+ // eigenvector w of full matrix A. Given as w = Q * u, where u is the eigenvector of the tridiagonal matrix
+ viennacl::vector<NumericT> eigenvector_u(eigenvector_tridiag.size());
+ viennacl::copy(eigenvector_tridiag, eigenvector_u);
+
+ viennacl::vector_base<NumericT> eigenvector_A(eigenvectors_A.handle(),
+ eigenvectors_A.size1(),
+ eigenvectors_A.row_major() ? i : i * eigenvectors_A.internal_size1(),
+ eigenvectors_A.row_major() ? eigenvectors_A.internal_size2() : 1);
+ eigenvector_A = viennacl::linalg::prod(project(Q,
+ range(0, Q.size1()),
+ range(0, eigenvector_u.size())),
+ eigenvector_u);
+ }
+ }
+
+ return eigenvalues;
+ }
+
+
+ /**
+ * @brief Implementation of the Lanczos FRO algorithm
+ *
+ * @param A The system matrix
+ * @param r Random start vector
+ * @param eigenvectors_A A dense matrix in which the eigenvectors of A will be stored. Both row- and column-major matrices are supported.
+ * @param krylov_dim Size of the Krylov space
+ * @param tag The Lanczos tag holding tolerances, etc.
+ * @param compute_eigenvectors Boolean flag. If true, eigenvectors are computed. Otherwise the routine returns after calculating eigenvalues.
+ * @return Returns the eigenvalues (number of eigenvalues equals the size of the Krylov space)
+ */
+ template< typename MatrixT, typename DenseMatrixT, typename NumericT>
+ std::vector<NumericT>
+ lanczos(MatrixT const& A, vector_base<NumericT> & r, DenseMatrixT & eigenvectors_A, vcl_size_t krylov_dim, lanczos_tag const & tag, bool compute_eigenvectors)
+ {
+ std::vector<NumericT> alphas, betas;
+ viennacl::vector<NumericT> Aq(r.size());
+ viennacl::matrix<NumericT, viennacl::column_major> Q(r.size(), krylov_dim + 1); // Krylov basis (each Krylov vector is one column)
+
+ NumericT norm_r = norm_2(r);
+ NumericT beta = norm_r;
+ r /= norm_r;
+
+ // first Krylov vector:
+ viennacl::vector_base<NumericT> q0(Q.handle(), Q.size1(), 0, 1);
+ q0 = r;
+
+ //
+ // Step 1: Run Lanczos' method to obtain tridiagonal matrix
+ //
+ for (vcl_size_t i = 0; i < krylov_dim; i++)
+ {
+ betas.push_back(beta);
+ // last available vector from Krylov basis:
+ viennacl::vector_base<NumericT> q_i(Q.handle(), Q.size1(), i * Q.internal_size1(), 1);
+
+ // Lanczos algorithm:
+ // - Compute A * q:
+ Aq = viennacl::linalg::prod(A, q_i);
+
+ // - Form Aq <- Aq - <Aq, q_i> * q_i - beta * q_{i-1}, where beta is ||q_i|| before normalization in previous iteration
+ NumericT alpha = viennacl::linalg::inner_prod(Aq, q_i);
+ Aq -= alpha * q_i;
+
+ if (i > 0)
+ {
+ viennacl::vector_base<NumericT> q_iminus1(Q.handle(), Q.size1(), (i-1) * Q.internal_size1(), 1);
+ Aq -= beta * q_iminus1;
+
+ // Extra measures for improved numerical stability?
+ if (tag.method() == lanczos_tag::full_reorthogonalization)
+ {
+ // Gram-Schmidt (re-)orthogonalization:
+ // TODO: Reuse fast (pipelined) routines from GMRES or GEMV
+ for (vcl_size_t j = 0; j < i; j++)
+ {
+ viennacl::vector_base<NumericT> q_j(Q.handle(), Q.size1(), j * Q.internal_size1(), 1);
+ NumericT inner_rq = viennacl::linalg::inner_prod(Aq, q_j);
+ Aq -= inner_rq * q_j;
+ }
+ }
+ }
+
+ // normalize Aq and add to Krylov basis at column i+1 in Q:
+ beta = viennacl::linalg::norm_2(Aq);
+ viennacl::vector_base<NumericT> q_iplus1(Q.handle(), Q.size1(), (i+1) * Q.internal_size1(), 1);
+ q_iplus1 = Aq / beta;
+
+ alphas.push_back(alpha);
+ }
+
+ //
+ // Step 2: Compute eigenvalues of tridiagonal matrix obtained during Lanczos iterations:
+ //
+ std::vector<NumericT> eigenvalues = bisect(alphas, betas);
+
+ //
+ // Step 3: Compute eigenvectors via inverse iteration. Does not update eigenvalues, so only approximate by nature.
+ //
+ if (compute_eigenvectors)
+ {
+ std::vector<NumericT> eigenvector_tridiag(alphas.size());
+ for (std::size_t i=0; i < tag.num_eigenvalues(); ++i)
+ {
+ // compute eigenvector of tridiagonal matrix via inverse:
+ inverse_iteration(alphas, betas, eigenvalues[eigenvalues.size() - i - 1], eigenvector_tridiag);
+
+ // eigenvector w of full matrix A. Given as w = Q * u, where u is the eigenvector of the tridiagonal matrix
+ viennacl::vector<NumericT> eigenvector_u(eigenvector_tridiag.size());
+ viennacl::copy(eigenvector_tridiag, eigenvector_u);
+
+ viennacl::vector_base<NumericT> eigenvector_A(eigenvectors_A.handle(),
+ eigenvectors_A.size1(),
+ eigenvectors_A.row_major() ? i : i * eigenvectors_A.internal_size1(),
+ eigenvectors_A.row_major() ? eigenvectors_A.internal_size2() : 1);
+ eigenvector_A = viennacl::linalg::prod(project(Q,
+ range(0, Q.size1()),
+ range(0, eigenvector_u.size())),
+ eigenvector_u);
+ }
+ }
+
+ return eigenvalues;
+ }
+
+} // end namespace detail
+
+/**
+* @brief Implementation of the calculation of eigenvalues using the Lanczos method (with and without full reorthogonalization).
+*
+* Lanczos with partial reorthogonalization is implemented separately.
+*
+* @param matrix The system matrix
+* @param eigenvectors_A A dense matrix in which the eigenvectors of A will be stored. Both row- and column-major matrices are supported.
+* @param tag Tag with several options for the Lanczos algorithm
+* @param compute_eigenvectors Boolean flag. If true, eigenvectors are computed. Otherwise the routine returns after calculating eigenvalues.
+* @return Returns the n largest eigenvalues (n defined in the lanczos_tag)
+*/
+template<typename MatrixT, typename DenseMatrixT>
+std::vector< typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type >
+eig(MatrixT const & matrix, DenseMatrixT & eigenvectors_A, lanczos_tag const & tag, bool compute_eigenvectors = true)
+{
+ typedef typename viennacl::result_of::value_type<MatrixT>::type NumericType;
+ typedef typename viennacl::result_of::cpu_value_type<NumericType>::type CPU_NumericType;
+ typedef typename viennacl::result_of::vector_for_matrix<MatrixT>::type VectorT;
+
+ viennacl::tools::uniform_random_numbers<CPU_NumericType> random_gen;
+
+ std::vector<CPU_NumericType> eigenvalues;
+ vcl_size_t matrix_size = matrix.size1();
+ VectorT r(matrix_size);
+ std::vector<CPU_NumericType> s(matrix_size);
+
+ for (vcl_size_t i=0; i<s.size(); ++i)
+ s[i] = CPU_NumericType(0.5) + random_gen();
+
+ detail::copy_vec_to_vec(s,r);
+
+ vcl_size_t size_krylov = (matrix_size < tag.krylov_size()) ? matrix_size
+ : tag.krylov_size();
+
+ switch (tag.method())
+ {
+ case lanczos_tag::partial_reorthogonalization:
+ eigenvalues = detail::lanczosPRO(matrix, r, eigenvectors_A, size_krylov, tag, compute_eigenvectors);
+ break;
+ case lanczos_tag::full_reorthogonalization:
+ case lanczos_tag::no_reorthogonalization:
+ eigenvalues = detail::lanczos(matrix, r, eigenvectors_A, size_krylov, tag, compute_eigenvectors);
+ break;
+ }
+
+ std::vector<CPU_NumericType> largest_eigenvalues;
+
+ for (vcl_size_t i = 1; i<=tag.num_eigenvalues(); i++)
+ largest_eigenvalues.push_back(eigenvalues[size_krylov-i]);
+
+
+ return largest_eigenvalues;
+}
+
+
+/**
+* @brief Implementation of the calculation of eigenvalues using the Lanczos method (with and without full reorthogonalization).
+*
+* Lanczos with partial reorthogonalization is implemented separately.
+*
+* @param matrix The system matrix
+* @param tag Tag with several options for the Lanczos algorithm
+* @return Returns the n largest eigenvalues (n defined in the lanczos_tag)
+*/
+template<typename MatrixT>
+std::vector< typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type >
+eig(MatrixT const & matrix, lanczos_tag const & tag)
+{
+ typedef typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type NumericType;
+
+ viennacl::matrix<NumericType> eigenvectors(matrix.size1(), tag.num_eigenvalues());
+ return eig(matrix, eigenvectors, tag, false);
+}
+
+} // end namespace linalg
+} // end namespace viennacl
+#endif
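
For reference, a minimal usage sketch of the eig() interface defined above, not part of the committed header. It assumes a symmetric viennacl::compressed_matrix<double> has already been assembled, that the header is included as viennacl/linalg/lanczos.hpp, and that lanczos_tag is default-constructible with sensible defaults; all sizes are illustrative.

  #include <vector>
  #include "viennacl/compressed_matrix.hpp"
  #include "viennacl/matrix.hpp"
  #include "viennacl/linalg/lanczos.hpp"   // assumed include path for the eig()/lanczos_tag interface above

  void lanczos_eig_sketch()
  {
    viennacl::compressed_matrix<double> A(1000, 1000);
    // ... assemble a symmetric sparse matrix A here ...

    viennacl::linalg::lanczos_tag tag;   // assumed default settings (number of eigenvalues, Krylov size, method)

    // Largest eigenvalues only (convenience overload above):
    std::vector<double> eigenvalues = viennacl::linalg::eig(A, tag);

    // Eigenvalues plus eigenvectors, one eigenvector per column:
    viennacl::matrix<double> eigenvectors(A.size1(), tag.num_eigenvalues());
    eigenvalues = viennacl::linalg::eig(A, eigenvectors, tag);
  }
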
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/lu.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/lu.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/lu.hpp
new file mode 100644
index 0000000..0bdd037
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/lu.hpp
@@ -0,0 +1,227 @@
+#ifndef VIENNACL_LINALG_LU_HPP
+#define VIENNACL_LINALG_LU_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/lu.hpp
+ @brief Implementations of LU factorization for row-major and column-major dense matrices.
+*/
+
+#include <algorithm> //for std::min
+
+#include "viennacl/matrix.hpp"
+#include "viennacl/matrix_proxy.hpp"
+
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+/** @brief LU factorization of a row-major dense matrix.
+*
+* @param A The system matrix, where the LU matrices are directly written to. The implicit unit diagonal of L is not written.
+*/
+template<typename NumericT>
+void lu_factorize(matrix<NumericT, viennacl::row_major> & A)
+{
+ typedef matrix<NumericT, viennacl::row_major> MatrixType;
+
+ vcl_size_t max_block_size = 32;
+ vcl_size_t num_blocks = (A.size2() - 1) / max_block_size + 1;
+ std::vector<NumericT> temp_buffer(A.internal_size2() * max_block_size);
+
+ // Iterate over panels
+ for (vcl_size_t panel_id = 0; panel_id < num_blocks; ++panel_id)
+ {
+ vcl_size_t row_start = panel_id * max_block_size;
+ vcl_size_t current_block_size = std::min<vcl_size_t>(A.size1() - row_start, max_block_size);
+
+ viennacl::range block_range(row_start, row_start + current_block_size);
+ viennacl::range remainder_range(row_start + current_block_size, A.size1());
+
+ //
+ // Perform LU factorization on panel:
+ //
+
+
+ // Read from matrix to buffer:
+ viennacl::backend::memory_read(A.handle(),
+ sizeof(NumericT) * row_start * A.internal_size2(),
+ sizeof(NumericT) * current_block_size * A.internal_size2(),
+ &(temp_buffer[0]));
+
+ // Factorize (kij-version):
+ for (vcl_size_t k=0; k < current_block_size - 1; ++k)
+ {
+ for (vcl_size_t i=k+1; i < current_block_size; ++i)
+ {
+ temp_buffer[row_start + i * A.internal_size2() + k] /= temp_buffer[row_start + k * A.internal_size2() + k]; // write l_ik
+
+ NumericT l_ik = temp_buffer[row_start + i * A.internal_size2() + k];
+
+ for (vcl_size_t j = row_start + k + 1; j < A.size1(); ++j)
+ temp_buffer[i * A.internal_size2() + j] -= l_ik * temp_buffer[k * A.internal_size2() + j]; // l_ik * a_kj
+ }
+ }
+
+ // Write back:
+ viennacl::backend::memory_write(A.handle(),
+ sizeof(NumericT) * row_start * A.internal_size2(),
+ sizeof(NumericT) * current_block_size * A.internal_size2(),
+ &(temp_buffer[0]));
+
+ if (remainder_range.size() > 0)
+ {
+ //
+ // Compute L_21 = [ (U_11^T)^{-1} A_21^T ]^T = A_21 * U_11^{-1}
+ //
+ viennacl::matrix_range<MatrixType> U_11(A, block_range, block_range);
+ viennacl::matrix_range<MatrixType> A_21(A, remainder_range, block_range);
+ viennacl::linalg::inplace_solve(trans(U_11), trans(A_21), viennacl::linalg::lower_tag());
+
+ //
+ // Update remainder of A
+ //
+ viennacl::matrix_range<MatrixType> L_21(A, remainder_range, block_range);
+ viennacl::matrix_range<MatrixType> U_12(A, block_range, remainder_range);
+ viennacl::matrix_range<MatrixType> A_22(A, remainder_range, remainder_range);
+
+ A_22 -= viennacl::linalg::prod(L_21, U_12);
+ }
+ }
+
+}
+
+
+/** @brief LU factorization of a column-major dense matrix.
+*
+* @param A The system matrix, where the LU matrices are directly written to. The implicit unit diagonal of L is not written.
+*/
+template<typename NumericT>
+void lu_factorize(matrix<NumericT, viennacl::column_major> & A)
+{
+ typedef matrix<NumericT, viennacl::column_major> MatrixType;
+
+ vcl_size_t max_block_size = 32;
+ vcl_size_t num_blocks = (A.size1() - 1) / max_block_size + 1;
+ std::vector<NumericT> temp_buffer(A.internal_size1() * max_block_size);
+
+ // Iterate over panels
+ for (vcl_size_t panel_id = 0; panel_id < num_blocks; ++panel_id)
+ {
+ vcl_size_t col_start = panel_id * max_block_size;
+ vcl_size_t current_block_size = std::min<vcl_size_t>(A.size1() - col_start, max_block_size);
+
+ viennacl::range block_range(col_start, col_start + current_block_size);
+ viennacl::range remainder_range(col_start + current_block_size, A.size1());
+
+ //
+ // Perform LU factorization on panel:
+ //
+
+
+ // Read from matrix to buffer:
+ viennacl::backend::memory_read(A.handle(),
+ sizeof(NumericT) * col_start * A.internal_size1(),
+ sizeof(NumericT) * current_block_size * A.internal_size1(),
+ &(temp_buffer[0]));
+
+ // Factorize (kji-version):
+ for (vcl_size_t k=0; k < current_block_size; ++k)
+ {
+ NumericT a_kk = temp_buffer[col_start + k + k * A.internal_size1()];
+ for (vcl_size_t i=col_start+k+1; i < A.size1(); ++i)
+ temp_buffer[i + k * A.internal_size1()] /= a_kk; // write l_ik
+
+ for (vcl_size_t j=k+1; j < current_block_size; ++j)
+ {
+ NumericT a_kj = temp_buffer[col_start + k + j * A.internal_size1()];
+ for (vcl_size_t i=col_start+k+1; i < A.size1(); ++i)
+ temp_buffer[i + j * A.internal_size1()] -= temp_buffer[i + k * A.internal_size1()] * a_kj; // l_ik * a_kj
+ }
+ }
+
+ // Write back:
+ viennacl::backend::memory_write(A.handle(),
+ sizeof(NumericT) * col_start * A.internal_size1(),
+ sizeof(NumericT) * current_block_size * A.internal_size1(),
+ &(temp_buffer[0]));
+
+ if (remainder_range.size() > 0)
+ {
+ //
+ // Compute U_12:
+ //
+ viennacl::matrix_range<MatrixType> L_11(A, block_range, block_range);
+ viennacl::matrix_range<MatrixType> A_12(A, block_range, remainder_range);
+ viennacl::linalg::inplace_solve(L_11, A_12, viennacl::linalg::unit_lower_tag());
+
+ //
+ // Update remainder of A
+ //
+ viennacl::matrix_range<MatrixType> L_21(A, remainder_range, block_range);
+ viennacl::matrix_range<MatrixType> U_12(A, block_range, remainder_range);
+ viennacl::matrix_range<MatrixType> A_22(A, remainder_range, remainder_range);
+
+ A_22 -= viennacl::linalg::prod(L_21, U_12);
+ }
+
+ }
+
+}
+
+
+//
+// Convenience layer:
+//
+
+/** @brief LU substitution for the system LU = rhs.
+*
+* @param A The system matrix holding the LU factors as computed by lu_factorize(). The implicit unit diagonal of L is not stored.
+* @param B The matrix of right-hand side vectors, where the solution is directly written to
+*/
+template<typename NumericT, typename F1, typename F2, unsigned int AlignmentV1, unsigned int AlignmentV2>
+void lu_substitute(matrix<NumericT, F1, AlignmentV1> const & A,
+ matrix<NumericT, F2, AlignmentV2> & B)
+{
+ assert(A.size1() == A.size2() && bool("Matrix must be square"));
+ assert(A.size1() == B.size1() && bool("Size mismatch between system matrix and right-hand side"));
+ inplace_solve(A, B, unit_lower_tag());
+ inplace_solve(A, B, upper_tag());
+}
+
+/** @brief LU substitution for the system LU = rhs.
+*
+* @param A The system matrix holding the LU factors as computed by lu_factorize(). The implicit unit diagonal of L is not stored.
+* @param vec The right-hand side vector, where the solution is directly written to
+*/
+template<typename NumericT, typename F, unsigned int MatAlignmentV, unsigned int VecAlignmentV>
+void lu_substitute(matrix<NumericT, F, MatAlignmentV> const & A,
+ vector<NumericT, VecAlignmentV> & vec)
+{
+ assert(A.size1() == A.size2() && bool("Matrix must be square"));
+ inplace_solve(A, vec, unit_lower_tag());
+ inplace_solve(A, vec, upper_tag());
+}
+
+}
+}
+
+#endif
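
For reference, a minimal usage sketch of the two entry points above, not part of the committed header: blocked in-place LU factorization followed by substitution. Note that the factorization performs no pivoting, so the sketch assumes a matrix for which plain LU is stable; sizes and values are illustrative.

  #include "viennacl/matrix.hpp"
  #include "viennacl/vector.hpp"
  #include "viennacl/linalg/lu.hpp"

  void lu_solve_sketch()
  {
    viennacl::matrix<double> A(100, 100);   // row-major by default
    viennacl::vector<double> rhs(100);
    // ... fill A (square, non-singular, stable without pivoting) and rhs ...

    viennacl::linalg::lu_factorize(A);        // A now holds U and the strictly lower part of L (unit diagonal implied)
    viennacl::linalg::lu_substitute(A, rhs);  // rhs is overwritten with the solution of A * x = rhs
  }
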
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/forwards.h
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/forwards.h b/native-viennaCL/src/main/cpp/viennacl/forwards.h
new file mode 100644
index 0000000..23a4580
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/forwards.h
@@ -0,0 +1,1032 @@
+#ifndef VIENNACL_FORWARDS_H
+#define VIENNACL_FORWARDS_H
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/forwards.h
+ @brief This file provides the forward declarations for the main types used within ViennaCL
+*/
+
+/**
+ @mainpage Main Page
+
+ Here you can find all the documentation on how to use the GPU-accelerated linear algebra library ViennaCL.
+ The formerly separate \ref usermanual "user manual" is no longer available as a standalone PDF, but is now fully integrated into the HTML-based documentation.
+ Please use the navigation panel on the left to access the desired information.
+
+ Quick links:
+ - \ref manual-installation "Installation and building the examples"
+ - \ref manual-types "Basic types"
+ - \ref manual-operations "Basic operations"
+ - \ref manual-algorithms "Algorithms"
+
+
+ -----------------------------------
+ \htmlonly
+ <div style="align: right; width: 100%">
+ <a href="http://www.tuwien.ac.at/"><img src="tuwien.png"></a>
+ <a href="http://www.iue.tuwien.ac.at/"><img src="iue.png"></a>
+ <a href="http://www.asc.tuwien.ac.at/"><img src="asc.png"></a>
+ </div>
+ \endhtmlonly
+*/
+
+
+//compatibility defines:
+#ifdef VIENNACL_HAVE_UBLAS
+ #define VIENNACL_WITH_UBLAS
+#endif
+
+#ifdef VIENNACL_HAVE_EIGEN
+ #define VIENNACL_WITH_EIGEN
+#endif
+
+#ifdef VIENNACL_HAVE_MTL4
+ #define VIENNACL_WITH_MTL4
+#endif
+
+#include <cstddef>
+#include <cassert>
+#include <string>
+#include <stdexcept>
+
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/version.hpp"
+
+/** @brief Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them. */
+namespace viennacl
+{
+ typedef std::size_t vcl_size_t;
+ typedef std::ptrdiff_t vcl_ptrdiff_t;
+
+
+
+ /** @brief A tag class representing assignment */
+ struct op_assign {};
+ /** @brief A tag class representing inplace addition */
+ struct op_inplace_add {};
+ /** @brief A tag class representing inplace subtraction */
+ struct op_inplace_sub {};
+
+ /** @brief A tag class representing addition */
+ struct op_add {};
+ /** @brief A tag class representing subtraction */
+ struct op_sub {};
+ /** @brief A tag class representing multiplication by a scalar */
+ struct op_mult {};
+ /** @brief A tag class representing matrix-vector products and element-wise multiplications*/
+ struct op_prod {};
+ /** @brief A tag class representing matrix-matrix products */
+ struct op_mat_mat_prod {};
+ /** @brief A tag class representing division */
+ struct op_div {};
+ /** @brief A tag class representing the power function */
+ struct op_pow {};
+
+ /** @brief A tag class representing equality */
+ struct op_eq {};
+ /** @brief A tag class representing inequality */
+ struct op_neq {};
+ /** @brief A tag class representing greater-than */
+ struct op_greater {};
+ /** @brief A tag class representing less-than */
+ struct op_less {};
+ /** @brief A tag class representing greater-than-or-equal-to */
+ struct op_geq {};
+ /** @brief A tag class representing less-than-or-equal-to */
+ struct op_leq {};
+
+ /** @brief A tag class representing the summation of a vector */
+ struct op_sum {};
+
+ /** @brief A tag class representing the summation of all rows of a matrix */
+ struct op_row_sum {};
+
+ /** @brief A tag class representing the summation of all columns of a matrix */
+ struct op_col_sum {};
+
+ /** @brief A tag class representing element-wise casting operations on vectors and matrices */
+ template<typename OP>
+ struct op_element_cast {};
+
+ /** @brief A tag class representing element-wise binary operations (like multiplication) on vectors or matrices */
+ template<typename OP>
+ struct op_element_binary {};
+
+ /** @brief A tag class representing element-wise unary operations (like sin()) on vectors or matrices */
+ template<typename OP>
+ struct op_element_unary {};
+
+ /** @brief A tag class representing the modulus function for integers */
+ struct op_abs {};
+ /** @brief A tag class representing the acos() function */
+ struct op_acos {};
+ /** @brief A tag class representing the asin() function */
+ struct op_asin {};
+ /** @brief A tag class for representing the argmax() function */
+ struct op_argmax {};
+ /** @brief A tag class for representing the argmin() function */
+ struct op_argmin {};
+ /** @brief A tag class representing the atan() function */
+ struct op_atan {};
+ /** @brief A tag class representing the atan2() function */
+ struct op_atan2 {};
+ /** @brief A tag class representing the ceil() function */
+ struct op_ceil {};
+ /** @brief A tag class representing the cos() function */
+ struct op_cos {};
+ /** @brief A tag class representing the cosh() function */
+ struct op_cosh {};
+ /** @brief A tag class representing the exp() function */
+ struct op_exp {};
+ /** @brief A tag class representing the fabs() function */
+ struct op_fabs {};
+ /** @brief A tag class representing the fdim() function */
+ struct op_fdim {};
+ /** @brief A tag class representing the floor() function */
+ struct op_floor {};
+ /** @brief A tag class representing the fmax() function */
+ struct op_fmax {};
+ /** @brief A tag class representing the fmin() function */
+ struct op_fmin {};
+ /** @brief A tag class representing the fmod() function */
+ struct op_fmod {};
+ /** @brief A tag class representing the log() function */
+ struct op_log {};
+ /** @brief A tag class representing the log10() function */
+ struct op_log10 {};
+ /** @brief A tag class representing the sin() function */
+ struct op_sin {};
+ /** @brief A tag class representing the sinh() function */
+ struct op_sinh {};
+ /** @brief A tag class representing the sqrt() function */
+ struct op_sqrt {};
+ /** @brief A tag class representing the tan() function */
+ struct op_tan {};
+ /** @brief A tag class representing the tanh() function */
+ struct op_tanh {};
+
+ /** @brief A tag class representing the (off-)diagonal of a matrix */
+ struct op_matrix_diag {};
+
+ /** @brief A tag class representing a matrix given by a vector placed on a certain (off-)diagonal */
+ struct op_vector_diag {};
+
+ /** @brief A tag class representing the extraction of a matrix row to a vector */
+ struct op_row {};
+
+ /** @brief A tag class representing the extraction of a matrix column to a vector */
+ struct op_column {};
+
+ /** @brief A tag class representing inner products of two vectors */
+ struct op_inner_prod {};
+
+ /** @brief A tag class representing the 1-norm of a vector */
+ struct op_norm_1 {};
+
+ /** @brief A tag class representing the 2-norm of a vector */
+ struct op_norm_2 {};
+
+ /** @brief A tag class representing the inf-norm of a vector */
+ struct op_norm_inf {};
+
+ /** @brief A tag class representing the maximum of a vector */
+ struct op_max {};
+
+ /** @brief A tag class representing the minimum of a vector */
+ struct op_min {};
+
+
+ /** @brief A tag class representing the Frobenius-norm of a matrix */
+ struct op_norm_frobenius {};
+
+ /** @brief A tag class representing transposed matrices */
+ struct op_trans {};
+
+ /** @brief A tag class representing sign flips (for scalars only. Vectors and matrices use the standard multiplication by the scalar -1.0) */
+ struct op_flip_sign {};
+
+ //forward declaration of basic types:
+ template<class TYPE>
+ class scalar;
+
+ template<typename LHS, typename RHS, typename OP>
+ class scalar_expression;
+
+ template<typename SCALARTYPE>
+ class entry_proxy;
+
+ template<typename SCALARTYPE>
+ class const_entry_proxy;
+
+ template<typename LHS, typename RHS, typename OP>
+ class vector_expression;
+
+ template<class SCALARTYPE, unsigned int ALIGNMENT>
+ class vector_iterator;
+
+ template<class SCALARTYPE, unsigned int ALIGNMENT>
+ class const_vector_iterator;
+
+ template<typename SCALARTYPE>
+ class implicit_vector_base;
+
+ template<typename SCALARTYPE>
+ struct zero_vector;
+
+ template<typename SCALARTYPE>
+ struct unit_vector;
+
+ template<typename SCALARTYPE>
+ struct one_vector;
+
+ template<typename SCALARTYPE>
+ struct scalar_vector;
+
+ template<class SCALARTYPE, typename SizeType = vcl_size_t, typename DistanceType = vcl_ptrdiff_t>
+ class vector_base;
+
+ template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
+ class vector;
+
+ template<typename ScalarT>
+ class vector_tuple;
+
+ //the following forwards are needed for GMRES
+ template<typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
+ void copy(CPU_ITERATOR const & cpu_begin,
+ CPU_ITERATOR const & cpu_end,
+ vector_iterator<SCALARTYPE, ALIGNMENT> gpu_begin);
+
+ template<typename SCALARTYPE, unsigned int ALIGNMENT_SRC, unsigned int ALIGNMENT_DEST>
+ void copy(const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_begin,
+ const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_end,
+ vector_iterator<SCALARTYPE, ALIGNMENT_DEST> gpu_dest_begin);
+
+ template<typename SCALARTYPE, unsigned int ALIGNMENT_SRC, unsigned int ALIGNMENT_DEST>
+ void copy(const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_begin,
+ const_vector_iterator<SCALARTYPE, ALIGNMENT_SRC> const & gpu_src_end,
+ const_vector_iterator<SCALARTYPE, ALIGNMENT_DEST> gpu_dest_begin);
+
+ template<typename SCALARTYPE, unsigned int ALIGNMENT, typename CPU_ITERATOR>
+ void fast_copy(const const_vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_begin,
+ const const_vector_iterator<SCALARTYPE, ALIGNMENT> & gpu_end,
+ CPU_ITERATOR cpu_begin );
+
+ template<typename CPU_ITERATOR, typename SCALARTYPE, unsigned int ALIGNMENT>
+ void fast_copy(CPU_ITERATOR const & cpu_begin,
+ CPU_ITERATOR const & cpu_end,
+ vector_iterator<SCALARTYPE, ALIGNMENT> gpu_begin);
+
+
+ /** @brief Tag class for indicating row-major layout of a matrix. Not passed to the matrix directly, see row_major type. */
+ struct row_major_tag {};
+ /** @brief Tag class for indicating column-major layout of a matrix. Not passed to the matrix directly, see column_major type. */
+ struct column_major_tag {};
+
+ /** @brief A tag for row-major storage of a dense matrix. */
+ struct row_major
+ {
+ typedef row_major_tag orientation_category;
+
+ /** @brief Returns the memory offset for entry (i,j) of a dense matrix.
+ *
+ * @param i row index
+ * @param j column index
+ * @param num_cols number of entries per row (i.e. number of columns, including alignment)
+ */
+ static vcl_size_t mem_index(vcl_size_t i, vcl_size_t j, vcl_size_t /* num_rows */, vcl_size_t num_cols)
+ {
+ return i * num_cols + j;
+ }
+ };
+
+ /** @brief A tag for column-major storage of a dense matrix. */
+ struct column_major
+ {
+ typedef column_major_tag orientation_category;
+
+ /** @brief Returns the memory offset for entry (i,j) of a dense matrix.
+ *
+ * @param i row index
+ * @param j column index
+ * @param num_rows number of entries per column (i.e. number of rows, including alignment)
+ */
+ static vcl_size_t mem_index(vcl_size_t i, vcl_size_t j, vcl_size_t num_rows, vcl_size_t /* num_cols */)
+ {
+ return i + j * num_rows;
+ }
+ };
+
+ struct row_iteration;
+ struct col_iteration;
+
+ template<typename LHS, typename RHS, typename OP>
+ class matrix_expression;
+
+ class context;
+
+ enum memory_types
+ {
+ MEMORY_NOT_INITIALIZED
+ , MAIN_MEMORY
+ , OPENCL_MEMORY
+ , CUDA_MEMORY
+ };
+
+ namespace backend
+ {
+ class mem_handle;
+ }
+
+ //
+ // Matrix types:
+ //
+ static const vcl_size_t dense_padding_size = 128;
+
+ /** @brief A dense matrix class
+ *
+ * @tparam SCALARTYPE The underlying scalar type (either float or double)
+ * @tparam ALIGNMENT The internal memory size is given by (size()/ALIGNMENT + 1) * ALIGNMENT. ALIGNMENT must be a power of two. Best values are usually 4, 8 or 16; higher values are usually a waste of memory.
+ */
+ template<typename ROWCOL, typename MATRIXTYPE>
+ class matrix_iterator;
+
+ template<class SCALARTYPE, typename SizeType = vcl_size_t, typename DistanceType = vcl_ptrdiff_t>
+ class matrix_base;
+
+ template<class SCALARTYPE, typename F = row_major, unsigned int ALIGNMENT = 1>
+ class matrix;
+
+ template<typename SCALARTYPE>
+ class implicit_matrix_base;
+
+ template<class SCALARTYPE>
+ class identity_matrix;
+
+ template<class SCALARTYPE>
+ class zero_matrix;
+
+ template<class SCALARTYPE>
+ class scalar_matrix;
+
+ template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
+ class compressed_matrix;
+
+ template<class SCALARTYPE>
+ class compressed_compressed_matrix;
+
+
+ template<class SCALARTYPE, unsigned int ALIGNMENT = 128>
+ class coordinate_matrix;
+
+ template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
+ class ell_matrix;
+
+ template<typename ScalarT, typename IndexT = unsigned int>
+ class sliced_ell_matrix;
+
+ template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
+ class hyb_matrix;
+
+ template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
+ class circulant_matrix;
+
+ template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
+ class hankel_matrix;
+
+ template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
+ class toeplitz_matrix;
+
+ template<class SCALARTYPE, unsigned int ALIGNMENT = 1>
+ class vandermonde_matrix;
+
+ //
+ // Proxies:
+ //
+ template<typename SizeType = vcl_size_t, typename DistanceType = std::ptrdiff_t>
+ class basic_range;
+
+ typedef basic_range<> range;
+
+ template<typename SizeType = vcl_size_t, typename DistanceType = std::ptrdiff_t>
+ class basic_slice;
+
+ typedef basic_slice<> slice;
+
+ template<typename VectorType>
+ class vector_range;
+
+ template<typename VectorType>
+ class vector_slice;
+
+ template<typename MatrixType>
+ class matrix_range;
+
+ template<typename MatrixType>
+ class matrix_slice;
+
+
+ /** @brief Helper struct for checking whether a type is a host scalar type (e.g. float, double) */
+ template<typename T>
+ struct is_cpu_scalar
+ {
+ enum { value = false };
+ };
+
+ /** @brief Helper struct for checking whether a type is a viennacl::scalar<> */
+ template<typename T>
+ struct is_scalar
+ {
+ enum { value = false };
+ };
+
+ /** @brief Helper struct for checking whether a type represents a sign flip on a viennacl::scalar<> */
+ template<typename T>
+ struct is_flip_sign_scalar
+ {
+ enum { value = false };
+ };
+
+ /** @brief Helper struct for checking whether the provided type represents a scalar (either host, from ViennaCL, or a flip-sign proxy) */
+ template<typename T>
+ struct is_any_scalar
+ {
+ enum { value = (is_scalar<T>::value || is_cpu_scalar<T>::value || is_flip_sign_scalar<T>::value )};
+ };
+
+ /** @brief Checks for a type being either vector_base or implicit_vector_base */
+ template<typename T>
+ struct is_any_vector { enum { value = 0 }; };
+
+ /** @brief Checks for either matrix_base or implicit_matrix_base */
+ template<typename T>
+ struct is_any_dense_matrix { enum { value = 0 }; };
+
+ /** @brief Helper class for checking whether a matrix has a row-major layout. */
+ template<typename T>
+ struct is_row_major
+ {
+ enum { value = false };
+ };
+
+ /** @brief Helper class for checking whether a matrix is a compressed_matrix (CSR format) */
+ template<typename T>
+ struct is_compressed_matrix
+ {
+ enum { value = false };
+ };
+
+ /** @brief Helper class for checking whether a matrix is a coordinate_matrix (COO format) */
+ template<typename T>
+ struct is_coordinate_matrix
+ {
+ enum { value = false };
+ };
+
+ /** @brief Helper class for checking whether a matrix is an ell_matrix (ELL format) */
+ template<typename T>
+ struct is_ell_matrix
+ {
+ enum { value = false };
+ };
+
+ /** @brief Helper class for checking whether a matrix is a sliced_ell_matrix (SELL-C-\f$ \sigma \f$ format) */
+ template<typename T>
+ struct is_sliced_ell_matrix
+ {
+ enum { value = false };
+ };
+
+
+ /** @brief Helper class for checking whether a matrix is a hyb_matrix (hybrid format: ELL plus CSR) */
+ template<typename T>
+ struct is_hyb_matrix
+ {
+ enum { value = false };
+ };
+
+ /** @brief Helper class for checking whether the provided type is one of the sparse matrix types (compressed_matrix, coordinate_matrix, etc.) */
+ template<typename T>
+ struct is_any_sparse_matrix
+ {
+ enum { value = false };
+ };
+
+
+ /** @brief Helper class for checking whether a matrix is a circulant matrix */
+ template<typename T>
+ struct is_circulant_matrix
+ {
+ enum { value = false };
+ };
+
+ /** @brief Helper class for checking whether a matrix is a Hankel matrix */
+ template<typename T>
+ struct is_hankel_matrix
+ {
+ enum { value = false };
+ };
+
+ /** @brief Helper class for checking whether a matrix is a Toeplitz matrix */
+ template<typename T>
+ struct is_toeplitz_matrix
+ {
+ enum { value = false };
+ };
+
+ /** @brief Helper class for checking whether a matrix is a Vandermonde matrix */
+ template<typename T>
+ struct is_vandermonde_matrix
+ {
+ enum { value = false };
+ };
+
+ /** @brief Helper class for checking whether the provided type is any of the dense structured matrix types (circulant, Hankel, etc.) */
+ template<typename T>
+ struct is_any_dense_structured_matrix
+ {
+ enum { value = viennacl::is_circulant_matrix<T>::value || viennacl::is_hankel_matrix<T>::value || viennacl::is_toeplitz_matrix<T>::value || viennacl::is_vandermonde_matrix<T>::value };
+ };
+
+
+
+
+ /** @brief Exception class in case of memory errors */
+ class memory_exception : public std::exception
+ {
+ public:
+ memory_exception() : message_() {}
+ memory_exception(std::string message) : message_("ViennaCL: Internal memory error: " + message) {}
+
+ virtual const char* what() const throw() { return message_.c_str(); }
+
+ virtual ~memory_exception() throw() {}
+ private:
+ std::string message_;
+ };
+
+ class cuda_not_available_exception : public std::exception
+ {
+ public:
+ cuda_not_available_exception() : message_("ViennaCL was compiled without CUDA support, but CUDA functionality is required for this operation.") {}
+
+ virtual const char* what() const throw() { return message_.c_str(); }
+
+ virtual ~cuda_not_available_exception() throw() {}
+ private:
+ std::string message_;
+ };
+
+ class zero_on_diagonal_exception : public std::runtime_error
+ {
+ public:
+ zero_on_diagonal_exception(std::string const & what_arg) : std::runtime_error(what_arg) {}
+ };
+
+ class unknown_norm_exception : public std::runtime_error
+ {
+ public:
+ unknown_norm_exception(std::string const & what_arg) : std::runtime_error(what_arg) {}
+ };
+
+
+
+ namespace tools
+ {
+ //helper for matrix row/col iterators
+ //must be specialized for every viennacl matrix type
+ /** @brief Helper class for incrementing an iterator in a dense matrix. */
+ template<typename ROWCOL, typename MATRIXTYPE>
+ struct MATRIX_ITERATOR_INCREMENTER
+ {
+ typedef typename MATRIXTYPE::ERROR_SPECIALIZATION_FOR_THIS_MATRIX_TYPE_MISSING ErrorIndicator;
+
+ static void apply(const MATRIXTYPE & /*mat*/, unsigned int & /*row*/, unsigned int & /*col*/) {}
+ };
+ }
+
+ namespace linalg
+ {
+#if !defined(_MSC_VER) || defined(__CUDACC__)
+
+ template<class SCALARTYPE, unsigned int ALIGNMENT>
+ void convolve_i(viennacl::vector<SCALARTYPE, ALIGNMENT>& input1,
+ viennacl::vector<SCALARTYPE, ALIGNMENT>& input2,
+ viennacl::vector<SCALARTYPE, ALIGNMENT>& output);
+
+ template<typename T>
+ viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<op_prod> >
+ element_prod(vector_base<T> const & v1, vector_base<T> const & v2);
+
+ template<typename T>
+ viennacl::vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<op_div> >
+ element_div(vector_base<T> const & v1, vector_base<T> const & v2);
+
+
+
+ template<typename T>
+ void inner_prod_impl(vector_base<T> const & vec1,
+ vector_base<T> const & vec2,
+ scalar<T> & result);
+
+ template<typename LHS, typename RHS, typename OP, typename T>
+ void inner_prod_impl(viennacl::vector_expression<LHS, RHS, OP> const & vec1,
+ vector_base<T> const & vec2,
+ scalar<T> & result);
+
+ template<typename T, typename LHS, typename RHS, typename OP>
+ void inner_prod_impl(vector_base<T> const & vec1,
+ viennacl::vector_expression<LHS, RHS, OP> const & vec2,
+ scalar<T> & result);
+
+ template<typename LHS1, typename RHS1, typename OP1,
+ typename LHS2, typename RHS2, typename OP2, typename T>
+ void inner_prod_impl(viennacl::vector_expression<LHS1, RHS1, OP1> const & vec1,
+ viennacl::vector_expression<LHS2, RHS2, OP2> const & vec2,
+ scalar<T> & result);
+
+ ///////////////////////////
+
+ template<typename T>
+ void inner_prod_cpu(vector_base<T> const & vec1,
+ vector_base<T> const & vec2,
+ T & result);
+
+ template<typename LHS, typename RHS, typename OP, typename T>
+ void inner_prod_cpu(viennacl::vector_expression<LHS, RHS, OP> const & vec1,
+ vector_base<T> const & vec2,
+ T & result);
+
+ template<typename T, typename LHS, typename RHS, typename OP>
+ void inner_prod_cpu(vector_base<T> const & vec1,
+ viennacl::vector_expression<LHS, RHS, OP> const & vec2,
+ T & result);
+
+ template<typename LHS1, typename RHS1, typename OP1,
+ typename LHS2, typename RHS2, typename OP2, typename S3>
+ void inner_prod_cpu(viennacl::vector_expression<LHS1, RHS1, OP1> const & vec1,
+ viennacl::vector_expression<LHS2, RHS2, OP2> const & vec2,
+ S3 & result);
+
+
+
+ //forward definition of norm_1_impl function
+ template<typename T>
+ void norm_1_impl(vector_base<T> const & vec, scalar<T> & result);
+
+ template<typename LHS, typename RHS, typename OP, typename T>
+ void norm_1_impl(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+ scalar<T> & result);
+
+
+ template<typename T>
+ void norm_1_cpu(vector_base<T> const & vec,
+ T & result);
+
+ template<typename LHS, typename RHS, typename OP, typename S2>
+ void norm_1_cpu(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+ S2 & result);
+
+ //forward definition of norm_2_impl function
+ template<typename T>
+ void norm_2_impl(vector_base<T> const & vec, scalar<T> & result);
+
+ template<typename LHS, typename RHS, typename OP, typename T>
+ void norm_2_impl(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+ scalar<T> & result);
+
+ template<typename T>
+ void norm_2_cpu(vector_base<T> const & vec, T & result);
+
+ template<typename LHS, typename RHS, typename OP, typename S2>
+ void norm_2_cpu(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+ S2 & result);
+
+
+ //forward definition of norm_inf_impl function
+ template<typename T>
+ void norm_inf_impl(vector_base<T> const & vec, scalar<T> & result);
+
+ template<typename LHS, typename RHS, typename OP, typename T>
+ void norm_inf_impl(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+ scalar<T> & result);
+
+
+ template<typename T>
+ void norm_inf_cpu(vector_base<T> const & vec, T & result);
+
+ template<typename LHS, typename RHS, typename OP, typename S2>
+ void norm_inf_cpu(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+ S2 & result);
+
+ //forward definition of max()-related functions
+ template<typename T>
+ void max_impl(vector_base<T> const & vec, scalar<T> & result);
+
+ template<typename LHS, typename RHS, typename OP, typename T>
+ void max_impl(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+ scalar<T> & result);
+
+
+ template<typename T>
+ void max_cpu(vector_base<T> const & vec, T & result);
+
+ template<typename LHS, typename RHS, typename OP, typename S2>
+ void max_cpu(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+ S2 & result);
+
+ //forward definition of min()-related functions
+ template<typename T>
+ void min_impl(vector_base<T> const & vec, scalar<T> & result);
+
+ template<typename LHS, typename RHS, typename OP, typename T>
+ void min_impl(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+ scalar<T> & result);
+
+
+ template<typename T>
+ void min_cpu(vector_base<T> const & vec, T & result);
+
+ template<typename LHS, typename RHS, typename OP, typename S2>
+ void min_cpu(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+ S2 & result);
+
+ //forward definition of sum()-related functions
+ template<typename T>
+ void sum_impl(vector_base<T> const & vec, scalar<T> & result);
+
+ template<typename LHS, typename RHS, typename OP, typename T>
+ void sum_impl(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+ scalar<T> & result);
+
+
+ template<typename T>
+ void sum_cpu(vector_base<T> const & vec, T & result);
+
+ template<typename LHS, typename RHS, typename OP, typename S2>
+ void sum_cpu(viennacl::vector_expression<LHS, RHS, OP> const & vec,
+ S2 & result);
+
+
+ // forward definition of frobenius norm:
+ template<typename T>
+ void norm_frobenius_impl(matrix_base<T> const & vec, scalar<T> & result);
+
+ template<typename T>
+ void norm_frobenius_cpu(matrix_base<T> const & vec, T & result);
+
+
+ template<typename T>
+ vcl_size_t index_norm_inf(vector_base<T> const & vec);
+
+ template<typename LHS, typename RHS, typename OP>
+ vcl_size_t index_norm_inf(viennacl::vector_expression<LHS, RHS, OP> const & vec);
+
+ //forward definition of prod_impl functions
+
+ template<typename NumericT>
+ void prod_impl(const matrix_base<NumericT> & mat,
+ const vector_base<NumericT> & vec,
+ vector_base<NumericT> & result);
+
+ template<typename NumericT>
+ void prod_impl(const matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & mat_trans,
+ const vector_base<NumericT> & vec,
+ vector_base<NumericT> & result);
+
+ template<typename SparseMatrixType, class SCALARTYPE, unsigned int ALIGNMENT>
+ typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value,
+ vector_expression<const SparseMatrixType,
+ const vector<SCALARTYPE, ALIGNMENT>,
+ op_prod >
+ >::type
+ prod_impl(const SparseMatrixType & mat,
+ const vector<SCALARTYPE, ALIGNMENT> & vec);
+
+ // forward definition of summation routines for matrices:
+
+ template<typename NumericT>
+ void row_sum_impl(const matrix_base<NumericT> & A,
+ vector_base<NumericT> & result);
+
+ template<typename NumericT>
+ void column_sum_impl(const matrix_base<NumericT> & A,
+ vector_base<NumericT> & result);
+
+#endif
+
+ namespace detail
+ {
+ enum row_info_types
+ {
+ SPARSE_ROW_NORM_INF = 0,
+ SPARSE_ROW_NORM_1,
+ SPARSE_ROW_NORM_2,
+ SPARSE_ROW_DIAGONAL
+ };
+
+ }
+
+
+ /** @brief A tag class representing a lower triangular matrix */
+ struct lower_tag
+ {
+ static const char * name() { return "lower"; }
+ }; //lower triangular matrix
+ /** @brief A tag class representing an upper triangular matrix */
+ struct upper_tag
+ {
+ static const char * name() { return "upper"; }
+ }; //upper triangular matrix
+ /** @brief A tag class representing a lower triangular matrix with unit diagonal*/
+ struct unit_lower_tag
+ {
+ static const char * name() { return "unit_lower"; }
+ }; //unit lower triangular matrix
+ /** @brief A tag class representing an upper triangular matrix with unit diagonal*/
+ struct unit_upper_tag
+ {
+ static const char * name() { return "unit_upper"; }
+ }; //unit upper triangular matrix
+
+ //preconditioner tags
+ class ilut_tag;
+
+ /** @brief A tag class representing the use of no preconditioner */
+ class no_precond
+ {
+ public:
+ template<typename VectorType>
+ void apply(VectorType &) const {}
+ };
+
+
+ } //namespace linalg
+
+ //
+ // More namespace comments to follow:
+ //
+
+ /** @brief Namespace providing routines for handling the different memory domains. */
+ namespace backend
+ {
+ /** @brief Provides implementations for handling memory buffers in CPU RAM. */
+ namespace cpu_ram
+ {
+ /** @brief Holds implementation details for handling memory buffers in CPU RAM. Not intended for direct use by library users. */
+ namespace detail {}
+ }
+
+ /** @brief Provides implementations for handling CUDA memory buffers. */
+ namespace cuda
+ {
+ /** @brief Holds implementation details for handling CUDA memory buffers. Not intended for direct use by library users. */
+ namespace detail {}
+ }
+
+ /** @brief Implementation details for the generic memory backend interface. */
+ namespace detail {}
+
+ /** @brief Provides implementations for handling OpenCL memory buffers. */
+ namespace opencl
+ {
+ /** @brief Holds implementation details for handling OpenCL memory buffers. Not intended for direct use by library users. */
+ namespace detail {}
+ }
+ }
+
+
+ /** @brief Holds implementation details for functionality in the main viennacl-namespace. Not intended for direct use by library users. */
+ namespace detail
+ {
+ /** @brief Helper namespace for fast Fourier transforms. Not to be used directly by library users. */
+ namespace fft
+ {
+ /** @brief Helper namespace for fast-Fourier transformation. Deprecated. */
+ namespace FFT_DATA_ORDER {}
+ }
+ }
+
+
+ /** @brief Provides an OpenCL kernel generator. */
+ namespace device_specific
+ {
+ /** @brief Provides the implementation for tuning the kernels for a particular device. */
+ namespace autotune {}
+
+ /** @brief Contains implementation details of the kernel generator. */
+ namespace detail {}
+
+ /** @brief Namespace holding the various device-specific parameters for generating the best kernels. */
+ namespace profiles {}
+
+ /** @brief Contains various helper routines for kernel generation. */
+ namespace utils {}
+ }
+
+ /** @brief Provides basic input-output functionality. */
+ namespace io
+ {
+ /** @brief Implementation details for IO functionality. Usually not of interest for a library user. */
+ namespace detail {}
+
+ /** @brief Namespace holding the various XML tag definitions for the kernel parameter tuning facility. */
+ namespace tag {}
+
+ /** @brief Namespace holding the various XML strings for the kernel parameter tuning facility. */
+ namespace val {}
+ }
+
+ /** @brief Provides all linear algebra operations which are not covered by operator overloads. */
+ namespace linalg
+ {
+ /** @brief Holds all CUDA compute kernels used by ViennaCL. */
+ namespace cuda
+ {
+ /** @brief Helper functions for the CUDA linear algebra backend. */
+ namespace detail {}
+ }
+
+ /** @brief Namespace holding implementation details for linear algebra routines. Usually not of interest for a library user. */
+ namespace detail
+ {
+ /** @brief Implementation namespace for algebraic multigrid preconditioner. */
+ namespace amg {}
+
+ /** @brief Implementation namespace for sparse approximate inverse preconditioner. */
+ namespace spai {}
+ }
+
+ /** @brief Holds all compute kernels with conventional host-based execution (buffers in CPU RAM). */
+ namespace host_based
+ {
+ /** @brief Helper functions for the host-based linear algebra backend. */
+ namespace detail {}
+ }
+
+ /** @brief Namespace containing the OpenCL kernels. Deprecated, will be moved to viennacl::linalg::opencl in future releases. */
+ namespace kernels {}
+
+ /** @brief Holds all routines providing OpenCL linear algebra operations. */
+ namespace opencl
+ {
+ /** @brief Helper functions for OpenCL-accelerated linear algebra operations. */
+ namespace detail {}
+
+ /** @brief Contains the OpenCL kernel generation functions for a predefined set of functionality. */
+ namespace kernels
+ {
+ /** @brief Implementation details for the predefined OpenCL kernels. */
+ namespace detail {}
+ }
+ }
+ }
+
+ /** @brief OpenCL backend. Manages platforms, contexts, buffers, kernels, etc. */
+ namespace ocl {}
+
+ /** @brief Namespace containing many meta-functions. */
+ namespace result_of {}
+
+ /** @brief Namespace for various tools used within ViennaCL. */
+ namespace tools
+ {
+ /** @brief Contains implementation details for the tools. Usually not of interest for the library user. */
+ namespace detail {}
+ }
+
+ /** @brief Namespace providing traits-information as well as generic wrappers to common routines for vectors and matrices such as size() or clear() */
+ namespace traits {}
+
+ /** @brief Contains the scheduling functionality which allows for dynamic kernel generation as well as the fusion of multiple statements into a single kernel. */
+ namespace scheduler
+ {
+ /** @brief Implementation details for the scheduler */
+ namespace detail {}
+
+ /** @brief Helper metafunctions used for the scheduler */
+ namespace result_of {}
+ }
+
+} //namespace viennacl
+
+#endif
+
+/*@}*/
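
A small worked illustration of the two storage tags above, not part of the committed header: how mem_index() maps entry (i,j) to a linear offset for a matrix padded to 2 rows and 4 columns, evaluated directly from the formulas in forwards.h.

  #include <cassert>
  #include "viennacl/forwards.h"

  void mem_index_sketch()
  {
    // entry (1,2) of a matrix with 2 (padded) rows and 4 (padded) columns:
    assert(viennacl::row_major::mem_index(1, 2, 2, 4)    == 1 * 4 + 2);   // offset 6: each row is stored contiguously
    assert(viennacl::column_major::mem_index(1, 2, 2, 4) == 1 + 2 * 2);   // offset 5: each column is stored contiguously
  }
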
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/hankel_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/hankel_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/hankel_matrix.hpp
new file mode 100644
index 0000000..084e6c8
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/hankel_matrix.hpp
@@ -0,0 +1,343 @@
+#ifndef VIENNACL_HANKEL_MATRIX_HPP
+#define VIENNACL_HANKEL_MATRIX_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/hankel_matrix.hpp
+ @brief Implementation of the hankel_matrix class for efficient manipulation of Hankel matrices. Experimental.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/vector.hpp"
+#include "viennacl/ocl/backend.hpp"
+
+#include "viennacl/toeplitz_matrix.hpp"
+#include "viennacl/fft.hpp"
+
+#include "viennacl/linalg/hankel_matrix_operations.hpp"
+
+namespace viennacl
+{
+/** @brief A Hankel matrix class
+ *
+ * @tparam NumericT The underlying scalar type (either float or double)
+ * @tparam AlignmentV The internal memory size is given by (size()/AlignmentV + 1) * AlignmentV. AlignmentV must be a power of two. Best values are usually 4, 8 or 16; higher values are usually a waste of memory.
+ */
+template<class NumericT, unsigned int AlignmentV>
+class hankel_matrix
+{
+public:
+ typedef viennacl::backend::mem_handle handle_type;
+ typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<NumericT>::ResultType> value_type;
+
+ /**
+ * @brief The default constructor. Does not allocate any memory.
+ *
+ */
+ explicit hankel_matrix() {}
+
+ /**
+ * @brief Creates the matrix with the given size
+ *
+ * @param rows Number of rows of the matrix
+ * @param cols Number of columns of the matrix
+ */
+ explicit hankel_matrix(vcl_size_t rows, vcl_size_t cols) : elements_(rows, cols)
+ {
+ assert(rows == cols && bool("Hankel matrix must be square!"));
+ (void)cols; // avoid 'unused parameter' warning in optimized builds
+ }
+
+ /** @brief Resizes the matrix.
+ * Existing entries can be preserved
+ *
+ * @param sz New size of matrix
+ * @param preserve If true, existing values are preserved.
+ */
+ void resize(vcl_size_t sz, bool preserve = true)
+ {
+ elements_.resize(sz, preserve);
+ }
+
+ /** @brief Returns the OpenCL handle
+ *
+ * @return OpenCL handle
+ */
+ handle_type const & handle() const { return elements_.handle(); }
+
+ /**
+ * @brief Returns the internal viennacl::toeplitz_matrix that stores the elements of the Hankel matrix
+ *
+ */
+ toeplitz_matrix<NumericT, AlignmentV> & elements() { return elements_; }
+ toeplitz_matrix<NumericT, AlignmentV> const & elements() const { return elements_; }
+
+ /**
+ * @brief Returns the number of rows of the matrix
+ */
+ vcl_size_t size1() const { return elements_.size1(); }
+
+ /**
+ * @brief Returns the number of columns of the matrix
+ */
+ vcl_size_t size2() const { return elements_.size2(); }
+
+ /** @brief Returns the internal size of the matrix representation.
+ * Usually required for launching OpenCL kernels only
+ *
+ * @return Internal size of matrix representation
+ */
+ vcl_size_t internal_size() const { return elements_.internal_size(); }
+
+ /**
+ * @brief Read-write access to an element of the matrix
+ *
+ * @param row_index Row index of accessed element
+ * @param col_index Column index of accessed element
+ * @return Proxy for matrix entry
+ */
+ entry_proxy<NumericT> operator()(unsigned int row_index, unsigned int col_index)
+ {
+ assert(row_index < size1() && col_index < size2() && bool("Invalid access"));
+
+ return elements_(size1() - row_index - 1, col_index);
+ }
+
+ /**
+ * @brief += operation for Hankel matrices
+ *
+ * @param that Matrix which will be added
+ * @return Result of addition
+ */
+ hankel_matrix<NumericT, AlignmentV>& operator +=(hankel_matrix<NumericT, AlignmentV>& that)
+ {
+ elements_ += that.elements();
+ return *this;
+ }
+
+private:
+ hankel_matrix(hankel_matrix const &) {}
+ hankel_matrix & operator=(hankel_matrix const & t);
+
+ toeplitz_matrix<NumericT, AlignmentV> elements_;
+};
+
+/** @brief Copies a Hankel matrix from the std::vector to the OpenCL device (either GPU or multi-core CPU)
+ *
+ *
+ * @param cpu_vec A std::vector on the host.
+ * @param gpu_mat A hankel_matrix from ViennaCL
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void copy(std::vector<NumericT> const & cpu_vec, hankel_matrix<NumericT, AlignmentV> & gpu_mat)
+{
+ assert((gpu_mat.size1() * 2 - 1) == cpu_vec.size() && bool("Size mismatch"));
+
+ copy(cpu_vec, gpu_mat.elements());
+}
+
+/** @brief Copies a Hankel matrix from the OpenCL device (either GPU or multi-core CPU) to the std::vector
+ *
+ *
+ * @param gpu_mat A hankel_matrix from ViennaCL
+ * @param cpu_vec A std::vector on the host.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void copy(hankel_matrix<NumericT, AlignmentV> const & gpu_mat, std::vector<NumericT> & cpu_vec)
+{
+ assert((gpu_mat.size1() * 2 - 1) == cpu_vec.size() && bool("Size mismatch"));
+
+ copy(gpu_mat.elements(), cpu_vec);
+}
+
+/** @brief Copies a Hankel matrix from the OpenCL device (either GPU or multi-core CPU) to a matrix-like object on the host
+ *
+ *
+ * @param han_src A hankel_matrix from ViennaCL
+ * @param com_dst A matrix-like object
+ */
+template<typename NumericT, unsigned int AlignmentV, typename MatrixT>
+void copy(hankel_matrix<NumericT, AlignmentV> const & han_src, MatrixT& com_dst)
+{
+ assert( (viennacl::traits::size1(com_dst) == han_src.size1()) && bool("Size mismatch") );
+ assert( (viennacl::traits::size2(com_dst) == han_src.size2()) && bool("Size mismatch") );
+
+ vcl_size_t size = han_src.size1();
+ std::vector<NumericT> tmp(size * 2 - 1);
+ copy(han_src, tmp);
+
+ for (vcl_size_t i = 0; i < size; i++)
+ for (vcl_size_t j = 0; j < size; j++)
+ com_dst(i, j) = tmp[i + j];
+}
+
+/** @brief Copies a matrix-like object on the host to a Hankel matrix on the OpenCL device (either GPU or multi-core CPU)
+ *
+ *
+ * @param com_src A matrix-like object on the host
+ * @param han_dst A hankel_matrix from ViennaCL
+ */
+template<typename NumericT, unsigned int AlignmentV, typename MatrixT>
+void copy(MatrixT const & com_src, hankel_matrix<NumericT, AlignmentV>& han_dst)
+{
+ assert( (han_dst.size1() == 0 || viennacl::traits::size1(com_src) == han_dst.size1()) && bool("Size mismatch") );
+ assert( (han_dst.size2() == 0 || viennacl::traits::size2(com_src) == han_dst.size2()) && bool("Size mismatch") );
+ assert( viennacl::traits::size2(com_src) == viennacl::traits::size1(com_src) && bool("Logic error: non-square Hankel matrix!") );
+
+ vcl_size_t size = viennacl::traits::size1(com_src);
+
+ std::vector<NumericT> tmp(2*size - 1);
+
+ for (vcl_size_t i = 0; i < size; i++)
+ tmp[i] = com_src(0, i);
+
+ for (vcl_size_t i = 1; i < size; i++)
+ tmp[size + i - 1] = com_src(size - 1, i);
+
+ viennacl::copy(tmp, han_dst);
+}
+
+/*template<typename NumericT, unsigned int AlignmentV, unsigned int VECTOR_AlignmentV>
+ void prod_impl(hankel_matrix<NumericT, AlignmentV>& mat,
+ vector<NumericT, VECTOR_AlignmentV>& vec,
+ vector<NumericT, VECTOR_AlignmentV>& result)
+ {
+ prod_impl(mat.elements(), vec, result);
+ fft::reverse(result);
+ }*/
+
+template<class NumericT, unsigned int AlignmentV>
+std::ostream & operator<<(std::ostream & s, hankel_matrix<NumericT, AlignmentV>& gpu_matrix)
+{
+ vcl_size_t size = gpu_matrix.size1();
+ std::vector<NumericT> tmp(2*size - 1);
+ copy(gpu_matrix, tmp);
+ s << "[" << size << "," << size << "](";
+
+ for (vcl_size_t i = 0; i < size; i++)
+ {
+ s << "(";
+ for (vcl_size_t j = 0; j < size; j++)
+ {
+ s << tmp[i + j];
+ //s << (int)i - (int)j;
+ if (j < (size - 1)) s << ",";
+ }
+ s << ")";
+ }
+ s << ")";
+ return s;
+}
+
+//
+// Specify available operations:
+//
+
+/** \cond */
+
+namespace linalg
+{
+namespace detail
+{
+ // x = A * y
+ template<typename T, unsigned int A>
+ struct op_executor<vector_base<T>, op_assign, vector_expression<const hankel_matrix<T, A>, const vector_base<T>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const hankel_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+ {
+ // check for the special case x = A * x
+ if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+ {
+ viennacl::vector<T> temp(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+ lhs = temp;
+ }
+ else
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs);
+ }
+ };
+
+ template<typename T, unsigned int A>
+ struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const hankel_matrix<T, A>, const vector_base<T>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const hankel_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+ {
+ viennacl::vector<T> temp(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+ lhs += temp;
+ }
+ };
+
+ template<typename T, unsigned int A>
+ struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const hankel_matrix<T, A>, const vector_base<T>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const hankel_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+ {
+ viennacl::vector<T> temp(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+ lhs -= temp;
+ }
+ };
+
+
+ // x = A * vec_op
+ template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+ struct op_executor<vector_base<T>, op_assign, vector_expression<const hankel_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const hankel_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+ {
+ viennacl::vector<T> temp(rhs.rhs());
+ viennacl::linalg::prod_impl(rhs.lhs(), temp, lhs);
+ }
+ };
+
+ // x = A * vec_op
+ template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+ struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const hankel_matrix<T, A>, vector_expression<const LHS, const RHS, OP>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const hankel_matrix<T, A>, vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+ {
+ viennacl::vector<T> temp(rhs.rhs());
+ viennacl::vector<T> temp_result(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+ lhs += temp_result;
+ }
+ };
+
+ // x = A * vec_op
+ template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+ struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const hankel_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const hankel_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+ {
+ viennacl::vector<T> temp(rhs.rhs());
+ viennacl::vector<T> temp_result(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+ lhs -= temp_result;
+ }
+ };
+
+
+
+} // namespace detail
+} // namespace linalg
+
+/** \endcond */
+}
+#endif // VIENNACL_HANKEL_MATRIX_HPP
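
For reference, a minimal usage sketch of the Hankel matrix interface above, not part of the committed header. It fills a 4x4 hankel_matrix from its 2*n-1 defining entries (the first row followed by the remainder of the last row, so that H(i,j) corresponds to entries[i+j], matching the dense-matrix copy above) and multiplies it with a vector through prod_impl(), the low-level routine used by the op_executor specializations; values are illustrative.

  #include <vector>
  #include "viennacl/hankel_matrix.hpp"
  #include "viennacl/vector.hpp"

  void hankel_sketch()
  {
    viennacl::hankel_matrix<float> H(4, 4);

    std::vector<float> entries(2 * 4 - 1);            // the 7 values defining the 4x4 Hankel matrix
    for (std::size_t k = 0; k < entries.size(); ++k)
      entries[k] = static_cast<float>(k + 1);
    viennacl::copy(entries, H);                       // std::vector -> hankel_matrix copy() from above

    std::vector<float> x_host(4, 1.0f);
    viennacl::vector<float> x(4), y(4);
    viennacl::copy(x_host, x);

    viennacl::linalg::prod_impl(H, x, y);             // y = H * x, declared in hankel_matrix_operations.hpp
  }
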
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/hyb_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/hyb_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/hyb_matrix.hpp
new file mode 100644
index 0000000..e93ede5
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/hyb_matrix.hpp
@@ -0,0 +1,442 @@
+#ifndef VIENNACL_HYB_MATRIX_HPP_
+#define VIENNACL_HYB_MATRIX_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/hyb_matrix.hpp
+ @brief Implementation of the hyb_matrix class
+
+ Contributed by Volodymyr Kysenko.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/vector.hpp"
+
+#include "viennacl/tools/tools.hpp"
+
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+
+namespace viennacl
+{
+/** @brief Sparse matrix class using a hybrid format composed of the ELL and CSR format for storing the nonzeros. */
+template<typename NumericT, unsigned int AlignmentV /* see forwards.h for default argument */>
+class hyb_matrix
+{
+public:
+ typedef viennacl::backend::mem_handle handle_type;
+ typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<NumericT>::ResultType> value_type;
+
+ hyb_matrix() : csr_threshold_(NumericT(0.8)), rows_(0), cols_(0) {}
+
+ hyb_matrix(viennacl::context ctx) : csr_threshold_(NumericT(0.8)), rows_(0), cols_(0)
+ {
+ ell_coords_.switch_active_handle_id(ctx.memory_type());
+ ell_elements_.switch_active_handle_id(ctx.memory_type());
+
+ csr_rows_.switch_active_handle_id(ctx.memory_type());
+ csr_cols_.switch_active_handle_id(ctx.memory_type());
+ csr_elements_.switch_active_handle_id(ctx.memory_type());
+
+#ifdef VIENNACL_WITH_OPENCL
+ if (ctx.memory_type() == OPENCL_MEMORY)
+ {
+ ell_coords_.opencl_handle().context(ctx.opencl_context());
+ ell_elements_.opencl_handle().context(ctx.opencl_context());
+
+ csr_rows_.opencl_handle().context(ctx.opencl_context());
+ csr_cols_.opencl_handle().context(ctx.opencl_context());
+ csr_elements_.opencl_handle().context(ctx.opencl_context());
+ }
+#endif
+ }
+
+ /** @brief Resets all entries in the matrix back to zero without changing the matrix size. Resets the sparsity pattern. */
+ void clear()
+ {
+ // ELL part:
+ ellnnz_ = 0;
+
+ viennacl::backend::typesafe_host_array<unsigned int> host_coords_buffer(ell_coords_, internal_size1());
+ std::vector<NumericT> host_elements(internal_size1());
+
+ viennacl::backend::memory_create(ell_coords_, host_coords_buffer.element_size() * internal_size1(), viennacl::traits::context(ell_coords_), host_coords_buffer.get());
+ viennacl::backend::memory_create(ell_elements_, sizeof(NumericT) * internal_size1(), viennacl::traits::context(ell_elements_), &(host_elements[0]));
+
+ // CSR part:
+ csrnnz_ = 0;
+
+ viennacl::backend::typesafe_host_array<unsigned int> host_row_buffer(csr_rows_, rows_ + 1);
+ viennacl::backend::typesafe_host_array<unsigned int> host_col_buffer(csr_cols_, 1);
+ host_elements.resize(1);
+
+ viennacl::backend::memory_create(csr_rows_, host_row_buffer.element_size() * (rows_ + 1), viennacl::traits::context(csr_rows_), host_row_buffer.get());
+ viennacl::backend::memory_create(csr_cols_, host_col_buffer.element_size() * 1, viennacl::traits::context(csr_cols_), host_col_buffer.get());
+ viennacl::backend::memory_create(csr_elements_, sizeof(NumericT) * 1, viennacl::traits::context(csr_elements_), &(host_elements[0]));
+ }
+
+ NumericT csr_threshold() const { return csr_threshold_; }
+ void csr_threshold(NumericT thr) { csr_threshold_ = thr; }
+
+ vcl_size_t internal_size1() const { return viennacl::tools::align_to_multiple<vcl_size_t>(rows_, AlignmentV); }
+ vcl_size_t internal_size2() const { return viennacl::tools::align_to_multiple<vcl_size_t>(cols_, AlignmentV); }
+
+ vcl_size_t size1() const { return rows_; }
+ vcl_size_t size2() const { return cols_; }
+
+ vcl_size_t internal_ellnnz() const {return viennacl::tools::align_to_multiple<vcl_size_t>(ellnnz_, AlignmentV); }
+ vcl_size_t ell_nnz() const { return ellnnz_; }
+ vcl_size_t csr_nnz() const { return csrnnz_; }
+
+ const handle_type & handle() const { return ell_elements_; }
+ const handle_type & handle2() const { return ell_coords_; }
+ const handle_type & handle3() const { return csr_rows_; }
+ const handle_type & handle4() const { return csr_cols_; }
+ const handle_type & handle5() const { return csr_elements_; }
+
+public:
+#if defined(_MSC_VER) && _MSC_VER < 1500 //Visual Studio 2005 needs special treatment
+ template<typename CPUMatrixT>
+ friend void copy(const CPUMatrixT & cpu_matrix, hyb_matrix & gpu_matrix );
+#else
+ template<typename CPUMatrixT, typename T, unsigned int ALIGN>
+ friend void copy(const CPUMatrixT & cpu_matrix, hyb_matrix<T, ALIGN> & gpu_matrix );
+#endif
+
+private:
+ NumericT csr_threshold_;
+ vcl_size_t rows_;
+ vcl_size_t cols_;
+ vcl_size_t ellnnz_;
+ vcl_size_t csrnnz_;
+
+ handle_type ell_coords_; // ell coords
+ handle_type ell_elements_; // ell elements
+
+ handle_type csr_rows_;
+ handle_type csr_cols_;
+ handle_type csr_elements_;
+};
+
+template<typename CPUMatrixT, typename NumericT, unsigned int AlignmentV>
+void copy(const CPUMatrixT& cpu_matrix, hyb_matrix<NumericT, AlignmentV>& gpu_matrix )
+{
+ assert( (gpu_matrix.size1() == 0 || viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+ assert( (gpu_matrix.size2() == 0 || viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+ if (cpu_matrix.size1() > 0 && cpu_matrix.size2() > 0)
+ {
+ //determine max capacity for row
+ vcl_size_t max_entries_per_row = 0;
+ std::vector<vcl_size_t> hist_entries(cpu_matrix.size2() + 1, 0);
+
+ for (typename CPUMatrixT::const_iterator1 row_it = cpu_matrix.begin1(); row_it != cpu_matrix.end1(); ++row_it)
+ {
+ vcl_size_t num_entries = 0;
+ for (typename CPUMatrixT::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
+ {
+ ++num_entries;
+ }
+
+ hist_entries[num_entries] += 1;
+ max_entries_per_row = std::max(max_entries_per_row, num_entries);
+ }
+
+ vcl_size_t sum = 0;
+ for (vcl_size_t ind = 0; ind <= max_entries_per_row; ind++)
+ {
+ sum += hist_entries[ind];
+
+ if (NumericT(sum) >= NumericT(gpu_matrix.csr_threshold()) * NumericT(cpu_matrix.size1()))
+ {
+ max_entries_per_row = ind;
+ break;
+ }
+ }
+
+ //setup GPU matrix
+ gpu_matrix.ellnnz_ = max_entries_per_row;
+ gpu_matrix.rows_ = cpu_matrix.size1();
+ gpu_matrix.cols_ = cpu_matrix.size2();
+
+ vcl_size_t nnz = gpu_matrix.internal_size1() * gpu_matrix.internal_ellnnz();
+
+ viennacl::backend::typesafe_host_array<unsigned int> ell_coords(gpu_matrix.ell_coords_, nnz);
+ viennacl::backend::typesafe_host_array<unsigned int> csr_rows(gpu_matrix.csr_rows_, cpu_matrix.size1() + 1);
+ std::vector<unsigned int> csr_cols;
+
+ std::vector<NumericT> ell_elements(nnz);
+ std::vector<NumericT> csr_elements;
+
+ vcl_size_t csr_index = 0;
+
+ for (typename CPUMatrixT::const_iterator1 row_it = cpu_matrix.begin1(); row_it != cpu_matrix.end1(); ++row_it)
+ {
+ vcl_size_t data_index = 0;
+
+ csr_rows.set(row_it.index1(), csr_index);
+
+ for (typename CPUMatrixT::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
+ {
+ if (data_index < max_entries_per_row)
+ {
+ ell_coords.set(gpu_matrix.internal_size1() * data_index + col_it.index1(), col_it.index2());
+ ell_elements[gpu_matrix.internal_size1() * data_index + col_it.index1()] = *col_it;
+ }
+ else
+ {
+ csr_cols.push_back(static_cast<unsigned int>(col_it.index2()));
+ csr_elements.push_back(*col_it);
+
+ csr_index++;
+ }
+
+ data_index++;
+ }
+
+ }
+
+ if (csr_cols.empty())
+ {
+ csr_cols.push_back(0);
+ csr_elements.push_back(0);
+ }
+
+ csr_rows.set(csr_rows.size() - 1, csr_index);
+
+ gpu_matrix.csrnnz_ = csr_cols.size();
+
+ viennacl::backend::typesafe_host_array<unsigned int> csr_cols_for_gpu(gpu_matrix.csr_cols_, csr_cols.size());
+ for (vcl_size_t i=0; i<csr_cols.size(); ++i)
+ csr_cols_for_gpu.set(i, csr_cols[i]);
+
+ viennacl::backend::memory_create(gpu_matrix.ell_coords_, ell_coords.raw_size(), traits::context(gpu_matrix.ell_coords_), ell_coords.get());
+ viennacl::backend::memory_create(gpu_matrix.ell_elements_, sizeof(NumericT) * ell_elements.size(), traits::context(gpu_matrix.ell_elements_), &(ell_elements[0]));
+
+ viennacl::backend::memory_create(gpu_matrix.csr_rows_, csr_rows.raw_size(), traits::context(gpu_matrix.csr_rows_), csr_rows.get());
+ viennacl::backend::memory_create(gpu_matrix.csr_cols_, csr_cols_for_gpu.raw_size(), traits::context(gpu_matrix.csr_cols_), csr_cols_for_gpu.get());
+ viennacl::backend::memory_create(gpu_matrix.csr_elements_, sizeof(NumericT) * csr_elements.size(), traits::context(gpu_matrix.csr_elements_), &(csr_elements[0]));
+ }
+}
+
+
+/** @brief Copies a sparse matrix from the host to the compute device. The host type is the std::vector< std::map < > > format.
+ *
+ * @param cpu_matrix A sparse matrix on the host composed of an STL vector and an STL map.
+ * @param gpu_matrix The sparse hyb_matrix from ViennaCL
+ */
+template<typename IndexT, typename NumericT, unsigned int AlignmentV>
+void copy(std::vector< std::map<IndexT, NumericT> > const & cpu_matrix,
+ hyb_matrix<NumericT, AlignmentV> & gpu_matrix)
+{
+ vcl_size_t max_col = 0;
+ for (vcl_size_t i=0; i<cpu_matrix.size(); ++i)
+ {
+ if (cpu_matrix[i].size() > 0)
+ max_col = std::max<vcl_size_t>(max_col, (cpu_matrix[i].rbegin())->first);
+ }
+
+ viennacl::copy(tools::const_sparse_matrix_adapter<NumericT, IndexT>(cpu_matrix, cpu_matrix.size(), max_col + 1), gpu_matrix);
+}
+
+
+
+
+template<typename CPUMatrixT, typename NumericT, unsigned int AlignmentV>
+void copy(const hyb_matrix<NumericT, AlignmentV>& gpu_matrix, CPUMatrixT& cpu_matrix)
+{
+ assert( (viennacl::traits::size1(cpu_matrix) == gpu_matrix.size1()) && bool("Size mismatch") );
+ assert( (viennacl::traits::size2(cpu_matrix) == gpu_matrix.size2()) && bool("Size mismatch") );
+
+ if (gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0)
+ {
+ std::vector<NumericT> ell_elements(gpu_matrix.internal_size1() * gpu_matrix.internal_ellnnz());
+ viennacl::backend::typesafe_host_array<unsigned int> ell_coords(gpu_matrix.handle2(), gpu_matrix.internal_size1() * gpu_matrix.internal_ellnnz());
+
+ std::vector<NumericT> csr_elements(gpu_matrix.csr_nnz());
+ viennacl::backend::typesafe_host_array<unsigned int> csr_rows(gpu_matrix.handle3(), gpu_matrix.size1() + 1);
+ viennacl::backend::typesafe_host_array<unsigned int> csr_cols(gpu_matrix.handle4(), gpu_matrix.csr_nnz());
+
+ viennacl::backend::memory_read(gpu_matrix.handle(), 0, sizeof(NumericT) * ell_elements.size(), &(ell_elements[0]));
+ viennacl::backend::memory_read(gpu_matrix.handle2(), 0, ell_coords.raw_size(), ell_coords.get());
+ viennacl::backend::memory_read(gpu_matrix.handle3(), 0, csr_rows.raw_size(), csr_rows.get());
+ viennacl::backend::memory_read(gpu_matrix.handle4(), 0, csr_cols.raw_size(), csr_cols.get());
+ viennacl::backend::memory_read(gpu_matrix.handle5(), 0, sizeof(NumericT) * csr_elements.size(), &(csr_elements[0]));
+
+
+ for (vcl_size_t row = 0; row < gpu_matrix.size1(); row++)
+ {
+ for (vcl_size_t ind = 0; ind < gpu_matrix.internal_ellnnz(); ind++)
+ {
+ vcl_size_t offset = gpu_matrix.internal_size1() * ind + row;
+
+ NumericT val = ell_elements[offset];
+ if (val <= 0 && val >= 0) // val == 0 without compiler warnings
+ continue;
+
+ if (ell_coords[offset] >= gpu_matrix.size2())
+ {
+ std::cerr << "ViennaCL encountered invalid data " << offset << " " << ind << " " << row << " " << ell_coords[offset] << " " << gpu_matrix.size2() << std::endl;
+ return;
+ }
+
+ cpu_matrix(row, ell_coords[offset]) = val;
+ }
+
+ for (vcl_size_t ind = csr_rows[row]; ind < csr_rows[row+1]; ind++)
+ {
+ NumericT val = csr_elements[ind];
+ if (val <= 0 && val >= 0) // val == 0 without compiler warnings
+ continue;
+
+ if (csr_cols[ind] >= gpu_matrix.size2())
+ {
+ std::cerr << "ViennaCL encountered invalid data " << std::endl;
+ return;
+ }
+
+ cpu_matrix(row, csr_cols[ind]) = val;
+ }
+ }
+ }
+}
+
+/** @brief Copies a sparse matrix from the compute device to the host. The host type is the std::vector< std::map < > > format.
+ *
+ * @param gpu_matrix The sparse hyb_matrix from ViennaCL
+ * @param cpu_matrix A sparse matrix on the host composed of an STL vector and an STL map.
+ */
+template<typename NumericT, unsigned int AlignmentV, typename IndexT>
+void copy(const hyb_matrix<NumericT, AlignmentV> & gpu_matrix,
+ std::vector< std::map<IndexT, NumericT> > & cpu_matrix)
+{
+ if (cpu_matrix.size() == 0)
+ cpu_matrix.resize(gpu_matrix.size1());
+
+ assert(cpu_matrix.size() == gpu_matrix.size1() && bool("Matrix dimension mismatch!"));
+
+ tools::sparse_matrix_adapter<NumericT, IndexT> temp(cpu_matrix, cpu_matrix.size(), gpu_matrix.size2());
+ viennacl::copy(gpu_matrix, temp);
+}
+
+//
+// Specify available operations:
+//
+
+/** \cond */
+
+namespace linalg
+{
+namespace detail
+{
+ // x = A * y
+ template<typename T, unsigned int A>
+ struct op_executor<vector_base<T>, op_assign, vector_expression<const hyb_matrix<T, A>, const vector_base<T>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const hyb_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+ {
+ // check for the special case x = A * x
+ if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+ {
+ viennacl::vector<T> temp(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+ lhs = temp;
+ }
+ else
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), lhs, T(0));
+ }
+ };
+
+ template<typename T, unsigned int A>
+ struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const hyb_matrix<T, A>, const vector_base<T>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const hyb_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+ {
+ // check for the special case x += A * x
+ if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+ {
+ viennacl::vector<T> temp(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+ lhs += temp;
+ }
+ else
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), lhs, T(1));
+ }
+ };
+
+ template<typename T, unsigned int A>
+ struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const hyb_matrix<T, A>, const vector_base<T>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const hyb_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+ {
+ // check for the special case x -= A * x
+ if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+ {
+ viennacl::vector<T> temp(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(1), temp, T(0));
+ lhs -= temp;
+ }
+ else
+ viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), T(-1), lhs, T(1));
+ }
+ };
+
+
+ // x = A * vec_op
+ template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+ struct op_executor<vector_base<T>, op_assign, vector_expression<const hyb_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const hyb_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+ {
+ viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+ viennacl::linalg::prod_impl(rhs.lhs(), temp, lhs);
+ }
+ };
+
+ // x += A * vec_op
+ template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+ struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const hyb_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const hyb_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+ {
+ viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+ viennacl::vector<T> temp_result(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+ lhs += temp_result;
+ }
+ };
+
+ // x -= A * vec_op
+ template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+ struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const hyb_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+ {
+ static void apply(vector_base<T> & lhs, vector_expression<const hyb_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+ {
+ viennacl::vector<T> temp(rhs.rhs(), viennacl::traits::context(rhs));
+ viennacl::vector<T> temp_result(lhs);
+ viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+ lhs -= temp_result;
+ }
+ };
+
+} // namespace detail
+} // namespace linalg
+
+/** \endcond */
+}
+
+#endif
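 
For reference, a minimal host-side sketch of the copy/product interface defined above (sizes and values are placeholders; viennacl::linalg::prod is assumed to be available via viennacl/linalg/prod.hpp):

    std::vector< std::map<unsigned int, double> > cpu_A(1000);
    cpu_A[0][3] = 2.5;                           // ... a few nonzero entries ...
    viennacl::hyb_matrix<double> A;
    A.csr_threshold(0.8);                        // ELL width chosen so ~80% of rows fit; the rest spills into CSR
    viennacl::copy(cpu_A, A);                    // host -> device (overload above)
    viennacl::vector<double> x(1000), y(1000);
    y = viennacl::linalg::prod(A, x);            // dispatched to the op_assign executor above
    viennacl::copy(A, cpu_A);                    // device -> host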
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/io/matrix_market.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/io/matrix_market.hpp b/native-viennaCL/src/main/cpp/viennacl/io/matrix_market.hpp
new file mode 100644
index 0000000..e8444ee
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/io/matrix_market.hpp
@@ -0,0 +1,440 @@
+#ifndef VIENNACL_IO_MATRIX_MARKET_HPP
+#define VIENNACL_IO_MATRIX_MARKET_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file matrix_market.hpp
+ @brief A reader and writer for the matrix market format are implemented here
+*/
+
+#include <algorithm>
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <vector>
+#include <map>
+#include <cctype>
+#include "viennacl/tools/adapter.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/fill.hpp"
+
+namespace viennacl
+{
+namespace io
+{
+//helper
+namespace detail
+{
+ inline void trim(char * buffer, long max_size)
+ {
+ //trim at beginning of string
+ long start = 0;
+ for (long i=0; i<max_size; ++i)
+ {
+ if (buffer[i] == ' ')
+ ++start;
+ else
+ break;
+ }
+
+ //trim at end of string
+ long stop = start;
+ for (long i=stop; i<max_size; ++i)
+ {
+ if (buffer[i] == 0) //end of string
+ break;
+
+ if (buffer[i] != ' ')
+ stop = i;
+ }
+
+ for (long i=0; i<=stop - start; ++i)
+ {
+ buffer[i] = buffer[start + i];
+ }
+
+ if (buffer[0] != ' ')
+ buffer[stop - start + 1] = 0; //terminate string
+ else
+ buffer[0] = 0;
+ }
+
+ inline std::string tolower(std::string & s)
+ {
+ std::transform(s.begin(), s.end(), s.begin(), static_cast < int(*)(int) > (std::tolower));
+ return s;
+ }
+
+
+
+} //namespace detail
+
+///////// reader ////////////
+
+/** @brief Reads a sparse or dense matrix from a file (MatrixMarket format)
+*
+* Note: If the matrix in the MatrixMarket file is complex, only the real-valued part is loaded!
+*
+* @param mat The matrix that is to be read
+* @param file Filename from which the matrix should be read
+* @param index_base The index base, typically 1
+* @tparam MatrixT A generic matrix type. Type requirements: size1() returns the number of rows, size2() returns the number of columns, operator() writes array entries, resize() allows resizing the matrix.
+* @return Returns nonzero if file is read correctly
+*/
+template<typename MatrixT>
+long read_matrix_market_file_impl(MatrixT & mat,
+ const char * file,
+ long index_base)
+{
+ typedef typename viennacl::result_of::cpu_value_type<typename viennacl::result_of::value_type<MatrixT>::type>::type ScalarT;
+
+ //std::cout << "Reading matrix market file" << std::endl;
+ char buffer[1025];
+ std::ifstream reader(file);
+ std::string token;
+ long linenum = 0;
+ bool symmetric = false;
+ bool dense_format = false;
+ bool is_header = true;
+ bool pattern_matrix = false;
+ //bool is_complex = false;
+ long cur_row = 0;
+ long cur_col = 0;
+ long valid_entries = 0;
+ long nnz = 0;
+
+
+ if (!reader){
+ std::cerr << "ViennaCL: Matrix Market Reader: Cannot open file " << file << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ while (reader.good())
+ {
+ // get a non-empty line
+ do
+ {
+ reader.getline(buffer, 1024);
+ ++linenum;
+ detail::trim(buffer, 1024);
+ }
+ while (reader.good() && buffer[0] == 0);
+
+ if (buffer[0] == '%')
+ {
+ if (buffer[1] == '%')
+ {
+ //parse header:
+ std::stringstream line(std::string(buffer + 2));
+ line >> token;
+ if (detail::tolower(token) != "matrixmarket")
+ {
+ std::cerr << "Error in file " << file << " at line " << linenum << " in file " << file << ": Expected 'MatrixMarket', got '" << token << "'" << std::endl;
+ return 0;
+ }
+
+ line >> token;
+ if (detail::tolower(token) != "matrix")
+ {
+ std::cerr << "Error in file " << file << " at line " << linenum << " in file " << file << ": Expected 'matrix', got '" << token << "'" << std::endl;
+ return 0;
+ }
+
+ line >> token;
+ if (detail::tolower(token) != "coordinate")
+ {
+ if (detail::tolower(token) == "array")
+ {
+ dense_format = true;
+ std::cerr << "Error in file " << file << " at line " << linenum << " in file " << file << ": 'array' type is not supported yet!" << std::endl;
+ return 0;
+ }
+ else
+ {
+ std::cerr << "Error in file " << file << " at line " << linenum << " in file " << file << ": Expected 'array' or 'coordinate', got '" << token << "'" << std::endl;
+ return 0;
+ }
+ }
+
+ line >> token;
+ if (detail::tolower(token) == "pattern")
+ {
+ pattern_matrix = true;
+ }
+ else if (detail::tolower(token) == "complex")
+ {
+ //is_complex = true;
+ }
+ else if (detail::tolower(token) != "real")
+ {
+ std::cerr << "Error in file " << file << ": The MatrixMarket reader provided with ViennaCL supports only real valued floating point arithmetic or pattern type matrices." << std::endl;
+ return 0;
+ }
+
+ line >> token;
+ if (detail::tolower(token) == "general"){ }
+ else if (detail::tolower(token) == "symmetric"){ symmetric = true; }
+ else
+ {
+ std::cerr << "Error in file " << file << ": The MatrixMarket reader provided with ViennaCL supports only general or symmetric matrices." << std::endl;
+ return 0;
+ }
+
+ }
+ }
+ else
+ {
+ std::stringstream line(std::stringstream::in | std::stringstream::out);
+ line << std::string(buffer);
+
+ if (is_header)
+ {
+ //read header line
+ vcl_size_t rows;
+ vcl_size_t cols;
+
+ if (line.good())
+ line >> rows;
+ else
+ {
+ std::cerr << "Error in file " << file << ": Could not get matrix dimensions (rows) in line " << linenum << std::endl;
+ return 0;
+ }
+
+ if (line.good())
+ line >> cols;
+ else
+ {
+ std::cerr << "Error in file " << file << ": Could not get matrix dimensions (columns) in line " << linenum << std::endl;
+ return 0;
+ }
+ if (!dense_format)
+ {
+ if (line.good())
+ line >> nnz;
+ else
+ {
+ std::cerr << "Error in file " << file << ": Could not get matrix dimensions (columns) in line " << linenum << std::endl;
+ return 0;
+ }
+ }
+
+ if (rows > 0 && cols > 0)
+ viennacl::traits::resize(mat, rows, cols);
+
+ is_header = false;
+ }
+ else
+ {
+ //read data
+ if (dense_format)
+ {
+ ScalarT value;
+ line >> value;
+ viennacl::traits::fill(mat, static_cast<vcl_size_t>(cur_row), static_cast<vcl_size_t>(cur_col), value);
+
+ if (++cur_row == static_cast<long>(viennacl::traits::size1(mat)))
+ {
+ //next column
+ ++cur_col;
+ cur_row = 0;
+ }
+ }
+ else //sparse format
+ {
+ long row;
+ long col;
+ ScalarT value = ScalarT(1);
+
+ //parse data:
+ if (line.good())
+ line >> row;
+ else
+ {
+ std::cerr << "Error in file " << file << ": Parse error for matrix row entry in line " << linenum << std::endl;
+ return 0;
+ }
+
+ if (line.good())
+ line >> col;
+ else
+ {
+ std::cerr << "Error in file " << file << ": Parse error for matrix col entry in line " << linenum << std::endl;
+ return 0;
+ }
+
+ //take index_base into account:
+ row -= index_base;
+ col -= index_base;
+
+ if (!pattern_matrix) // value for pattern matrix is implicitly 1, so we only need to read data for 'normal' matrices
+ {
+ if (line.good())
+ {
+ line >> value;
+ }
+ else
+ {
+ std::cerr << "Error in file " << file << ": Parse error for matrix entry in line " << linenum << std::endl;
+ return 0;
+ }
+ }
+
+ if (row >= static_cast<long>(viennacl::traits::size1(mat)) || row < 0)
+ {
+ std::cerr << "Error in file " << file << " at line " << linenum << ": Row index out of bounds: " << row << " (matrix dim: " << viennacl::traits::size1(mat) << " x " << viennacl::traits::size2(mat) << ")" << std::endl;
+ return 0;
+ }
+
+ if (col >= static_cast<long>(viennacl::traits::size2(mat)) || col < 0)
+ {
+ std::cerr << "Error in file " << file << " at line " << linenum << ": Column index out of bounds: " << col << " (matrix dim: " << viennacl::traits::size1(mat) << " x " << viennacl::traits::size2(mat) << ")" << std::endl;
+ return 0;
+ }
+
+ viennacl::traits::fill(mat, static_cast<vcl_size_t>(row), static_cast<vcl_size_t>(col), value); //basically equivalent to mat(row, col) = value;
+ if (symmetric)
+ viennacl::traits::fill(mat, static_cast<vcl_size_t>(col), static_cast<vcl_size_t>(row), value); //basically equivalent to mat(col, row) = value;
+
+ if (++valid_entries == nnz)
+ break;
+
+ } //else dense_format
+ }
+ }
+ }
+
+ //std::cout << linenum << " lines read." << std::endl;
+ reader.close();
+ return linenum;
+}
+
+
+/** @brief Reads a sparse matrix from a file (MatrixMarket format)
+*
+* @param mat The matrix that is to be read (ublas-types and std::vector< std::map <unsigned int, ScalarT> > are supported)
+* @param file The filename
+* @param index_base The index base, typically 1
+* @tparam MatrixT A generic matrix type. Type requirements: size1() returns the number of rows, size2() returns the number of columns, operator() writes array entries, resize() allows resizing the matrix.
+* @return Returns nonzero if file is read correctly
+*/
+template<typename MatrixT>
+long read_matrix_market_file(MatrixT & mat,
+ const char * file,
+ long index_base = 1)
+{
+ return read_matrix_market_file_impl(mat, file, index_base);
+}
+
+template<typename MatrixT>
+long read_matrix_market_file(MatrixT & mat,
+ const std::string & file,
+ long index_base = 1)
+{
+ return read_matrix_market_file_impl(mat, file.c_str(), index_base);
+}
+
+template<typename ScalarT>
+long read_matrix_market_file(std::vector< std::map<unsigned int, ScalarT> > & mat,
+ const char * file,
+ long index_base = 1)
+{
+ viennacl::tools::sparse_matrix_adapter<ScalarT> adapted_matrix(mat);
+ return read_matrix_market_file_impl(adapted_matrix, file, index_base);
+}
+
+template<typename ScalarT>
+long read_matrix_market_file(std::vector< std::map<unsigned int, ScalarT> > & mat,
+ const std::string & file,
+ long index_base = 1)
+{
+ viennacl::tools::sparse_matrix_adapter<ScalarT> adapted_matrix(mat);
+ return read_matrix_market_file_impl(adapted_matrix, file.c_str(), index_base);
+}
+
+
+////////// writer /////////////
+template<typename MatrixT>
+void write_matrix_market_file_impl(MatrixT const & mat, const char * file, long index_base)
+{
+ std::ofstream writer(file);
+
+ long num_entries = 0;
+ for (typename MatrixT::const_iterator1 row_it = mat.begin1();
+ row_it != mat.end1();
+ ++row_it)
+ for (typename MatrixT::const_iterator2 col_it = row_it.begin();
+ col_it != row_it.end();
+ ++col_it)
+ ++num_entries;
+
+ writer << "%%MatrixMarket matrix coordinate real general" << std::endl;
+ writer << mat.size1() << " " << mat.size2() << " " << num_entries << std::endl;
+
+ for (typename MatrixT::const_iterator1 row_it = mat.begin1();
+ row_it != mat.end1();
+ ++row_it)
+ for (typename MatrixT::const_iterator2 col_it = row_it.begin();
+ col_it != row_it.end();
+ ++col_it)
+ writer << col_it.index1() + index_base << " " << col_it.index2() + index_base << " " << *col_it << std::endl;
+
+ writer.close();
+}
+
+template<typename ScalarT>
+void write_matrix_market_file(std::vector< std::map<unsigned int, ScalarT> > const & mat,
+ const char * file,
+ long index_base = 1)
+{
+ viennacl::tools::const_sparse_matrix_adapter<ScalarT> adapted_matrix(mat);
+ return write_matrix_market_file_impl(adapted_matrix, file, index_base);
+}
+
+template<typename ScalarT>
+void write_matrix_market_file(std::vector< std::map<unsigned int, ScalarT> > const & mat,
+ const std::string & file,
+ long index_base = 1)
+{
+ viennacl::tools::const_sparse_matrix_adapter<ScalarT> adapted_matrix(mat);
+ return write_matrix_market_file_impl(adapted_matrix, file.c_str(), index_base);
+}
+
+/** @brief Writes a sparse matrix to a file (MatrixMarket format)
+*
+* @param mat The matrix that is to be written (ublas-types and std::vector< std::map <unsigned int, ScalarT> > are supported)
+* @param file The filename
+* @param index_base The index base, typically 1
+* @tparam MatrixT A generic matrix type. Type requirements: size1() returns the number of rows, size2() returns the number of columns, const_iterator1 and const_iterator2 provide read access to the entries.
+*/
+template<typename MatrixT>
+void write_matrix_market_file(MatrixT const & mat,
+ const std::string & file,
+ long index_base = 1)
+{
+ write_matrix_market_file_impl(mat, file.c_str(), index_base);
+}
+
+
+} //namespace io
+} //namespace viennacl
+
+#endif
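 
For reference, a minimal round-trip sketch using the reader/writer overloads above (file names are placeholders):

    std::vector< std::map<unsigned int, double> > A;
    long status = viennacl::io::read_matrix_market_file(A, "matrix.mtx");  // index_base defaults to 1
    if (status == 0)
      std::cerr << "parse error" << std::endl;                             // the reader returns 0 on parse errors
    viennacl::io::write_matrix_market_file(A, "matrix_copy.mtx");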
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/bisect.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/bisect.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/bisect.hpp
new file mode 100644
index 0000000..64c12b0
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/bisect.hpp
@@ -0,0 +1,2645 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_BISECT_HPP_
+#define VIENNACL_LINALG_OPENCL_KERNELS_BISECT_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/opencl/kernels/bisect.hpp
+ @brief OpenCL kernels for the bisection algorithm for eigenvalues
+
+ Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+ the creation of derivative works is allowed by including the following statement:
+ "This software contains source code provided by NVIDIA Corporation."
+*/
+
+
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+#include "viennacl/linalg/opencl/common.hpp"
+
+// declaration, forward
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+ template <typename StringType>
+ void generate_bisect_kernel_config(StringType & source)
+ {
+ /* Global configuration parameter */
+ source.append(" #define VIENNACL_BISECT_MAX_THREADS_BLOCK 256\n");
+ source.append(" #define VIENNACL_BISECT_MAX_SMALL_MATRIX 256\n");
+ source.append(" #define VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX 256\n");
+ source.append(" #define VIENNACL_BISECT_MIN_ABS_INTERVAL 5.0e-37\n");
+
+ }
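+
+  // A minimal usage sketch (assumed call sequence; numeric_string is e.g. "float" or "double"):
+  //
+  //   std::string src;
+  //   generate_bisect_kernel_config(src);
+  //   generate_bisect_kernel_floorPow2(src, "float");
+  //   generate_bisect_kernel_ceilPow2(src, "float");
+  //   // ... append the remaining generate_* helpers, then compile 'src' as an OpenCL program.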
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Compute the next lower power of two of n
+ // n number for which the next lower power of two is sought
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template <typename StringType>
+ void generate_bisect_kernel_floorPow2(StringType & source, std::string const & numeric_string)
+ {
+ source.append(" \n");
+ source.append(" inline int \n");
+ source.append(" floorPow2(int n) \n");
+ source.append(" { \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint grp_id = get_group_id(0); \n");
+ source.append(" uint grp_nm = get_num_groups(0); \n");
+ source.append(" uint lcl_id = get_local_id(0); \n");
+ source.append(" uint lcl_sz = get_local_size(0); \n");
+
+
+ // early out if already power of two
+ source.append(" if (0 == (n & (n-1))) \n");
+ source.append(" { \n");
+ source.append(" return n; \n");
+ source.append(" } \n");
+
+ source.append(" int exp; \n");
+ source.append(" frexp(( "); source.append(numeric_string); source.append(" )n, &exp); \n");
+ source.append(" return (1 << (exp - 1)); \n");
+ source.append(" } \n");
+
+ }
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Compute the next higher power of two of n
+ // n number for which the next higher power of two is sought
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template <typename StringType>
+ void generate_bisect_kernel_ceilPow2(StringType & source, std::string const & numeric_string)
+ {
+ source.append(" \n");
+ source.append(" inline int \n");
+ source.append(" ceilPow2(int n) \n");
+ source.append(" { \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint grp_id = get_group_id(0); \n");
+ source.append(" uint grp_nm = get_num_groups(0); \n");
+ source.append(" uint lcl_id = get_local_id(0); \n");
+ source.append(" uint lcl_sz = get_local_size(0); \n");
+
+
+ // early out if already power of two
+ source.append(" if (0 == (n & (n-1))) \n");
+ source.append(" { \n");
+ source.append(" return n; \n");
+ source.append(" } \n");
+
+ source.append(" int exp; \n");
+ source.append(" frexp(( "); source.append(numeric_string); source.append(" )n, &exp); \n");
+ source.append(" return (1 << exp); \n");
+ source.append(" } \n");
+ }
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Compute midpoint of interval [\a left, \a right] avoiding overflow if possible
+ //
+ // left left / lower limit of interval
+ // right right / upper limit of interval
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template <typename StringType>
+ void generate_bisect_kernel_computeMidpoint(StringType & source, std::string const & numeric_string)
+ {
+ source.append(" \n");
+ source.append(" inline "); source.append(numeric_string); source.append(" \n");
+ source.append(" computeMidpoint(const "); source.append(numeric_string); source.append(" left,\n");
+ source.append(" const "); source.append(numeric_string); source.append(" right) \n");
+ source.append(" { \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint grp_id = get_group_id(0); \n");
+ source.append(" uint grp_nm = get_num_groups(0); \n");
+ source.append(" uint lcl_id = get_local_id(0); \n");
+ source.append(" uint lcl_sz = get_local_size(0); \n");
+ source.append(" "); source.append(numeric_string); source.append(" mid; \n");
+
+ source.append(" if (sign(left) == sign(right)) \n");
+ source.append(" { \n");
+ source.append(" mid = left + (right - left) * 0.5f; \n");
+ source.append(" } \n");
+ source.append(" else \n");
+ source.append(" { \n");
+ source.append(" mid = (left + right) * 0.5f; \n");
+ source.append(" } \n");
+
+ source.append(" return mid; \n");
+ source.append(" } \n");
+
+ }
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Check if interval converged and store appropriately
+ //
+ // addr address where to store the information of the interval
+ // s_left shared memory storage for left interval limits
+ // s_right shared memory storage for right interval limits
+ // s_left_count shared memory storage for number of eigenvalues less than left interval limits
+ // s_right_count shared memory storage for number of eigenvalues less than right interval limits
+ // left lower limit of interval
+ // right upper limit of interval
+ // left_count eigenvalues less than \a left
+ // right_count eigenvalues less than \a right
+ // precision desired precision for eigenvalues
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename StringType>
+ void generate_bisect_kernel_storeInterval(StringType & source, std::string const & numeric_string)
+ {
+ source.append(" \n");
+ source.append(" void \n");
+ source.append(" storeInterval(unsigned int addr, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * s_left, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * s_right, \n");
+ source.append(" __local unsigned int * s_left_count, \n");
+ source.append(" __local unsigned int * s_right_count, \n");
+ source.append(" "); source.append(numeric_string); source.append(" left, \n");
+ source.append(" "); source.append(numeric_string); source.append(" right, \n");
+ source.append(" unsigned int left_count, \n");
+ source.append(" unsigned int right_count, \n");
+ source.append(" "); source.append(numeric_string); source.append(" precision) \n");
+ source.append(" { \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint grp_id = get_group_id(0); \n");
+ source.append(" uint grp_nm = get_num_groups(0); \n");
+ source.append(" uint lcl_id = get_local_id(0); \n");
+ source.append(" uint lcl_sz = get_local_size(0); \n");
+
+ source.append(" s_left_count[addr] = left_count; \n");
+ source.append(" s_right_count[addr] = right_count; \n");
+
+ // check if interval converged
+ source.append(" "); source.append(numeric_string); source.append(" t0 = fabs(right - left); \n");
+ source.append(" "); source.append(numeric_string); source.append(" t1 = max(fabs(left), fabs(right)) * precision; \n");
+
+ source.append(" if (t0 <= max(( "); source.append(numeric_string); source.append(" )VIENNACL_BISECT_MIN_ABS_INTERVAL, t1)) \n");
+ source.append(" { \n");
+ // compute mid point
+ source.append(" "); source.append(numeric_string); source.append(" lambda = computeMidpoint(left, right); \n");
+
+ // mark as converged
+ source.append(" s_left[addr] = lambda; \n");
+ source.append(" s_right[addr] = lambda; \n");
+ source.append(" } \n");
+ source.append(" else \n");
+ source.append(" { \n");
+
+ // store current limits
+ source.append(" s_left[addr] = left; \n");
+ source.append(" s_right[addr] = right; \n");
+ source.append(" } \n");
+
+ source.append(" } \n");
+
+ }
+
+ template<typename StringType>
+ void generate_bisect_kernel_storeIntervalShort(StringType & source, std::string const & numeric_string)
+ {
+ source.append(" \n");
+ source.append(" void \n");
+ source.append(" storeIntervalShort(unsigned int addr, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * s_left, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" * s_right, \n");
+ source.append(" __local unsigned short * s_left_count, \n");
+ source.append(" __local unsigned short * s_right_count, \n");
+ source.append(" "); source.append(numeric_string); source.append(" left, \n");
+ source.append(" "); source.append(numeric_string); source.append(" right, \n");
+ source.append(" unsigned int left_count, \n");
+ source.append(" unsigned int right_count, \n");
+ source.append(" "); source.append(numeric_string); source.append(" precision) \n");
+ source.append(" { \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint grp_id = get_group_id(0); \n");
+ source.append(" uint grp_nm = get_num_groups(0); \n");
+ source.append(" uint lcl_id = get_local_id(0); \n");
+ source.append(" uint lcl_sz = get_local_size(0); \n");
+
+ source.append(" s_left_count[addr] = left_count; \n");
+ source.append(" s_right_count[addr] = right_count; \n");
+
+ // check if interval converged
+ source.append(" "); source.append(numeric_string); source.append(" t0 = fabs(right - left); \n");
+ source.append(" "); source.append(numeric_string); source.append(" t1 = max(fabs(left), fabs(right)) * precision; \n");
+
+ source.append(" if (t0 <= max(( "); source.append(numeric_string); source.append(" )VIENNACL_BISECT_MIN_ABS_INTERVAL, t1)) \n");
+ source.append(" { \n");
+ // compute mid point
+ source.append(" "); source.append(numeric_string); source.append(" lambda = computeMidpoint(left, right); \n");
+
+ // mark as converged
+ source.append(" s_left[addr] = lambda; \n");
+ source.append(" s_right[addr] = lambda; \n");
+ source.append(" } \n");
+ source.append(" else \n");
+ source.append(" { \n");
+
+ // store current limits
+ source.append(" s_left[addr] = left; \n");
+ source.append(" s_right[addr] = right; \n");
+ source.append(" } \n");
+
+ source.append(" } \n");
+
+
+ }
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Compute number of eigenvalues that are smaller than x given a symmetric,
+ // real, and tridiagonal matrix
+ //
+ // g_d diagonal elements stored in global memory
+ // g_s superdiagonal elements stored in global memory
+ // n size of matrix
+ // x value for which the number of eigenvalues that are smaller is sought
+ // tid thread identifier (e.g. threadIdx.x or gtid)
+ // num_intervals_active number of active intervals / threads that currently process an interval
+ // s_d scratch space to store diagonal entries of the tridiagonal matrix in shared memory
+ // s_s scratch space to store superdiagonal entries of the tridiagonal matrix in shared memory
+ // converged flag if the current thread is already converged (that is count does not have to be computed)
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template <typename StringType>
+ void generate_bisect_kernel_computeNumSmallerEigenvals(StringType & source, std::string const & numeric_string)
+ {
+ source.append(" \n");
+ source.append(" inline unsigned int \n");
+ source.append(" computeNumSmallerEigenvals(__global "); source.append(numeric_string); source.append(" *g_d, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *g_s, \n");
+ source.append(" const unsigned int n, \n");
+ source.append(" const "); source.append(numeric_string); source.append(" x, \n");
+ source.append(" const unsigned int tid, \n");
+ source.append(" const unsigned int num_intervals_active, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" *s_d, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" *s_s, \n");
+ source.append(" unsigned int converged \n");
+ source.append(" ) \n");
+ source.append(" { \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint grp_id = get_group_id(0); \n");
+ source.append(" uint grp_nm = get_num_groups(0); \n");
+ source.append(" uint lcl_id = get_local_id(0); \n");
+ source.append(" uint lcl_sz = get_local_size(0); \n");
+
+
+ source.append(" "); source.append(numeric_string); source.append(" delta = 1.0f; \n");
+ source.append(" unsigned int count = 0; \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+
+ // read data into shared memory
+ source.append(" if (lcl_id < n) \n");
+ source.append(" { \n");
+ source.append(" s_d[lcl_id] = *(g_d + lcl_id); \n");
+ source.append(" s_s[lcl_id] = *(g_s + lcl_id - 1); \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+
+ // perform loop only for active threads
+ source.append(" if ((tid < num_intervals_active) && (0 == converged)) \n");
+ source.append(" { \n");
+
+ // perform (optimized) Gaussian elimination to determine the number
+ // of eigenvalues that are smaller than x
+ source.append(" for (unsigned int k = 0; k < n; ++k) \n");
+ source.append(" { \n");
+ source.append(" delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta; \n");
+ source.append(" count += (delta < 0) ? 1 : 0; \n");
+ source.append(" } \n");
+
+ source.append(" } \n"); // end if thread currently processing an interval
+
+ source.append(" return count; \n");
+ source.append(" } \n");
+
+ }
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Compute number of eigenvalues that are smaller than x given a symmetric,
+ // real, and tridiagonal matrix
+ //
+ // g_d diagonal elements stored in global memory
+ // g_s superdiagonal elements stored in global memory
+ // n size of matrix
+ // x value for which the number of eigenvalues that are smaller is sought
+ // tid thread identifier (e.g. threadIdx.x or gtid)
+ // num_intervals_active number of active intervals / threads that currently process an interval
+ // s_d scratch space to store diagonal entries of the tridiagonal matrix in shared memory
+ // s_s scratch space to store superdiagonal entries of the tridiagonal matrix in shared memory
+ // converged flag if the current thread is already converged (that is count does not have to be computed)
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template <typename StringType>
+ void generate_bisect_kernel_computeNumSmallerEigenvalsLarge(StringType & source, std::string const & numeric_string)
+ {
+ source.append(" \n");
+ source.append(" inline unsigned int \n");
+ source.append(" computeNumSmallerEigenvalsLarge(__global "); source.append(numeric_string); source.append(" *g_d, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *g_s, \n");
+ source.append(" const unsigned int n, \n");
+ source.append(" const "); source.append(numeric_string); source.append(" x, \n");
+ source.append(" const unsigned int tid, \n");
+ source.append(" const unsigned int num_intervals_active, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" *s_d, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" *s_s, \n");
+ source.append(" unsigned int converged \n");
+ source.append(" ) \n");
+ source.append(" { \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint grp_id = get_group_id(0); \n");
+ source.append(" uint grp_nm = get_num_groups(0); \n");
+ source.append(" uint lcl_id = get_local_id(0); \n");
+ source.append(" uint lcl_sz = get_local_size(0); \n");
+
+ source.append(" "); source.append(numeric_string); source.append(" delta = 1.0f; \n");
+ source.append(" unsigned int count = 0; \n");
+
+ source.append(" unsigned int rem = n; \n");
+
+ // do until whole diagonal and superdiagonal has been loaded and processed
+ source.append(" for (unsigned int i = 0; i < n; i += lcl_sz) \n");
+ source.append(" { \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+
+ // read new chunk of data into shared memory
+ source.append(" if ((i + lcl_id) < n) \n");
+ source.append(" { \n");
+
+ source.append(" s_d[lcl_id] = *(g_d + i + lcl_id); \n");
+ source.append(" s_s[lcl_id] = *(g_s + i + lcl_id - 1); \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+
+
+ source.append(" if (tid < num_intervals_active) \n");
+ source.append(" { \n");
+
+ // perform (optimized) Gaussian elimination to determine the number
+ // of eigenvalues that are smaller than x
+ source.append(" for (unsigned int k = 0; k < min(rem,lcl_sz); ++k) \n");
+ source.append(" { \n");
+ source.append(" delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta; \n");
+ // delta = (abs( delta) < (1.0e-10)) ? -(1.0e-10) : delta;
+ source.append(" count += (delta < 0) ? 1 : 0; \n");
+ source.append(" } \n");
+
+ source.append(" } \n"); // end if thread currently processing an interval
+
+ source.append(" rem -= lcl_sz; \n");
+ source.append(" } \n");
+
+ source.append(" return count; \n");
+ source.append(" } \n");
+
+
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Store all non-empty intervals resulting from the subdivision of the interval
+ // currently processed by the thread
+ //
+ // addr base address for storing intervals
+ // num_threads_active number of threads / intervals in current sweep
+ // s_left shared memory storage for left interval limits
+ // s_right shared memory storage for right interval limits
+ // s_left_count shared memory storage for number of eigenvalues less than left interval limits
+ // s_right_count shared memory storage for number of eigenvalues less than right interval limits
+ // left lower limit of interval
+ // mid midpoint of interval
+ // right upper limit of interval
+ // left_count eigenvalues less than \a left
+ // mid_count eigenvalues less than \a mid
+ // right_count eigenvalues less than \a right
+ // precision desired precision for eigenvalues
+ // compact_second_chunk shared mem flag if second chunk is used and therefore requires compaction
+ // s_compaction_list_exc helper array for stream compaction, s_compaction_list_exc[tid] = 1 when the thread generated two child intervals
+ // is_active_second marks whether the thread has a second non-empty child interval
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename StringType>
+ void generate_bisect_kernel_storeNonEmptyIntervals(StringType & source, std::string const & numeric_string)
+ {
+ source.append(" \n");
+ source.append(" void \n");
+ source.append(" storeNonEmptyIntervals(unsigned int addr, \n");
+ source.append(" const unsigned int num_threads_active, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" *s_left, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n");
+ source.append(" __local unsigned int *s_left_count, \n");
+ source.append(" __local unsigned int *s_right_count, \n");
+ source.append(" "); source.append(numeric_string); source.append(" left, \n ");
+ source.append(" "); source.append(numeric_string); source.append(" mid, \n");
+ source.append(" "); source.append(numeric_string); source.append(" right,\n");
+ source.append(" const unsigned int left_count, \n");
+ source.append(" const unsigned int mid_count, \n");
+ source.append(" const unsigned int right_count, \n");
+ source.append(" "); source.append(numeric_string); source.append(" precision, \n");
+ source.append(" __local unsigned int *compact_second_chunk, \n");
+ source.append(" __local unsigned int *s_compaction_list_exc, \n");
+ source.append(" unsigned int *is_active_second) \n");
+ source.append(" { \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint grp_id = get_group_id(0); \n");
+ source.append(" uint grp_nm = get_num_groups(0); \n");
+ source.append(" uint lcl_id = get_local_id(0); \n");
+ source.append(" uint lcl_sz = get_local_size(0); \n");
+
+ // check if both child intervals are valid
+ source.append(" \n");
+ source.append(" if ((left_count != mid_count) && (mid_count != right_count)) \n");
+ source.append(" { \n");
+
+ // store the left interval
+ source.append(" storeInterval(addr, s_left, s_right, s_left_count, s_right_count, \n");
+ source.append(" left, mid, left_count, mid_count, precision); \n");
+
+ // mark that a second interval has been generated, only stored after
+ // stream compaction of second chunk
+ source.append(" *is_active_second = 1; \n");
+ source.append(" s_compaction_list_exc[lcl_id] = 1; \n");
+ source.append(" *compact_second_chunk = 1; \n");
+ source.append(" } \n");
+ source.append(" else \n");
+ source.append(" { \n");
+
+ // only one non-empty child interval
+
+ // mark that no second child
+ source.append(" *is_active_second = 0; \n");
+ source.append(" s_compaction_list_exc[lcl_id] = 0; \n");
+
+ // store the one valid child interval
+ source.append(" if (left_count != mid_count) \n");
+ source.append(" { \n");
+ source.append(" storeInterval(addr, s_left, s_right, s_left_count, s_right_count, \n");
+ source.append(" left, mid, left_count, mid_count, precision); \n");
+ source.append(" } \n");
+ source.append(" else \n");
+ source.append(" { \n");
+ source.append(" storeInterval(addr, s_left, s_right, s_left_count, s_right_count, \n");
+ source.append(" mid, right, mid_count, right_count, precision); \n");
+ source.append(" } \n");
+
+ source.append(" } \n");
+ source.append(" } \n");
+
+ }
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ //! Store all non-empty intervals resulting from the subdivision of the interval
+ //! currently processed by the thread
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template <typename StringType>
+ void generate_bisect_kernel_storeNonEmptyIntervalsLarge(StringType & source, std::string const & numeric_string)
+ {
+ source.append(" \n");
+ source.append(" void \n");
+ source.append(" storeNonEmptyIntervalsLarge(unsigned int addr, \n");
+ source.append(" const unsigned int num_threads_active, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" *s_left, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n");
+ source.append(" __local unsigned short *s_left_count, \n");
+ source.append(" __local unsigned short *s_right_count, \n");
+ source.append(" "); source.append(numeric_string); source.append(" left, \n ");
+ source.append(" "); source.append(numeric_string); source.append(" mid, \n");
+ source.append(" "); source.append(numeric_string); source.append(" right,\n");
+ source.append(" const unsigned int left_count, \n");
+ source.append(" const unsigned int mid_count, \n");
+ source.append(" const unsigned int right_count, \n");
+ source.append(" "); source.append(numeric_string); source.append(" epsilon, \n");
+ source.append(" __local unsigned int *compact_second_chunk, \n");
+ source.append(" __local unsigned short *s_compaction_list, \n");
+ source.append(" unsigned int *is_active_second) \n");
+ source.append(" { \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint grp_id = get_group_id(0); \n");
+ source.append(" uint grp_nm = get_num_groups(0); \n");
+ source.append(" uint lcl_id = get_local_id(0); \n");
+ source.append(" uint lcl_sz = get_local_size(0); \n");
+
+ // check if both child intervals are valid
+ source.append(" if ((left_count != mid_count) && (mid_count != right_count)) \n");
+ source.append(" { \n");
+
+ source.append(" storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count, \n");
+ source.append(" left, mid, left_count, mid_count, epsilon); \n");
+
+ source.append(" *is_active_second = 1; \n");
+ source.append(" s_compaction_list[lcl_id] = 1; \n");
+ source.append(" *compact_second_chunk = 1; \n");
+ source.append(" } \n");
+ source.append(" else \n");
+ source.append(" { \n");
+
+ // only one non-empty child interval
+
+ // mark that no second child
+ source.append(" *is_active_second = 0; \n");
+ source.append(" s_compaction_list[lcl_id] = 0; \n");
+
+ // store the one valid child interval
+ source.append(" if (left_count != mid_count) \n");
+ source.append(" { \n");
+ source.append(" storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count, \n");
+ source.append(" left, mid, left_count, mid_count, epsilon); \n");
+ source.append(" } \n");
+ source.append(" else \n");
+ source.append(" { \n");
+ source.append(" storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count, \n");
+ source.append(" mid, right, mid_count, right_count, epsilon); \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Create indices for compaction, that is, process \a s_compaction_list_exc,
+ // which is 1 for intervals that generated a second child and 0 otherwise,
+ // and compute for each of the non-zero elements the index at which the new
+ // interval is stored in a compact representation of all generated second children
+ //
+ // s_compaction_list_exc list containing the flags which threads generated two children
+ // num_threads_compaction number of threads to employ for compaction
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template<typename StringType>
+ void generate_bisect_kernel_createIndicesCompaction(StringType & source)
+ {
+ source.append(" \n");
+ source.append(" void \n");
+ source.append(" createIndicesCompaction(__local unsigned int *s_compaction_list_exc, \n");
+ source.append(" unsigned int num_threads_compaction) \n");
+ source.append(" { \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint grp_id = get_group_id(0); \n");
+ source.append(" uint grp_nm = get_num_groups(0); \n");
+ source.append(" uint lcl_id = get_local_id(0); \n");
+ source.append(" uint lcl_sz = get_local_size(0); \n");
+
+
+ source.append(" unsigned int offset = 1; \n");
+ source.append(" const unsigned int tid = lcl_id; \n");
+ // if(tid == 0)
+ // printf("num_threads_compaction = %u\n", num_threads_compaction);
+
+ // higher levels of scan tree
+ source.append(" for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) \n");
+ source.append(" { \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+
+ source.append(" if (tid < d) \n");
+ source.append(" { \n");
+
+ source.append(" unsigned int ai = offset*(2*tid+1)-1; \n");
+ source.append(" unsigned int bi = offset*(2*tid+2)-1; \n");
+ source.append(" \n");
+ source.append(" s_compaction_list_exc[bi] = s_compaction_list_exc[bi] \n");
+ source.append(" + s_compaction_list_exc[ai]; \n");
+ source.append(" } \n");
+
+ source.append(" offset <<= 1; \n");
+ source.append(" } \n");
+
+ // traverse down tree: first down to level 2 across
+ source.append(" for (int d = 2; d < num_threads_compaction; d <<= 1) \n");
+ source.append(" { \n");
+
+ source.append(" offset >>= 1; \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+
+ source.append(" if (tid < (d-1)) \n");
+ source.append(" { \n");
+
+ source.append(" unsigned int ai = offset*(tid+1) - 1; \n");
+ source.append(" unsigned int bi = ai + (offset >> 1); \n");
+
+ source.append(" s_compaction_list_exc[bi] = s_compaction_list_exc[bi] \n");
+ source.append(" + s_compaction_list_exc[ai]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+
+ source.append(" } \n");
+ }
+
+
+ template<typename StringType>
+ void generate_bisect_kernel_createIndicesCompactionShort(StringType & source)
+ {
+ source.append(" \n");
+ source.append(" void \n");
+ source.append(" createIndicesCompactionShort(__local unsigned short *s_compaction_list_exc, \n");
+ source.append(" unsigned int num_threads_compaction) \n");
+ source.append(" { \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint grp_id = get_group_id(0); \n");
+ source.append(" uint grp_nm = get_num_groups(0); \n");
+ source.append(" uint lcl_id = get_local_id(0); \n");
+ source.append(" uint lcl_sz = get_local_size(0); \n");
+
+
+ source.append(" unsigned int offset = 1; \n");
+ source.append(" const unsigned int tid = lcl_id; \n");
+
+ // higher levels of scan tree
+ source.append(" for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) \n");
+ source.append(" { \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+
+ source.append(" if (tid < d) \n");
+ source.append(" { \n");
+
+ source.append(" unsigned int ai = offset*(2*tid+1)-1; \n");
+ source.append(" unsigned int bi = offset*(2*tid+2)-1; \n");
+ source.append(" \n");
+ source.append(" s_compaction_list_exc[bi] = s_compaction_list_exc[bi] \n");
+ source.append(" + s_compaction_list_exc[ai]; \n");
+ source.append(" } \n");
+
+ source.append(" offset <<= 1; \n");
+ source.append(" } \n");
+
+ // traverse down tree: first down to level 2 across
+ source.append(" for (int d = 2; d < num_threads_compaction; d <<= 1) \n");
+ source.append(" { \n");
+
+ source.append(" offset >>= 1; \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+
+ source.append(" if (tid < (d-1)) \n");
+ source.append(" { \n");
+
+ source.append(" unsigned int ai = offset*(tid+1) - 1; \n");
+ source.append(" unsigned int bi = ai + (offset >> 1); \n");
+
+ source.append(" s_compaction_list_exc[bi] = s_compaction_list_exc[bi] \n");
+ source.append(" + s_compaction_list_exc[ai]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+
+ source.append(" } \n");
+ }
+
+ ///////////////////////////////////////////////////////////////////////////////
+ // Perform stream compaction for second child intervals
+ //
+ // s_left shared memory storage for left interval limits
+ // s_right shared memory storage for right interval limits
+ // s_left_count shared memory storage for number of eigenvalues less than left interval limits
+ // s_right_count shared memory storage for number of eigenvalues less than right interval limits
+ // mid midpoint of current interval (left of new interval)
+ // right upper limit of interval
+ // mid_count eigenvalues less than \a mid
+ // s_compaction_list list containing the indices where the data has to be stored
+ // num_threads_active number of active threads / intervals
+      //  is_active_interval     flag whether the thread has a second non-empty child interval
+ ///////////////////////////////////////////////////////////////////////////////
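Together with those indices, the compaction itself reduces to copying every surviving second-child interval to slot num_threads_active + index, so the second children end up contiguously behind the first chunk. A sequential sketch of that placement (array and variable names are illustrative, not part of the kernel):

    #include <vector>

    // Sequential sketch of the compaction step performed by compactIntervals():
    // second-child intervals are appended contiguously after the first
    // num_threads_active entries of the interval arrays.
    void compact_second_children(std::vector<float>              & s_left,
                                 std::vector<float>              & s_right,
                                 std::vector<unsigned int>       & s_left_count,
                                 std::vector<unsigned int>       & s_right_count,
                                 std::vector<unsigned int> const & flags,       // 1 if a second child exists
                                 std::vector<unsigned int> const & idx,         // exclusive prefix sum of flags
                                 std::vector<float>        const & mid,         // per-thread midpoints
                                 std::vector<float>        const & right,
                                 std::vector<unsigned int> const & mid_count,
                                 std::vector<unsigned int> const & right_count,
                                 unsigned int num_threads_active)
    {
      for (unsigned int tid = 0; tid < num_threads_active; ++tid)
        if (flags[tid])
        {
          unsigned int addr_w   = num_threads_active + idx[tid];
          s_left[addr_w]        = mid[tid];
          s_right[addr_w]       = right[tid];
          s_left_count[addr_w]  = mid_count[tid];
          s_right_count[addr_w] = right_count[tid];
        }
    }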
+
+
+ template<typename StringType>
+ void generate_bisect_kernel_compactIntervals(StringType & source, std::string const & numeric_string)
+ {
+ source.append(" \n");
+ source.append(" void \n");
+ source.append(" compactIntervals(__local "); source.append(numeric_string); source.append(" *s_left, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n");
+ source.append(" __local unsigned int *s_left_count, \n");
+ source.append(" __local unsigned int *s_right_count, \n");
+ source.append(" "); source.append(numeric_string); source.append(" mid, \n");
+ source.append(" "); source.append(numeric_string); source.append(" right, \n");
+ source.append(" unsigned int mid_count, unsigned int right_count, \n");
+ source.append(" __local unsigned int *s_compaction_list, \n");
+ source.append(" unsigned int num_threads_active, \n");
+ source.append(" unsigned int is_active_second) \n");
+ source.append(" { \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint grp_id = get_group_id(0); \n");
+ source.append(" uint grp_nm = get_num_groups(0); \n");
+ source.append(" uint lcl_id = get_local_id(0); \n");
+ source.append(" uint lcl_sz = get_local_size(0); \n");
+
+ source.append(" const unsigned int tid = lcl_id; \n");
+
+ // perform compaction / copy data for all threads where the second
+ // child is not dead
+ source.append(" if ((tid < num_threads_active) && (1 == is_active_second)) \n");
+ source.append(" { \n");
+ source.append(" unsigned int addr_w = num_threads_active + s_compaction_list[tid]; \n");
+ source.append(" s_left[addr_w] = mid; \n");
+ source.append(" s_right[addr_w] = right; \n");
+ source.append(" s_left_count[addr_w] = mid_count; \n");
+ source.append(" s_right_count[addr_w] = right_count; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ }
+
+
+
+
+ template<typename StringType>
+ void generate_bisect_kernel_compactIntervalsShort(StringType & source, std::string const & numeric_string)
+ {
+ source.append(" \n");
+ source.append(" void \n");
+ source.append(" compactIntervalsShort(__local "); source.append(numeric_string); source.append(" *s_left, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n");
+ source.append(" __local unsigned short *s_left_count, \n");
+ source.append(" __local unsigned short *s_right_count, \n");
+ source.append(" "); source.append(numeric_string); source.append(" mid, \n");
+ source.append(" "); source.append(numeric_string); source.append(" right, \n");
+ source.append(" unsigned int mid_count, unsigned int right_count, \n");
+ source.append(" __local unsigned short *s_compaction_list, \n");
+ source.append(" unsigned int num_threads_active, \n");
+ source.append(" unsigned int is_active_second) \n");
+ source.append(" { \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint grp_id = get_group_id(0); \n");
+ source.append(" uint grp_nm = get_num_groups(0); \n");
+ source.append(" uint lcl_id = get_local_id(0); \n");
+ source.append(" uint lcl_sz = get_local_size(0); \n");
+
+ source.append(" const unsigned int tid = lcl_id; \n");
+
+ // perform compaction / copy data for all threads where the second
+ // child is not dead
+ source.append(" if ((tid < num_threads_active) && (1 == is_active_second)) \n");
+ source.append(" { \n");
+ source.append(" unsigned int addr_w = num_threads_active + s_compaction_list[tid]; \n");
+ source.append(" s_left[addr_w] = mid; \n");
+ source.append(" s_right[addr_w] = right; \n");
+ source.append(" s_left_count[addr_w] = mid_count; \n");
+ source.append(" s_right_count[addr_w] = right_count; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ }
+
+
+
+ template<typename StringType>
+ void generate_bisect_kernel_storeIntervalConverged(StringType & source, std::string const & numeric_string)
+ {
+ source.append(" \n");
+ source.append(" void \n");
+ source.append(" storeIntervalConverged( __local "); source.append(numeric_string); source.append(" *s_left, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n");
+ source.append(" __local unsigned int *s_left_count, \n");
+ source.append(" __local unsigned int *s_right_count, \n");
+ source.append(" "); source.append(numeric_string); source.append(" *left, \n");
+ source.append(" "); source.append(numeric_string); source.append(" *mid, \n");
+ source.append(" "); source.append(numeric_string); source.append(" *right, \n");
+ source.append(" unsigned int *left_count, \n");
+ source.append(" unsigned int *mid_count, \n");
+ source.append(" unsigned int *right_count, \n");
+ source.append(" __local unsigned int *s_compaction_list_exc, \n");
+ source.append(" __local unsigned int *compact_second_chunk, \n");
+ source.append(" const unsigned int num_threads_active, \n");
+ source.append(" unsigned int *is_active_second) \n");
+ source.append(" { \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint grp_id = get_group_id(0); \n");
+ source.append(" uint grp_nm = get_num_groups(0); \n");
+ source.append(" uint lcl_id = get_local_id(0); \n");
+ source.append(" uint lcl_sz = get_local_size(0); \n");
+
+ source.append(" const unsigned int tid = lcl_id; \n");
+ source.append(" const unsigned int multiplicity = *right_count - *left_count; \n");
+ // check multiplicity of eigenvalue
+ source.append(" if (1 == multiplicity) \n");
+ source.append(" { \n");
+
+ // just re-store intervals, simple eigenvalue
+ source.append(" s_left[tid] = *left; \n");
+ source.append(" s_right[tid] = *right; \n");
+ source.append(" s_left_count[tid] = *left_count; \n");
+ source.append(" s_right_count[tid] = *right_count; \n");
+ source.append(" \n");
+
+ // mark that no second child / clear
+ source.append(" *is_active_second = 0; \n");
+ source.append(" s_compaction_list_exc[tid] = 0; \n");
+ source.append(" } \n");
+ source.append(" else \n");
+ source.append(" { \n");
+
+ // number of eigenvalues after the split less than mid
+ source.append(" *mid_count = *left_count + (multiplicity >> 1); \n");
+
+ // store left interval
+ source.append(" s_left[tid] = *left; \n");
+ source.append(" s_right[tid] = *right; \n");
+ source.append(" s_left_count[tid] = *left_count; \n");
+ source.append(" s_right_count[tid] = *mid_count; \n");
+ source.append(" *mid = *left; \n");
+
+ // mark that second child interval exists
+ source.append(" *is_active_second = 1; \n");
+ source.append(" s_compaction_list_exc[tid] = 1; \n");
+ source.append(" *compact_second_chunk = 1; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ }
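The multiplicity handling above amounts to the following: a converged point (left == right) that still encloses m > 1 eigenvalues is stored twice, with the eigenvalue count range split at left_count + m/2, so that repeated halving eventually produces one output entry per eigenvalue of the cluster. A small sketch of the split, with illustrative names and assuming the duplicate is placed by the subsequent compaction step:

    // Split of a converged point that encloses several eigenvalues: the count
    // range [left_count, right_count) is halved and the point is duplicated.
    struct Interval
    {
      float        left, right;            // here left == right (converged point)
      unsigned int left_count, right_count;
    };

    inline void split_converged(Interval const & in, Interval & first, Interval & second)
    {
      unsigned int multiplicity = in.right_count - in.left_count;
      unsigned int mid_count    = in.left_count + (multiplicity >> 1);
      first  = { in.left, in.right, in.left_count, mid_count      };  // stored in place
      second = { in.left, in.right, mid_count,     in.right_count };  // stored via compaction
    }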
+
+
+
+
+
+ template<typename StringType>
+ void generate_bisect_kernel_storeIntervalConvergedShort(StringType & source, std::string const & numeric_string)
+ {
+ source.append(" \n");
+ source.append(" void \n");
+ source.append(" storeIntervalConvergedShort(__local "); source.append(numeric_string); source.append(" *s_left, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n");
+ source.append(" __local unsigned short *s_left_count, \n");
+ source.append(" __local unsigned short *s_right_count, \n");
+ source.append(" "); source.append(numeric_string); source.append(" *left, \n");
+ source.append(" "); source.append(numeric_string); source.append(" *mid, \n");
+ source.append(" "); source.append(numeric_string); source.append(" *right, \n");
+ source.append(" unsigned int *left_count, \n");
+ source.append(" unsigned int *mid_count, \n");
+ source.append(" unsigned int *right_count, \n");
+ source.append(" __local unsigned short *s_compaction_list_exc, \n");
+ source.append(" __local unsigned int *compact_second_chunk, \n");
+ source.append(" const unsigned int num_threads_active, \n");
+ source.append(" unsigned int *is_active_second) \n");
+ source.append(" { \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint grp_id = get_group_id(0); \n");
+ source.append(" uint grp_nm = get_num_groups(0); \n");
+ source.append(" uint lcl_id = get_local_id(0); \n");
+ source.append(" uint lcl_sz = get_local_size(0); \n");
+
+ source.append(" const unsigned int tid = lcl_id; \n");
+ source.append(" const unsigned int multiplicity = *right_count - *left_count; \n");
+ // check multiplicity of eigenvalue
+ source.append(" if (1 == multiplicity) \n");
+ source.append(" { \n");
+
+ // just re-store intervals, simple eigenvalue
+ source.append(" s_left[tid] = *left; \n");
+ source.append(" s_right[tid] = *right; \n");
+ source.append(" s_left_count[tid] = *left_count; \n");
+ source.append(" s_right_count[tid] = *right_count; \n");
+ source.append(" \n");
+
+ // mark that no second child / clear
+ source.append(" *is_active_second = 0; \n");
+ source.append(" s_compaction_list_exc[tid] = 0; \n");
+ source.append(" } \n");
+ source.append(" else \n");
+ source.append(" { \n");
+
+ // number of eigenvalues after the split less than mid
+ source.append(" *mid_count = *left_count + (multiplicity >> 1); \n");
+
+ // store left interval
+ source.append(" s_left[tid] = *left; \n");
+ source.append(" s_right[tid] = *right; \n");
+ source.append(" s_left_count[tid] = *left_count; \n");
+ source.append(" s_right_count[tid] = *mid_count; \n");
+ source.append(" *mid = *left; \n");
+
+ // mark that second child interval exists
+ source.append(" *is_active_second = 1; \n");
+ source.append(" s_compaction_list_exc[tid] = 1; \n");
+ source.append(" *compact_second_chunk = 1; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ }
+
+ ///////////////////////////////////////////////////////////////////////////////
+ // Subdivide interval if active and not already converged
+ //
+ // tid id of thread
+ // s_left shared memory storage for left interval limits
+ // s_right shared memory storage for right interval limits
+ // s_left_count shared memory storage for number of eigenvalues less than left interval limits
+ // s_right_count shared memory storage for number of eigenvalues less than right interval limits
+ // num_threads_active number of active threads in warp
+ // left lower limit of interval
+ // right upper limit of interval
+ // left_count eigenvalues less than \a left
+ // right_count eigenvalues less than \a right
+ // all_threads_converged shared memory flag if all threads are converged
+ ///////////////////////////////////////////////////////////////////////////////
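In sequential form, the per-thread decision above reduces to: load the interval, and if its endpoints still differ, bisect it at the midpoint and clear the shared convergence flag; a point interval that still encloses more than one eigenvalue also keeps the flag cleared so the cluster is split in storeIntervalConverged(). A minimal sketch (the naive midpoint here merely stands in for computeMidpoint(), which may additionally guard against rounding):

    // Sequential sketch of the subdivision test for a single interval.
    // Returns true if the interval still needs work.
    inline bool subdivide_if_active(float left, float right,
                                    unsigned int left_count, unsigned int right_count,
                                    float & mid)
    {
      if (left != right)
      {
        mid = 0.5f * (left + right);   // naive midpoint, stands in for computeMidpoint()
        return true;                   // interval still has nonzero width
      }
      // converged point, but a cluster of eigenvalues: still needs splitting
      return (right_count - left_count) > 1;
    }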
+
+
+ template<typename StringType>
+ void generate_bisect_kernel_subdivideActiveInterval(StringType & source, std::string const & numeric_string)
+ {
+ source.append(" \n");
+ source.append(" void \n");
+ source.append(" subdivideActiveIntervalMulti(const unsigned int tid, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" *s_left, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n");
+ source.append(" __local unsigned int *s_left_count, \n");
+ source.append(" __local unsigned int *s_right_count, \n");
+ source.append(" const unsigned int num_threads_active, \n");
+ source.append(" "); source.append(numeric_string); source.append(" *left, \n");
+ source.append(" "); source.append(numeric_string); source.append(" *right, \n");
+ source.append(" unsigned int *left_count, unsigned int *right_count, \n");
+ source.append(" "); source.append(numeric_string); source.append(" *mid, \n");
+ source.append(" __local unsigned int *all_threads_converged) \n");
+ source.append(" { \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint grp_id = get_group_id(0); \n");
+ source.append(" uint grp_nm = get_num_groups(0); \n");
+ source.append(" uint lcl_id = get_local_id(0); \n");
+ source.append(" uint lcl_sz = get_local_size(0); \n");
+
+ // for all active threads
+ source.append(" if (tid < num_threads_active) \n");
+ source.append(" { \n");
+
+ source.append(" *left = s_left[tid]; \n");
+ source.append(" *right = s_right[tid]; \n");
+ source.append(" *left_count = s_left_count[tid]; \n");
+ source.append(" *right_count = s_right_count[tid]; \n");
+
+ // check if thread already converged
+ source.append(" if (*left != *right) \n");
+ source.append(" { \n");
+
+ source.append(" *mid = computeMidpoint(*left, *right); \n");
+ source.append(" *all_threads_converged = 0; \n");
+ source.append(" } \n");
+ source.append(" else if ((*right_count - *left_count) > 1) \n");
+ source.append(" { \n");
+ // mark as not converged if multiple eigenvalues enclosed
+ // duplicate interval in storeIntervalsConverged()
+ source.append(" *all_threads_converged = 0; \n");
+ source.append(" } \n");
+
+ source.append(" } \n");
+ // end for all active threads
+ source.append(" } \n");
+ }
+
+
+ template<typename StringType>
+ void generate_bisect_kernel_subdivideActiveIntervalShort(StringType & source, std::string const & numeric_string)
+ {
+ source.append(" \n");
+ source.append(" void \n");
+ source.append(" subdivideActiveIntervalShort(const unsigned int tid, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" *s_left, \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n");
+ source.append(" __local unsigned short *s_left_count, \n");
+ source.append(" __local unsigned short *s_right_count, \n");
+ source.append(" const unsigned int num_threads_active, \n");
+ source.append(" "); source.append(numeric_string); source.append(" *left, \n");
+ source.append(" "); source.append(numeric_string); source.append(" *right, \n");
+ source.append(" unsigned int *left_count, unsigned int *right_count, \n");
+ source.append(" "); source.append(numeric_string); source.append(" *mid, \n");
+ source.append(" __local unsigned int *all_threads_converged) \n");
+ source.append(" { \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint grp_id = get_group_id(0); \n");
+ source.append(" uint grp_nm = get_num_groups(0); \n");
+ source.append(" uint lcl_id = get_local_id(0); \n");
+ source.append(" uint lcl_sz = get_local_size(0); \n");
+
+ // for all active threads
+ source.append(" if (tid < num_threads_active) \n");
+ source.append(" { \n");
+
+ source.append(" *left = s_left[tid]; \n");
+ source.append(" *right = s_right[tid]; \n");
+ source.append(" *left_count = s_left_count[tid]; \n");
+ source.append(" *right_count = s_right_count[tid]; \n");
+
+ // check if thread already converged
+ source.append(" if (*left != *right) \n");
+ source.append(" { \n");
+
+ source.append(" *mid = computeMidpoint(*left, *right); \n");
+ source.append(" *all_threads_converged = 0; \n");
+ source.append(" } \n");
+
+ source.append(" } \n");
+ // end for all active threads
+ source.append(" } \n");
+ }
+
+ // end of utilities
+ // start of kernels
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Bisection to find eigenvalues of a real, symmetric, and tridiagonal matrix
+ //
+ // g_d diagonal elements in global memory
+      //  g_s  superdiagonal elements in global memory (stored so that the element *(g_s - 1) can be accessed and equals 0)
+ // n size of matrix
+ // lg lower bound of input interval (e.g. Gerschgorin interval)
+ // ug upper bound of input interval (e.g. Gerschgorin interval)
+ // lg_eig_count number of eigenvalues that are smaller than \a lg
+      //  ug_eig_count  number of eigenvalues that are smaller than \a ug
+ // epsilon desired accuracy of eigenvalues to compute
+ ////////////////////////////////////////////////////////////////////////////////
+ ///
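The central primitive used by this kernel, computeNumSmallerEigenvals(), counts how many eigenvalues of the symmetric tridiagonal matrix lie below a given shift. A sequential reference of the standard Sturm-sequence count via the LDL^T recurrence is sketched below (double precision for clarity; the device routine evaluates this kind of recurrence with all work-items cooperating on the matrix data):

    #include <limits>
    #include <vector>

    // Number of eigenvalues of the symmetric tridiagonal matrix (diagonal d,
    // superdiagonal s, where s[i] couples rows i and i+1) that are smaller
    // than the shift x.  Classic Sturm-sequence count: the number of negative
    // pivots of the LDL^T factorization of T - x*I.
    inline unsigned int num_eigenvalues_smaller_than(std::vector<double> const & d,
                                                     std::vector<double> const & s,
                                                     double x)
    {
      unsigned int count = 0;
      double delta = 1.0;
      for (std::size_t i = 0; i < d.size(); ++i)
      {
        double off = (i == 0) ? 0.0 : s[i - 1];
        delta = (d[i] - x) - off * off / delta;
        if (delta < 0.0)
          ++count;
        else if (delta == 0.0)
          delta = std::numeric_limits<double>::min();  // avoid division by zero
      }
      return count;
    }

Bisection then repeatedly halves an interval and uses this count to decide which half still contains the eigenvalues of interest.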
+ template <typename StringType>
+ void generate_bisect_kernel_bisectKernel(StringType & source, std::string const & numeric_string)
+ {
+ source.append(" __kernel \n");
+ source.append(" void \n");
+ source.append(" bisectKernelSmall(__global "); source.append(numeric_string); source.append(" *g_d, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *g_s, \n");
+ source.append(" const unsigned int n, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *g_left, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *g_right, \n");
+ source.append(" __global unsigned int *g_left_count, __global unsigned int *g_right_count, \n");
+ source.append(" const "); source.append(numeric_string); source.append(" lg, \n");
+ source.append(" const "); source.append(numeric_string); source.append(" ug, \n");
+ source.append(" const unsigned int lg_eig_count, const unsigned int ug_eig_count, \n");
+ source.append(" "); source.append(numeric_string); source.append(" epsilon \n");
+ source.append(" ) \n");
+ source.append(" { \n");
+ source.append(" g_s = g_s + 1; \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint grp_id = get_group_id(0); \n");
+ source.append(" uint grp_nm = get_num_groups(0); \n");
+ source.append(" uint lcl_id = get_local_id(0); \n");
+ source.append(" uint lcl_sz = get_local_size(0); \n");
+
+ // intervals (store left and right because the subdivision tree is in general
+        // not dense)
+ source.append(" __local "); source.append(numeric_string); source.append(" s_left[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" s_right[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; \n");
+
+ // number of eigenvalues that are smaller than s_left / s_right
+ // (correspondence is realized via indices)
+ source.append(" __local unsigned int s_left_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; \n");
+ source.append(" __local unsigned int s_right_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; \n");
+
+ // helper for stream compaction
+ source.append(" __local unsigned int \n");
+ source.append(" s_compaction_list[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX + 1]; \n");
+
+ // state variables for whole block
+ // if 0 then compaction of second chunk of child intervals is not necessary
+ // (because all intervals had exactly one non-dead child)
+ source.append(" __local unsigned int compact_second_chunk; \n");
+ source.append(" __local unsigned int all_threads_converged; \n");
+
+ // number of currently active threads
+ source.append(" __local unsigned int num_threads_active; \n");
+
+ // number of threads to use for stream compaction
+ source.append(" __local unsigned int num_threads_compaction; \n");
+
+ // helper for exclusive scan
+ source.append(" __local unsigned int *s_compaction_list_exc = s_compaction_list + 1; \n");
+
+
+ // variables for currently processed interval
+ // left and right limit of active interval
+ source.append(" "); source.append(numeric_string); source.append(" left = 0.0f; \n");
+ source.append(" "); source.append(numeric_string); source.append(" right = 0.0f; \n");
+ source.append(" unsigned int left_count = 0; \n");
+ source.append(" unsigned int right_count = 0; \n");
+ // midpoint of active interval
+ source.append(" "); source.append(numeric_string); source.append(" mid = 0.0f; \n");
+        // number of eigenvalues smaller than mid
+ source.append(" unsigned int mid_count = 0; \n");
+ // affected from compaction
+ source.append(" unsigned int is_active_second = 0; \n");
+
+ source.append(" s_compaction_list[lcl_id] = 0; \n");
+ source.append(" s_left[lcl_id] = 0.0; \n");
+ source.append(" s_right[lcl_id] = 0.0; \n");
+ source.append(" s_left_count[lcl_id] = 0; \n");
+ source.append(" s_right_count[lcl_id] = 0; \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+
+ // set up initial configuration
+ source.append(" if (0 == lcl_id) \n");
+ source.append(" { \n");
+ source.append(" s_left[0] = lg; \n");
+ source.append(" s_right[0] = ug; \n");
+ source.append(" s_left_count[0] = lg_eig_count; \n");
+ source.append(" s_right_count[0] = ug_eig_count; \n");
+
+ source.append(" compact_second_chunk = 0; \n");
+ source.append(" num_threads_active = 1; \n");
+
+ source.append(" num_threads_compaction = 1; \n");
+ source.append(" } \n");
+
+ // for all active threads read intervals from the last level
+ // the number of (worst case) active threads per level l is 2^l
+
+ source.append(" while (true) \n");
+ source.append(" { \n");
+
+ source.append(" all_threads_converged = 1; \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+
+ source.append(" is_active_second = 0; \n");
+ source.append(" subdivideActiveIntervalMulti(lcl_id, \n");
+ source.append(" s_left, s_right, s_left_count, s_right_count, \n");
+ source.append(" num_threads_active, \n");
+ source.append(" &left, &right, &left_count, &right_count, \n");
+ source.append(" &mid, &all_threads_converged); \n");
+ // source.append(" output[lcl_id] = s_left; \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+
+ // check if done
+ source.append(" if (1 == all_threads_converged) \n");
+ source.append(" { \n");
+ source.append(" break; \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+
+ // compute number of eigenvalues smaller than mid
+ // use all threads for reading the necessary matrix data from global
+ // memory
+ // use s_left and s_right as scratch space for diagonal and
+ // superdiagonal of matrix
+ source.append(" mid_count = computeNumSmallerEigenvals(g_d, g_s, n, mid, \n");
+ source.append(" lcl_id, num_threads_active, \n");
+ source.append(" s_left, s_right, \n");
+ source.append(" (left == right)); \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+
+ // store intervals
+ // for all threads store the first child interval in a continuous chunk of
+ // memory, and the second child interval -- if it exists -- in a second
+ // chunk; it is likely that all threads reach convergence up to
+        // \a epsilon at the same level; furthermore, at higher levels most / all
+        // threads will have only one child. Storing the first child compactly
+        // (first) avoids a compaction step on the first chunk and (second) makes
+        // compaction of the second chunk unnecessary at higher levels (when all
+        // threads / intervals have exactly one child)
+ source.append(" if (lcl_id < num_threads_active) \n");
+ source.append(" { \n");
+
+ source.append(" if (left != right) \n");
+ source.append(" { \n");
+
+ // store intervals
+ source.append(" storeNonEmptyIntervals(lcl_id, num_threads_active, \n");
+ source.append(" s_left, s_right, s_left_count, s_right_count, \n");
+ source.append(" left, mid, right, \n");
+ source.append(" left_count, mid_count, right_count, \n");
+ source.append(" epsilon, &compact_second_chunk, \n");
+ source.append(" s_compaction_list_exc, \n");
+ source.append(" &is_active_second); \n");
+ source.append(" } \n");
+ source.append(" else \n");
+ source.append(" { \n");
+
+ source.append(" storeIntervalConverged(s_left, s_right, s_left_count, s_right_count, \n");
+ source.append(" &left, &mid, &right, \n");
+ source.append(" &left_count, &mid_count, &right_count, \n");
+ source.append(" s_compaction_list_exc, &compact_second_chunk, \n");
+ source.append(" num_threads_active, \n");
+ source.append(" &is_active_second); \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ // necessary so that compact_second_chunk is up-to-date
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+
+ // perform compaction of chunk where second children are stored
+        // scan of (num_threads_active / 2) elements, thus at most
+ // (num_threads_active / 4) threads are needed
+ source.append(" if (compact_second_chunk > 0) \n");
+ source.append(" { \n");
+
+ source.append(" createIndicesCompaction(s_compaction_list_exc, num_threads_compaction); \n");
+
+ source.append(" compactIntervals(s_left, s_right, s_left_count, s_right_count, \n");
+ source.append(" mid, right, mid_count, right_count, \n");
+ source.append(" s_compaction_list, num_threads_active, \n");
+ source.append(" is_active_second); \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+
+ source.append(" if (0 == lcl_id) \n");
+ source.append(" { \n");
+
+ // update number of active threads with result of reduction
+ source.append(" num_threads_active += s_compaction_list[num_threads_active]; \n");
+
+ source.append(" num_threads_compaction = ceilPow2(num_threads_active); \n");
+
+ source.append(" compact_second_chunk = 0; \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+
+ // write resulting intervals to global mem
+      // for all threads that have converged to an eigenvalue, write it to
+      // a separate array
+
+ // at most n valid intervals
+ source.append(" if (lcl_id < n) \n");
+ source.append(" { \n");
+ // intervals converged so left and right limit are identical
+ source.append(" g_left[lcl_id] = s_left[lcl_id]; \n");
+ // left count is sufficient to have global order
+ source.append(" g_left_count[lcl_id] = s_left_count[lcl_id]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Perform second step of bisection algorithm for large matrices for intervals that after the first step contained more than one eigenvalue
+ //
+ // g_d diagonal elements of symmetric, tridiagonal matrix
+ // g_s superdiagonal elements of symmetric, tridiagonal matrix
+ // n matrix size
+ // blocks_mult start addresses of blocks of intervals that are processed by one block of threads, each of the intervals contains more than one eigenvalue
+ // blocks_mult_sum total number of eigenvalues / singleton intervals in one block of intervals
+ // g_left left limits of intervals
+ // g_right right limits of intervals
+ // g_left_count number of eigenvalues less than left limits
+ // g_right_count number of eigenvalues less than right limits
+ // g_lambda final eigenvalue
+ // g_pos index of eigenvalue (in ascending order)
+ // precision desired precision of eigenvalues
+ ////////////////////////////////////////////////////////////////////////////////
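The two index arrays drive the work distribution of this kernel: work-group g processes the intervals [blocks_mult[g], blocks_mult[g+1]) and writes the eigenvalues it finds to g_lambda / g_pos starting at offset blocks_mult_sum[g]. A short sketch of that interpretation (illustrative only; the kernel reads the same three values per group and omits any host-side structure like this):

    #include <vector>

    // Sequential sketch of how the index arrays are interpreted.
    struct BlockRange
    {
      unsigned int first_interval;   // blocks_mult[g]
      unsigned int last_interval;    // blocks_mult[g + 1] (one past the end)
      unsigned int output_offset;    // blocks_mult_sum[g]: start of this group's g_lambda / g_pos writes
    };

    inline std::vector<BlockRange> block_ranges(std::vector<unsigned int> const & blocks_mult,
                                                std::vector<unsigned int> const & blocks_mult_sum,
                                                unsigned int num_groups)
    {
      std::vector<BlockRange> ranges(num_groups);
      for (unsigned int g = 0; g < num_groups; ++g)
        ranges[g] = BlockRange{ blocks_mult[g], blocks_mult[g + 1], blocks_mult_sum[g] };
      return ranges;
    }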
+
+ template <typename StringType>
+ void generate_bisect_kernel_bisectKernelLarge_MultIntervals(StringType & source, std::string const & numeric_string)
+ {
+ source.append(" __kernel \n");
+ source.append(" void \n");
+ source.append(" bisectKernelLarge_MultIntervals(__global "); source.append(numeric_string); source.append(" *g_d, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *g_s, \n");
+ source.append(" const unsigned int n, \n");
+ source.append(" __global unsigned int *blocks_mult, \n");
+ source.append(" __global unsigned int *blocks_mult_sum, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *g_left, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *g_right, \n");
+ source.append(" __global unsigned int *g_left_count, \n");
+ source.append(" __global unsigned int *g_right_count, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *g_lambda, \n");
+ source.append(" __global unsigned int *g_pos, \n");
+ source.append(" "); source.append(numeric_string); source.append(" precision \n");
+ source.append(" ) \n");
+ source.append(" { \n");
+ source.append(" g_s = g_s + 1; \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint grp_id = get_group_id(0); \n");
+ source.append(" uint grp_nm = get_num_groups(0); \n");
+ source.append(" uint lcl_id = get_local_id(0); \n");
+ source.append(" uint lcl_sz = get_local_size(0); \n");
+
+ source.append(" const unsigned int tid = lcl_id; \n");
+
+ // left and right limits of interval
+ source.append(" __local "); source.append(numeric_string); source.append(" s_left[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK]; \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" s_right[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK]; \n");
+
+ // number of eigenvalues smaller than interval limits
+ source.append(" __local unsigned int s_left_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK]; \n");
+ source.append(" __local unsigned int s_right_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK]; \n");
+
+ // helper array for chunk compaction of second chunk
+ source.append(" __local unsigned int s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1]; \n");
+ // compaction list helper for exclusive scan
+ source.append(" __local unsigned int *s_compaction_list_exc = s_compaction_list + 1; \n");
+
+ // flag if all threads are converged
+ source.append(" __local unsigned int all_threads_converged; \n");
+ // number of active threads
+ source.append(" __local unsigned int num_threads_active; \n");
+ // number of threads to employ for compaction
+ source.append(" __local unsigned int num_threads_compaction; \n");
+ // flag if second chunk has to be compacted
+ source.append(" __local unsigned int compact_second_chunk; \n");
+
+ // parameters of block of intervals processed by this block of threads
+ source.append(" __local unsigned int c_block_start; \n");
+ source.append(" __local unsigned int c_block_end; \n");
+ source.append(" __local unsigned int c_block_offset_output; \n");
+
+ // midpoint of currently active interval of the thread
+ source.append(" "); source.append(numeric_string); source.append(" mid = 0.0f; \n");
+ // number of eigenvalues smaller than \a mid
+ source.append(" unsigned int mid_count = 0; \n");
+ // current interval parameter
+ source.append(" "); source.append(numeric_string); source.append(" left = 0.0f; \n");
+ source.append(" "); source.append(numeric_string); source.append(" right = 0.0f; \n");
+ source.append(" unsigned int left_count = 0; \n");
+ source.append(" unsigned int right_count = 0; \n");
+ // helper for compaction, keep track which threads have a second child
+ source.append(" unsigned int is_active_second = 0; \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ // initialize common start conditions
+ source.append(" if (0 == tid) \n");
+ source.append(" { \n");
+
+ source.append(" c_block_start = blocks_mult[grp_id]; \n");
+ source.append(" c_block_end = blocks_mult[grp_id + 1]; \n");
+ source.append(" c_block_offset_output = blocks_mult_sum[grp_id]; \n");
+ source.append(" \n");
+
+ source.append(" num_threads_active = c_block_end - c_block_start; \n");
+ source.append(" s_compaction_list[0] = 0; \n");
+ source.append(" num_threads_compaction = ceilPow2(num_threads_active); \n");
+
+ source.append(" all_threads_converged = 1; \n");
+ source.append(" compact_second_chunk = 0; \n");
+ source.append(" } \n");
+ source.append(" s_left_count [tid] = 42; \n");
+ source.append(" s_right_count[tid] = 42; \n");
+ source.append(" s_left_count [tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0; \n");
+ source.append(" s_right_count[tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0; \n");
+ source.append(" \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+ source.append(" \n");
+
+ // read data into shared memory
+ source.append(" if (tid < num_threads_active) \n");
+ source.append(" { \n");
+
+ source.append(" s_left[tid] = g_left[c_block_start + tid]; \n");
+ source.append(" s_right[tid] = g_right[c_block_start + tid]; \n");
+ source.append(" s_left_count[tid] = g_left_count[c_block_start + tid]; \n");
+ source.append(" s_right_count[tid] = g_right_count[c_block_start + tid]; \n");
+ source.append(" \n");
+ source.append(" } \n");
+ source.append(" \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+ source.append(" unsigned int iter = 0; \n");
+ // do until all threads converged
+ source.append(" while (true) \n");
+ source.append(" { \n");
+ source.append(" iter++; \n");
+ //for (int iter=0; iter < 0; iter++) {
+ source.append(" s_compaction_list[lcl_id] = 0; \n");
+ source.append(" s_compaction_list[lcl_id + lcl_sz] = 0; \n");
+ source.append(" s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0; \n");
+
+ // subdivide interval if currently active and not already converged
+ source.append(" subdivideActiveIntervalMulti(tid, s_left, s_right, \n");
+ source.append(" s_left_count, s_right_count, \n");
+ source.append(" num_threads_active, \n");
+ source.append(" &left, &right, &left_count, &right_count, \n");
+ source.append(" &mid, &all_threads_converged); \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+
+ // stop if all eigenvalues have been found
+ source.append(" if (1 == all_threads_converged) \n");
+ source.append(" { \n");
+ source.append(" \n");
+ source.append(" break; \n");
+ source.append(" } \n");
+
+ // compute number of eigenvalues smaller than mid for active and not
+ // converged intervals, use all threads for loading data from gmem and
+ // s_left and s_right as scratch space to store the data load from gmem
+ // in shared memory
+ source.append(" mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n, \n");
+ source.append(" mid, tid, num_threads_active, \n");
+ source.append(" s_left, s_right, \n");
+ source.append(" (left == right)); \n");
+ source.append(" \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+
+ source.append(" if (tid < num_threads_active) \n");
+ source.append(" { \n");
+ source.append(" \n");
+ // store intervals
+ source.append(" if (left != right) \n");
+ source.append(" { \n");
+
+ source.append(" storeNonEmptyIntervals(tid, num_threads_active, \n");
+ source.append(" s_left, s_right, s_left_count, s_right_count, \n");
+ source.append(" left, mid, right, \n");
+ source.append(" left_count, mid_count, right_count, \n");
+ source.append(" precision, &compact_second_chunk, \n");
+ source.append(" s_compaction_list_exc, \n");
+ source.append(" &is_active_second); \n");
+ source.append(" \n");
+ source.append(" } \n");
+ source.append(" else \n");
+ source.append(" { \n");
+
+ source.append(" storeIntervalConverged(s_left, s_right, s_left_count, s_right_count, \n");
+ source.append(" &left, &mid, &right, \n");
+ source.append(" &left_count, &mid_count, &right_count, \n");
+ source.append(" s_compaction_list_exc, &compact_second_chunk, \n");
+ source.append(" num_threads_active, \n");
+ source.append(" &is_active_second); \n");
+ source.append(" \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+
+ // compact second chunk of intervals if any of the threads generated
+ // two child intervals
+ source.append(" if (1 == compact_second_chunk) \n");
+ source.append(" { \n");
+
+ source.append(" createIndicesCompaction(s_compaction_list_exc, num_threads_compaction); \n");
+ source.append(" compactIntervals(s_left, s_right, s_left_count, s_right_count, \n");
+ source.append(" mid, right, mid_count, right_count, \n");
+ source.append(" s_compaction_list, num_threads_active, \n");
+ source.append(" is_active_second); \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+
+ // update state variables
+ source.append(" if (0 == tid) \n");
+ source.append(" { \n");
+ source.append(" num_threads_active += s_compaction_list[num_threads_active]; \n");
+ source.append(" num_threads_compaction = ceilPow2(num_threads_active); \n");
+
+ source.append(" compact_second_chunk = 0; \n");
+ source.append(" all_threads_converged = 1; \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+
+ // clear
+ source.append(" s_compaction_list_exc[lcl_id] = 0; \n");
+ source.append(" s_compaction_list_exc[lcl_id + lcl_sz] = 0; \n");
+ source.append(" \n");
+ source.append(" if (num_threads_compaction > lcl_sz) \n");
+ source.append(" { \n");
+ source.append(" break; \n");
+ source.append(" } \n");
+
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+
+ source.append(" } \n"); // end until all threads converged
+
+ // write data back to global memory
+ source.append(" if (tid < num_threads_active) \n");
+ source.append(" { \n");
+
+ source.append(" unsigned int addr = c_block_offset_output + tid; \n");
+ source.append(" \n");
+ source.append(" g_lambda[addr] = s_left[tid]; \n");
+ source.append(" g_pos[addr] = s_right_count[tid]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ }
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Determine eigenvalues for large matrices for intervals that after the first step contained one eigenvalue
+ //
+ // g_d diagonal elements of symmetric, tridiagonal matrix
+ // g_s superdiagonal elements of symmetric, tridiagonal matrix
+ // n matrix size
+ // num_intervals total number of intervals containing one eigenvalue after the first step
+ // g_left left interval limits
+ // g_right right interval limits
+      //  g_pos  index of interval / number of eigenvalues that are smaller than right interval limit
+ // precision desired precision of eigenvalues
+ ////////////////////////////////////////////////////////////////////////////////
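For intervals that already enclose exactly one eigenvalue, the remaining work is plain bisection of one interval per thread. A sequential sketch of the refinement loop, using the same child selection and convergence test as the CUDA variant of this kernel further below (count_below stands for the Sturm count sketched earlier; names are illustrative):

    #include <algorithm>
    #include <cmath>

    // Refine one interval [left, right] known to contain exactly one eigenvalue.
    // count_below(x) returns the number of eigenvalues smaller than x.
    template<typename CountBelowF>
    double refine_single_eigenvalue(double left, double right,
                                    unsigned int right_count,   // eigenvalues < right
                                    double precision,
                                    CountBelowF count_below)
    {
      while (true)
      {
        double mid = 0.5 * (left + right);
        // exactly one child interval survives
        if (count_below(mid) == right_count)
          right = mid;
        else
          left = mid;

        double width = right - left;
        double tol   = std::max(std::fabs(right), std::fabs(left)) * precision;
        if (width < std::min(precision, tol))
          return 0.5 * (left + right);   // converged eigenvalue
      }
    }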
+
+ template <typename StringType>
+ void generate_bisect_kernel_bisectKernelLarge_OneIntervals(StringType & source, std::string const & numeric_string)
+ {
+ source.append(" __kernel \n");
+ source.append(" void \n");
+ source.append(" bisectKernelLarge_OneIntervals(__global "); source.append(numeric_string); source.append(" *g_d, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *g_s, \n");
+ source.append(" const unsigned int n, \n");
+ source.append(" unsigned int num_intervals, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *g_left, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *g_right, \n");
+ source.append(" __global unsigned int *g_pos, \n");
+ source.append(" "); source.append(numeric_string); source.append(" precision) \n");
+ source.append(" { \n");
+ source.append(" g_s = g_s + 1; \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint grp_id = get_group_id(0); \n");
+ source.append(" uint grp_nm = get_num_groups(0); \n");
+ source.append(" uint lcl_id = get_local_id(0); \n");
+ source.append(" uint lcl_sz = get_local_size(0); \n");
+ source.append(" const unsigned int gtid = (lcl_sz * grp_id) + lcl_id; \n");
+ source.append(" __loca
<TRUNCATED>
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_onei.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_onei.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_onei.hpp
new file mode 100755
index 0000000..960f5c2
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_onei.hpp
@@ -0,0 +1,180 @@
+#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_ONEI_HPP_
+#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_ONEI_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/cuda/bisect_kernel_large_onei.hpp
+  @brief Determine eigenvalues for large matrices, for intervals that after the first step contained exactly one eigenvalue
+
+ Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+ the creation of derivative works is allowed by including the following statement:
+ "This software contains source code provided by NVIDIA Corporation."
+*/
+
+// includes, project
+#include "viennacl/linalg/detail/bisect/config.hpp"
+#include "viennacl/linalg/detail/bisect/util.hpp"
+// additional kernel
+#include "viennacl/linalg/cuda/bisect_util.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+////////////////////////////////////////////////////////////////////////////////
+//! Determine eigenvalues for large matrices for intervals that after
+//! the first step contained one eigenvalue
+//! @param g_d diagonal elements of symmetric, tridiagonal matrix
+//! @param g_s superdiagonal elements of symmetric, tridiagonal matrix
+//! @param n matrix size
+//! @param num_intervals total number of intervals containing one eigenvalue
+//! after the first step
+//! @param g_left left interval limits
+//! @param g_right right interval limits
+//! @param  g_pos  index of interval / number of eigenvalues that are smaller than
+//! right interval limit
+//! @param precision desired precision of eigenvalues
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+__global__
+void
+bisectKernelLarge_OneIntervals(const NumericT *g_d, const NumericT *g_s, const unsigned int n,
+ unsigned int num_intervals,
+ NumericT *g_left, NumericT *g_right,
+ unsigned int *g_pos,
+ NumericT precision)
+{
+
+ const unsigned int gtid = (blockDim.x * blockIdx.x) + threadIdx.x;
+
+ __shared__ NumericT s_left_scratch[VIENNACL_BISECT_MAX_THREADS_BLOCK];
+ __shared__ NumericT s_right_scratch[VIENNACL_BISECT_MAX_THREADS_BLOCK];
+
+ // active interval of thread
+ // left and right limit of current interval
+ NumericT left, right;
+  // number of eigenvalues smaller than the right limit (also corresponds to the
+  // global index of the eigenvalue contained in the active interval)
+ unsigned int right_count;
+ // flag if current thread converged
+ unsigned int converged = 0;
+ // midpoint when current interval is subdivided
+ NumericT mid = 0.0f;
+ // number of eigenvalues less than mid
+ unsigned int mid_count = 0;
+
+ // read data from global memory
+ if (gtid < num_intervals)
+ {
+ left = g_left[gtid];
+ right = g_right[gtid];
+ right_count = g_pos[gtid];
+ }
+
+
+ // flag to determine if all threads converged to eigenvalue
+ __shared__ unsigned int converged_all_threads;
+
+  // initialize shared flag
+ if (0 == threadIdx.x)
+ {
+ converged_all_threads = 0;
+ }
+
+ __syncthreads();
+
+ // process until all threads converged to an eigenvalue
+ while (true)
+ {
+
+ converged_all_threads = 1;
+
+ // update midpoint for all active threads
+ if ((gtid < num_intervals) && (0 == converged))
+ {
+ mid = computeMidpoint(left, right);
+ }
+
+ // find number of eigenvalues that are smaller than midpoint
+ mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n,
+ mid, gtid, num_intervals,
+ s_left_scratch,
+ s_right_scratch,
+ converged);
+
+ __syncthreads();
+
+ // for all active threads
+ if ((gtid < num_intervals) && (0 == converged))
+ {
+
+ // update intervals -- always one child interval survives
+ if (right_count == mid_count)
+ {
+ right = mid;
+ }
+ else
+ {
+ left = mid;
+ }
+
+ // check for convergence
+ NumericT t0 = right - left;
+ NumericT t1 = max(abs(right), abs(left)) * precision;
+
+ if (t0 < min(precision, t1))
+ {
+ NumericT lambda = computeMidpoint(left, right);
+ left = lambda;
+ right = lambda;
+
+ converged = 1;
+ }
+ else
+ {
+ converged_all_threads = 0;
+ }
+ }
+
+ __syncthreads();
+
+ if (1 == converged_all_threads)
+ {
+ break;
+ }
+
+ __syncthreads();
+ }
+
+ // write data back to global memory
+ __syncthreads();
+
+ if (gtid < num_intervals)
+ {
+ // intervals converged so left and right interval limit are both identical
+ // and identical to the eigenvalue
+ g_left[gtid] = left;
+ }
+}
+} // namespace cuda
+} // namespace linalg
+} // namespace viennacl
+#endif // #ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_ONEI_HPP_
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_small.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_small.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_small.hpp
new file mode 100755
index 0000000..310b381
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_small.hpp
@@ -0,0 +1,261 @@
+#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_SMALL_HPP_
+#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_SMALL_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/cuda/bisect_kernel_small.hpp
+ @brief Determine eigenvalues for small symmetric, tridiagonal matrix
+
+ Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+ the creation of derivative works is allowed by including the following statement:
+ "This software contains source code provided by NVIDIA Corporation."
+*/
+
+// includes, project
+#include "viennacl/linalg/detail/bisect/config.hpp"
+#include "viennacl/linalg/detail/bisect/util.hpp"
+// additional kernel
+#include "viennacl/linalg/cuda/bisect_util.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
+/** @brief Bisection to find eigenvalues of a real, symmetric, and tridiagonal matrix
+*
+* @param g_d diagonal elements in global memory
+* @param  g_s  superdiagonal elements in global memory (stored so that the element *(g_s - 1) can be accessed and equals 0)
+* @param n size of matrix
+* @param g_left helper array
+* @param g_right helper array
+* @param g_left_count helper array
+* @param g_right_count helper array
+* @param lg lower bound of input interval (e.g. Gerschgorin interval)
+* @param ug upper bound of input interval (e.g. Gerschgorin interval)
+* @param lg_eig_count number of eigenvalues that are smaller than lg
+* @param  ug_eig_count  number of eigenvalues that are smaller than ug
+* @param epsilon desired accuracy of eigenvalues to compute
+*/
+template<typename NumericT>
+__global__
+void
+bisectKernelSmall(const NumericT *g_d, const NumericT *g_s, const unsigned int n,
+ NumericT * g_left, NumericT *g_right,
+ unsigned int *g_left_count, unsigned int *g_right_count,
+ const NumericT lg, const NumericT ug,
+ const unsigned int lg_eig_count, const unsigned int ug_eig_count,
+ NumericT epsilon
+ )
+{
+ // intervals (store left and right because the subdivision tree is in general
+  // not dense)
+ __shared__ NumericT s_left[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];
+ __shared__ NumericT s_right[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];
+
+ // number of eigenvalues that are smaller than s_left / s_right
+ // (correspondence is realized via indices)
+ __shared__ unsigned int s_left_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];
+ __shared__ unsigned int s_right_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];
+
+ // helper for stream compaction
+ __shared__ unsigned int
+ s_compaction_list[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX + 1];
+
+ // state variables for whole block
+ // if 0 then compaction of second chunk of child intervals is not necessary
+ // (because all intervals had exactly one non-dead child)
+ __shared__ unsigned int compact_second_chunk;
+ __shared__ unsigned int all_threads_converged;
+
+ // number of currently active threads
+ __shared__ unsigned int num_threads_active;
+
+ // number of threads to use for stream compaction
+ __shared__ unsigned int num_threads_compaction;
+
+ // helper for exclusive scan
+ unsigned int *s_compaction_list_exc = s_compaction_list + 1;
+
+
+ // variables for currently processed interval
+ // left and right limit of active interval
+ NumericT left = 0.0f;
+ NumericT right = 0.0f;
+ unsigned int left_count = 0;
+ unsigned int right_count = 0;
+ // midpoint of active interval
+ NumericT mid = 0.0f;
+  // number of eigenvalues smaller than mid
+ unsigned int mid_count = 0;
+ // affected from compaction
+ unsigned int is_active_second = 0;
+
+ s_compaction_list[threadIdx.x] = 0;
+ s_left[threadIdx.x] = 0;
+ s_right[threadIdx.x] = 0;
+ s_left_count[threadIdx.x] = 0;
+ s_right_count[threadIdx.x] = 0;
+
+ __syncthreads();
+
+ // set up initial configuration
+ if (0 == threadIdx.x)
+ {
+ s_left[0] = lg;
+ s_right[0] = ug;
+ s_left_count[0] = lg_eig_count;
+ s_right_count[0] = ug_eig_count;
+
+ compact_second_chunk = 0;
+ num_threads_active = 1;
+
+ num_threads_compaction = 1;
+ }
+
+ // for all active threads read intervals from the last level
+ // the number of (worst case) active threads per level l is 2^l
+ while (true)
+ {
+
+ all_threads_converged = 1;
+ __syncthreads();
+
+ is_active_second = 0;
+ subdivideActiveIntervalMulti(threadIdx.x,
+ s_left, s_right, s_left_count, s_right_count,
+ num_threads_active,
+ left, right, left_count, right_count,
+ mid, all_threads_converged);
+
+ __syncthreads();
+
+ // check if done
+ if (1 == all_threads_converged)
+ {
+ break;
+ }
+
+ __syncthreads();
+
+ // compute number of eigenvalues smaller than mid
+ // use all threads for reading the necessary matrix data from global
+ // memory
+ // use s_left and s_right as scratch space for diagonal and
+ // superdiagonal of matrix
+ mid_count = computeNumSmallerEigenvals(g_d, g_s, n, mid,
+ threadIdx.x, num_threads_active,
+ s_left, s_right,
+ (left == right));
+
+ __syncthreads();
+
+ // store intervals
+ // for all threads store the first child interval in a continuous chunk of
+ // memory, and the second child interval -- if it exists -- in a second
+ // chunk; it is likely that all threads reach convergence up to
+  // \a epsilon at the same level; furthermore, at higher levels most / all
+  // threads will have only one child. Storing the first child compactly
+  // (first) avoids a compaction step on the first chunk and (second) makes
+  // compaction of the second chunk unnecessary at higher levels (when all
+  // threads / intervals have exactly one child)
+ if (threadIdx.x < num_threads_active)
+ {
+
+ if (left != right)
+ {
+
+ // store intervals
+ storeNonEmptyIntervals(threadIdx.x, num_threads_active,
+ s_left, s_right, s_left_count, s_right_count,
+ left, mid, right,
+ left_count, mid_count, right_count,
+ epsilon, compact_second_chunk,
+ s_compaction_list_exc,
+ is_active_second);
+ }
+ else
+ {
+
+ storeIntervalConverged(s_left, s_right, s_left_count, s_right_count,
+ left, mid, right,
+ left_count, mid_count, right_count,
+ s_compaction_list_exc, compact_second_chunk,
+ num_threads_active,
+ is_active_second);
+ }
+ }
+
+ // necessary so that compact_second_chunk is up-to-date
+ __syncthreads();
+
+ // perform compaction of chunk where second children are stored
+ // scan of (num_threads_active / 2) elements, thus at most
+ // (num_threads_active / 4) threads are needed
+ if (compact_second_chunk > 0)
+ {
+
+ createIndicesCompaction(s_compaction_list_exc, num_threads_compaction);
+
+ compactIntervals(s_left, s_right, s_left_count, s_right_count,
+ mid, right, mid_count, right_count,
+ s_compaction_list, num_threads_active,
+ is_active_second);
+ }
+
+ __syncthreads();
+
+ if (0 == threadIdx.x)
+ {
+
+ // update number of active threads with result of reduction
+ num_threads_active += s_compaction_list[num_threads_active];
+
+ num_threads_compaction = ceilPow2(num_threads_active);
+
+ compact_second_chunk = 0;
+ }
+
+ __syncthreads();
+
+ }
+
+ __syncthreads();
+
+ // write resulting intervals to global mem
+ // for all threads write if they have been converged to an eigenvalue to
+ // a separate array
+
+ // at most n valid intervals
+ if (threadIdx.x < n)
+ {
+
+ // intervals converged, so left and right limits are identical
+ g_left[threadIdx.x] = s_left[threadIdx.x];
+ // left count is sufficient to have global order
+ g_left_count[threadIdx.x] = s_left_count[threadIdx.x];
+ }
+}
+} // namespace cuda
+} // namespace linalg
+} // namespace viennacl
+#endif // #ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_SMALL_HPP_
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_util.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_util.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_util.hpp
new file mode 100755
index 0000000..e2e262c
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_util.hpp
@@ -0,0 +1,613 @@
+#ifndef VIENNACL_LINALG_CUDA_BISECT_BISECT_UTIL_HPP_
+#define VIENNACL_LINALG_CUDA_BISECT_BISECT_UTIL_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/cuda/bisect_util.hpp
+ @brief Utility / shared functionality for bisection kernels
+
+ Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+ the creation of derivative works is allowed by including the following statement:
+ "This software contains source code provided by NVIDIA Corporation."
+*/
+
+// includes, project
+#include "viennacl/linalg/detail/bisect/config.hpp"
+#include "viennacl/linalg/detail/bisect/util.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+////////////////////////////////////////////////////////////////////////////////
+//! Compute the next lower power of two of n
+//! @param n number for which the next lower power of two is sought
+////////////////////////////////////////////////////////////////////////////////
+__device__
+inline int
+floorPow2(int n)
+{
+
+ // early out if already power of two
+ if (0 == (n & (n-1)))
+ {
+ return n;
+ }
+
+ int exp;
+ frexp((float)n, &exp);
+ return (1 << (exp - 1));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compute the next higher power of two of n
+//! @param n number for which the next higher power of two is sought
+////////////////////////////////////////////////////////////////////////////////
+__device__
+inline int
+ceilPow2(int n)
+{
+
+ // early out if already power of two
+ if (0 == (n & (n-1)))
+ {
+ return n;
+ }
+
+ int exp;
+ frexp((float)n, &exp);
+ return (1 << exp);
+}
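For reference, the same next-power-of-two computations expressed with plain integer loops; these are hypothetical host-side helpers for illustration only (e.g. floorPow2(6) == 4 and ceilPow2(6) == 8), while the device versions above use frexp() instead of a loop:

int floor_pow2_host(int n)
{
  if ((n & (n - 1)) == 0) return n;      // already a power of two
  int p = 1;
  while ((p << 1) <= n) p <<= 1;         // largest power of two <= n
  return p;
}

int ceil_pow2_host(int n)
{
  if ((n & (n - 1)) == 0) return n;
  int p = 1;
  while (p < n) p <<= 1;                 // smallest power of two >= n
  return p;
}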
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compute midpoint of interval [\a left, \a right] avoiding overflow if
+//! possible
+//! @param left left / lower limit of interval
+//! @param right right / upper limit of interval
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+__device__
+inline NumericT
+computeMidpoint(const NumericT left, const NumericT right)
+{
+
+ NumericT mid;
+
+ if (viennacl::linalg::detail::sign_f(left) == viennacl::linalg::detail::sign_f(right))
+ {
+ mid = left + (right - left) * 0.5f;
+ }
+ else
+ {
+ mid = (left + right) * 0.5f;
+ }
+
+ return mid;
+}
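A short worked example of why the same-sign branch matters in single precision (values chosen near FLT_MAX purely for illustration):

float left  = 3.0e38f, right = 3.4e38f;        // same sign, both close to FLT_MAX
float naive = (left + right) * 0.5f;            // left + right overflows to +inf
float safe  = left + (right - left) * 0.5f;     // stays finite: 3.2e38f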
+
+////////////////////////////////////////////////////////////////////////////////
+//! Check if interval converged and store appropriately
+//! @param addr address where to store the information of the interval
+//! @param s_left shared memory storage for left interval limits
+//! @param s_right shared memory storage for right interval limits
+//! @param s_left_count shared memory storage for number of eigenvalues less
+//! than left interval limits
+//! @param s_right_count shared memory storage for number of eigenvalues less
+//! than right interval limits
+//! @param left lower limit of interval
+//! @param right upper limit of interval
+//! @param left_count eigenvalues less than \a left
+//! @param right_count eigenvalues less than \a right
+//! @param precision desired precision for eigenvalues
+////////////////////////////////////////////////////////////////////////////////
+template<class S, class T, class NumericT>
+__device__
+void
+storeInterval(unsigned int addr,
+ NumericT *s_left, NumericT *s_right,
+ T *s_left_count, T *s_right_count,
+ NumericT left, NumericT right,
+ S left_count, S right_count,
+ NumericT precision)
+{
+ s_left_count[addr] = left_count;
+ s_right_count[addr] = right_count;
+
+ // check if interval converged
+ NumericT t0 = abs(right - left);
+ NumericT t1 = max(abs(left), abs(right)) * precision;
+
+ if (t0 <= max(static_cast<NumericT>(VIENNACL_BISECT_MIN_ABS_INTERVAL), t1))
+ {
+ // compute mid point
+ NumericT lambda = computeMidpoint(left, right);
+
+ // mark as converged
+ s_left[addr] = lambda;
+ s_right[addr] = lambda;
+ }
+ else
+ {
+
+ // store current limits
+ s_left[addr] = left;
+ s_right[addr] = right;
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compute number of eigenvalues that are smaller than x given a symmetric,
+//! real, and tridiagonal matrix
+//! @param g_d diagonal elements stored in global memory
+//! @param g_s superdiagonal elements stored in global memory
+//! @param n size of matrix
+//! @param x value for which the number of smaller eigenvalues is sought
+//! @param tid thread identifier (e.g. threadIdx.x or gtid)
+//! @param num_intervals_active number of active intervals / threads that
+//! currently process an interval
+//! @param s_d scratch space to store diagonal entries of the tridiagonal
+//! matrix in shared memory
+//! @param s_s scratch space to store superdiagonal entries of the tridiagonal
+//! matrix in shared memory
+//! @param converged flag if the current thread is already converged (that
+//! is count does not have to be computed)
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+__device__
+inline unsigned int
+computeNumSmallerEigenvals(const NumericT *g_d, const NumericT *g_s, const unsigned int n,
+ const NumericT x,
+ const unsigned int tid,
+ const unsigned int num_intervals_active,
+ NumericT *s_d, NumericT *s_s,
+ unsigned int converged
+ )
+{
+
+ NumericT delta = 1.0f;
+ unsigned int count = 0;
+
+ __syncthreads();
+
+ // read data into shared memory
+ if (threadIdx.x < n)
+ {
+ s_d[threadIdx.x] = *(g_d + threadIdx.x);
+ s_s[threadIdx.x] = *(g_s + threadIdx.x - 1);
+ }
+
+ __syncthreads();
+
+ // perform loop only for active threads
+ if ((tid < num_intervals_active) && (0 == converged))
+ {
+
+ // perform (optimized) Gaussian elimination to determine the number
+ // of eigenvalues that are smaller than x
+ for (unsigned int k = 0; k < n; ++k)
+ {
+ delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta;
+ count += (delta < 0) ? 1 : 0;
+ }
+
+ } // end if thread currently processing an interval
+
+ return count;
+}
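What each thread computes here is a Sturm-type count. A serial host-side sketch of the same recurrence, assuming the superdiagonal is passed with sub[0] unused (mirroring the g_s indexing above); illustration only, not part of the patch:

#include <vector>

// Number of eigenvalues of the symmetric tridiagonal matrix (diag, sub) smaller than x:
// count the negative terms of d_k = diag[k] - x - sub[k]^2 / d_{k-1}, starting from d = 1.
unsigned int count_eigenvalues_below(std::vector<double> const & diag,
                                     std::vector<double> const & sub,
                                     double x)
{
  double delta = 1.0;
  unsigned int count = 0;
  for (std::size_t k = 0; k < diag.size(); ++k)
  {
    double off = (k > 0) ? sub[k] * sub[k] : 0.0;
    delta = diag[k] - x - off / delta;
    if (delta < 0.0) ++count;
  }
  return count;
}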
+////////////////////////////////////////////////////////////////////////////////
+//! Compute number of eigenvalues that are smaller than x given a symmetric,
+//! real, and tridiagonal matrix
+//! @param g_d diagonal elements stored in global memory
+//! @param g_s superdiagonal elements stored in global memory
+//! @param n size of matrix
+//! @param x value for which the number of smaller eigenvalues is sought
+//! @param tid thread identifier (e.g. threadIdx.x or gtid)
+//! @param num_intervals_active number of active intervals / threads that
+//! currently process an interval
+//! @param s_d scratch space to store diagonal entries of the tridiagonal
+//! matrix in shared memory
+//! @param s_s scratch space to store superdiagonal entries of the tridiagonal
+//! matrix in shared memory
+//! @param converged flag if the current thread is already converged (that
+//! is count does not have to be computed)
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+__device__
+inline unsigned int
+computeNumSmallerEigenvalsLarge(const NumericT *g_d, const NumericT *g_s, const unsigned int n,
+ const NumericT x,
+ const unsigned int tid,
+ const unsigned int num_intervals_active,
+ NumericT *s_d, NumericT *s_s,
+ unsigned int converged
+ )
+{
+ NumericT delta = 1.0f;
+ unsigned int count = 0;
+
+ unsigned int rem = n;
+
+ // do until whole diagonal and superdiagonal has been loaded and processed
+ for (unsigned int i = 0; i < n; i += blockDim.x)
+ {
+
+ __syncthreads();
+
+ // read new chunk of data into shared memory
+ if ((i + threadIdx.x) < n)
+ {
+
+ s_d[threadIdx.x] = *(g_d + i + threadIdx.x);
+ s_s[threadIdx.x] = *(g_s + i + threadIdx.x - 1);
+ }
+
+ __syncthreads();
+
+
+ if (tid < num_intervals_active)
+ {
+
+ // perform (optimized) Gaussian elimination to determine the number
+ // of eigenvalues that are smaller than x
+ for (unsigned int k = 0; k < min(rem,blockDim.x); ++k)
+ {
+ delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta;
+ // delta = (abs( delta) < (1.0e-10)) ? -(1.0e-10) : delta;
+ count += (delta < 0) ? 1 : 0;
+ }
+
+ } // end if thread currently processing an interval
+
+ rem -= blockDim.x;
+ }
+
+ return count;
+}
+
+/** @brief Store all non-empty intervals resulting from the subdivision of the interval currently processed by the thread.
+*
+* @param addr base address for storing intervals
+* @param num_threads_active number of threads / intervals in current sweep
+* @param s_left shared memory storage for left interval limits
+* @param s_right shared memory storage for right interval limits
+* @param s_left_count shared memory storage for number of eigenvalues less than left interval limits
+* @param s_right_count shared memory storage for number of eigenvalues less than right interval limits
+* @param left lower limit of interval
+* @param mid midpoint of interval
+* @param right upper limit of interval
+* @param left_count eigenvalues less than \a left
+* @param mid_count eigenvalues less than \a mid
+* @param right_count eigenvalues less than \a right
+* @param precision desired precision for eigenvalues
+* @param compact_second_chunk shared mem flag if second chunk is used and ergo requires compaction
+* @param s_compaction_list_exc helper array for stream compaction, s_compaction_list_exc[tid] = 1 when the thread generated two child intervals
+* @param is_active_second mark if the thread has a second non-empty child interval
+*/
+template<class S, class T, class NumericT>
+__device__
+void
+storeNonEmptyIntervals(unsigned int addr,
+ const unsigned int num_threads_active,
+ NumericT *s_left, NumericT *s_right,
+ T *s_left_count, T *s_right_count,
+ NumericT left, NumericT mid, NumericT right,
+ const S left_count,
+ const S mid_count,
+ const S right_count,
+ NumericT precision,
+ unsigned int &compact_second_chunk,
+ T *s_compaction_list_exc,
+ unsigned int &is_active_second)
+{
+ // check if both child intervals are valid
+
+ if ((left_count != mid_count) && (mid_count != right_count))
+ {
+
+ // store the left interval
+ storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
+ left, mid, left_count, mid_count, precision);
+
+ // mark that a second interval has been generated, only stored after
+ // stream compaction of second chunk
+ is_active_second = 1;
+ s_compaction_list_exc[threadIdx.x] = 1;
+ compact_second_chunk = 1;
+ }
+ else
+ {
+
+ // only one non-empty child interval
+
+ // mark that no second child
+ is_active_second = 0;
+ s_compaction_list_exc[threadIdx.x] = 0;
+
+ // store the one valid child interval
+ if (left_count != mid_count)
+ {
+ storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
+ left, mid, left_count, mid_count, precision);
+ }
+ else
+ {
+ storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
+ mid, right, mid_count, right_count, precision);
+ }
+
+ }
+}
+////////////////////////////////////////////////////////////////////////////////
+//! Create indices for compaction, that is process \a s_compaction_list_exc
+//! which is 1 for intervals that generated a second child and 0 otherwise
+//! and create for each of the non-zero elements the index where the new
+//! interval belongs to in a compact representation of all generated second
+//! children
+//! @param s_compaction_list_exc list containing the flags indicating which threads
+//! generated two children
+//! @param num_threads_compaction number of threads to employ for compaction
+////////////////////////////////////////////////////////////////////////////////
+template<class T>
+__device__
+void
+createIndicesCompaction(T *s_compaction_list_exc,
+ unsigned int num_threads_compaction)
+{
+
+ unsigned int offset = 1;
+ const unsigned int tid = threadIdx.x;
+ // if(tid == 0)
+ // printf("num_threads_compaction = %u\n", num_threads_compaction);
+
+ // higher levels of scan tree
+ for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)
+ {
+
+ __syncthreads();
+
+ if (tid < d)
+ {
+
+ unsigned int ai = offset*(2*tid+1)-1;
+ unsigned int bi = offset*(2*tid+2)-1;
+
+ s_compaction_list_exc[bi] = s_compaction_list_exc[bi]
+ + s_compaction_list_exc[ai];
+ }
+
+ offset <<= 1;
+ }
+
+ // traverse down tree: first down to level 2 across
+ for (int d = 2; d < num_threads_compaction; d <<= 1)
+ {
+
+ offset >>= 1;
+ __syncthreads();
+
+ if (tid < (d-1))
+ {
+
+ unsigned int ai = offset*(tid+1) - 1;
+ unsigned int bi = ai + (offset >> 1);
+
+ s_compaction_list_exc[bi] = s_compaction_list_exc[bi]
+ + s_compaction_list_exc[ai];
+ }
+ }
+
+ __syncthreads();
+
+}
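The two loops above form an in-place tree scan; its net effect, read back through s_compaction_list, amounts to an exclusive prefix sum of the "has second child" flags. A serial host-side sketch of that effect (illustration only, not the in-kernel scan itself):

#include <vector>

// flags {1, 0, 1, 1} -> destination indices {0, 1, 1, 2}: each second child is written
// right after the second children produced by lower-numbered threads.
std::vector<unsigned int> compaction_indices(std::vector<unsigned int> const & flags)
{
  std::vector<unsigned int> index(flags.size(), 0);
  unsigned int running = 0;
  for (std::size_t i = 0; i < flags.size(); ++i)
  {
    index[i] = running;          // exclusive prefix sum
    running += flags[i];
  }
  return index;
}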
+
+/** @brief Perform stream compaction for second child intervals
+*
+* @param s_left shared memory storage for left interval limits
+* @param s_right shared memory storage for right interval limits
+* @param s_left_count shared memory storage for number of eigenvalues less than left interval limits
+* @param s_right_count shared memory storage for number of eigenvalues less than right interval limits
+* @param mid midpoint of current interval (left of new interval)
+* @param right upper limit of interval
+* @param mid_count eigenvalues less than mid
+* @param right_count eigenvalues less than right
+* @param s_compaction_list list containing the indices where the data has to be stored
+* @param num_threads_active number of active threads / intervals
+* @param is_active_second mark if the thread has a second non-empty child interval
+*/
+template<class T, class NumericT>
+__device__
+void
+compactIntervals(NumericT *s_left, NumericT *s_right,
+ T *s_left_count, T *s_right_count,
+ NumericT mid, NumericT right,
+ unsigned int mid_count, unsigned int right_count,
+ T *s_compaction_list,
+ unsigned int num_threads_active,
+ unsigned int is_active_second)
+{
+ const unsigned int tid = threadIdx.x;
+
+ // perform compaction / copy data for all threads where the second
+ // child is not dead
+ if ((tid < num_threads_active) && (1 == is_active_second))
+ {
+ unsigned int addr_w = num_threads_active + s_compaction_list[tid];
+ s_left[addr_w] = mid;
+ s_right[addr_w] = right;
+ s_left_count[addr_w] = mid_count;
+ s_right_count[addr_w] = right_count;
+ }
+}
+
+template<class T, class S, class NumericT>
+__device__
+void
+storeIntervalConverged(NumericT *s_left, NumericT *s_right,
+ T *s_left_count, T *s_right_count,
+ NumericT &left, NumericT &mid, NumericT &right,
+ S &left_count, S &mid_count, S &right_count,
+ T *s_compaction_list_exc,
+ unsigned int &compact_second_chunk,
+ const unsigned int num_threads_active,
+ unsigned int &is_active_second)
+{
+ const unsigned int tid = threadIdx.x;
+ const unsigned int multiplicity = right_count - left_count;
+ // check multiplicity of eigenvalue
+ if (1 == multiplicity)
+ {
+
+ // just re-store intervals, simple eigenvalue
+ s_left[tid] = left;
+ s_right[tid] = right;
+ s_left_count[tid] = left_count;
+ s_right_count[tid] = right_count;
+
+
+ // mark that no second child / clear
+ is_active_second = 0;
+ s_compaction_list_exc[tid] = 0;
+ }
+ else
+ {
+
+ // number of eigenvalues after the split less than mid
+ mid_count = left_count + (multiplicity >> 1);
+
+ // store left interval
+ s_left[tid] = left;
+ s_right[tid] = right;
+ s_left_count[tid] = left_count;
+ s_right_count[tid] = mid_count;
+ mid = left;
+
+ // mark that second child interval exists
+ is_active_second = 1;
+ s_compaction_list_exc[tid] = 1;
+ compact_second_chunk = 1;
+ }
+}
+
+/** @brief Subdivide interval if active and not already converged.
+*
+* @param tid id of thread
+* @param s_left shared memory storage for left interval limits
+* @param s_right shared memory storage for right interval limits
+* @param s_left_count shared memory storage for number of eigenvalues less than left interval limits
+* @param s_right_count shared memory storage for number of eigenvalues less than right interval limits
+* @param num_threads_active number of active threads in warp
+* @param left lower limit of interval
+* @param right upper limit of interval
+* @param left_count eigenvalues less than \a left
+* @param right_count eigenvalues less than \a right
+* @param mid midpoint of interval
+* @param all_threads_converged shared memory flag indicating whether all threads have converged
+*/
+template<class T, class NumericT>
+__device__
+void
+subdivideActiveIntervalMulti(const unsigned int tid,
+ NumericT *s_left, NumericT *s_right,
+ T *s_left_count, T *s_right_count,
+ const unsigned int num_threads_active,
+ NumericT &left, NumericT &right,
+ unsigned int &left_count, unsigned int &right_count,
+ NumericT &mid, unsigned int &all_threads_converged)
+{
+ // for all active threads
+ if (tid < num_threads_active)
+ {
+
+ left = s_left[tid];
+ right = s_right[tid];
+ left_count = s_left_count[tid];
+ right_count = s_right_count[tid];
+
+ // check if thread already converged
+ if (left != right)
+ {
+
+ mid = computeMidpoint(left, right);
+ all_threads_converged = 0;
+ }
+ else if ((right_count - left_count) > 1)
+ {
+ // mark as not converged if multiple eigenvalues enclosed
+ // duplicate interval in storeIntervalConverged()
+ all_threads_converged = 0;
+ }
+
+ } // end for all active threads
+}
+
+
+/** @brief Subdivide interval if active and not already converged.
+*
+* @param tid id of thread
+* @param s_left shared memory storage for left interval limits
+* @param s_right shared memory storage for right interval limits
+* @param s_left_count shared memory storage for number of eigenvalues less than left interval limits
+* @param s_right_count shared memory storage for number of eigenvalues less than right interval limits
+* @param num_threads_active number of active threads in warp
+* @param left lower limit of interval
+* @param right upper limit of interval
+* @param left_count eigenvalues less than \a left
+* @param right_count eigenvalues less than \a right
+* @param mid midpoint of interval
+* @param all_threads_converged shared memory flag indicating whether all threads have converged
+*/
+template<class T, class NumericT>
+__device__
+void
+subdivideActiveInterval(const unsigned int tid,
+ NumericT *s_left, NumericT *s_right,
+ T *s_left_count, T *s_right_count,
+ const unsigned int num_threads_active,
+ NumericT &left, NumericT &right,
+ unsigned int &left_count, unsigned int &right_count,
+ NumericT &mid, unsigned int &all_threads_converged)
+{
+ // for all active threads
+ if (tid < num_threads_active)
+ {
+
+ left = s_left[tid];
+ right = s_right[tid];
+ left_count = s_left_count[tid];
+ right_count = s_right_count[tid];
+
+ // check if thread already converged
+ if (left != right)
+ {
+
+ mid = computeMidpoint(left, right);
+ all_threads_converged = 0;
+ }
+ } // end for all active threads
+}
+} // namespace cuda
+} // namespace linalg
+} // namespace viennacl
+
+#endif // #ifndef VIENNACL_LINALG_CUDA_BISECT_BISECT_UTIL_HPP_
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/common.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/common.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/common.hpp
new file mode 100644
index 0000000..3622b89
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/common.hpp
@@ -0,0 +1,250 @@
+#ifndef VIENNACL_LINALG_CUDA_COMMON_HPP_
+#define VIENNACL_LINALG_CUDA_COMMON_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/common.hpp
+ @brief Common routines for CUDA execution
+*/
+
+#include <sstream>
+#include <cuda_runtime.h>
+#include "viennacl/backend/cuda.hpp"
+#include "viennacl/traits/handle.hpp"
+
+#define VIENNACL_CUDA_LAST_ERROR_CHECK(message) detail::cuda_last_error_check (message, __FILE__, __LINE__)
+
+namespace viennacl
+{
+
+////// scalar
+
+/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL scalar. Non-const version. */
+template<typename NumericT>
+NumericT * cuda_arg(scalar<NumericT> & obj)
+{
+ return reinterpret_cast<NumericT *>(viennacl::traits::handle(obj).cuda_handle().get());
+}
+
+/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL scalar. Const version. */
+template<typename NumericT>
+const NumericT * cuda_arg(scalar<NumericT> const & obj)
+{
+ return reinterpret_cast<const NumericT *>(viennacl::traits::handle(obj).cuda_handle().get());
+}
+
+
+////// vector_base
+
+/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL vector (through the base class vector_base) with implicit return type deduction. Non-const version. */
+template<typename NumericT>
+NumericT * cuda_arg(vector_base<NumericT> & obj)
+{
+ return reinterpret_cast<NumericT *>(viennacl::traits::handle(obj).cuda_handle().get());
+}
+
+/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL vector (through the base class vector_base) with implicit return type deduction. Const version. */
+template<typename NumericT>
+const NumericT * cuda_arg(vector_base<NumericT> const & obj)
+{
+ return reinterpret_cast<const NumericT *>(viennacl::traits::handle(obj).cuda_handle().get());
+}
+
+/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL vector (through the base class vector_base). Return type needs to be explicitly provided as first template argument. Non-const version. */
+template<typename ReturnT, typename NumericT>
+ReturnT * cuda_arg(vector_base<NumericT> & obj)
+{
+ return reinterpret_cast<ReturnT *>(viennacl::traits::handle(obj).cuda_handle().get());
+}
+
+/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL vector (through the base class vector_base). Return type needs to be explicitly provided as first template argument. Const version. */
+template<typename ReturnT, typename NumericT>
+const ReturnT * cuda_arg(vector_base<NumericT> const & obj)
+{
+ return reinterpret_cast<const ReturnT *>(viennacl::traits::handle(obj).cuda_handle().get());
+}
+
+
+////// matrix_base
+
+/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL matrix (through the base class matrix_base). Non-const version. */
+template<typename NumericT>
+NumericT * cuda_arg(matrix_base<NumericT> & obj)
+{
+ return reinterpret_cast<NumericT *>(viennacl::traits::handle(obj).cuda_handle().get());
+}
+
+/** @brief Convenience helper function for extracting the CUDA handle from a ViennaCL matrix (through the base class matrix_base). Const version. */
+template<typename NumericT>
+const NumericT * cuda_arg(matrix_base<NumericT> const & obj)
+{
+ return reinterpret_cast<const NumericT *>(viennacl::traits::handle(obj).cuda_handle().get());
+}
+
+
+
+////// mem_handle
+
+
+/** @brief Convenience helper function for extracting the CUDA handle from a generic memory handle. Non-const version. */
+template<typename ReturnT>
+ReturnT * cuda_arg(viennacl::backend::mem_handle & h)
+{
+ return reinterpret_cast<ReturnT *>(h.cuda_handle().get());
+}
+
+/** @brief Convenience helper function for extracting the CUDA handle from a generic memory handle. Const-version. */
+template<typename ReturnT>
+ReturnT const * cuda_arg(viennacl::backend::mem_handle const & h)
+{
+ return reinterpret_cast<const ReturnT *>(h.cuda_handle().get());
+}
+
+/** \cond **/
+template<typename ReturnT>
+ReturnT * cuda_arg(viennacl::backend::mem_handle::cuda_handle_type & h)
+{
+ return reinterpret_cast<ReturnT *>(h.get());
+}
+
+template<typename ReturnT>
+ReturnT const * cuda_arg(viennacl::backend::mem_handle::cuda_handle_type const & h)
+{
+ return reinterpret_cast<const ReturnT *>(h.get());
+}
+
+inline unsigned int cuda_arg(unsigned int val) { return val; }
+
+template<typename NumericT> char cuda_arg(char val) { return val; }
+template<typename NumericT> unsigned char cuda_arg(unsigned char val) { return val; }
+
+template<typename NumericT> short cuda_arg(short val) { return val; }
+template<typename NumericT> unsigned short cuda_arg(unsigned short val) { return val; }
+
+template<typename NumericT> int cuda_arg(int val) { return val; }
+template<typename NumericT> unsigned int cuda_arg(unsigned int val) { return val; }
+
+template<typename NumericT> long cuda_arg(long val) { return val; }
+template<typename NumericT> unsigned long cuda_arg(unsigned long val) { return val; }
+
+template<typename NumericT> float cuda_arg(float val) { return val; }
+template<typename NumericT> double cuda_arg(double val) { return val; }
+
+/** \endcond */
+
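A brief usage sketch of these helpers: a ViennaCL object is turned into a raw device pointer at the kernel call site, exactly as the solver and FFT kernels below do. Here my_kernel is a hypothetical kernel used only for illustration:

// Hypothetical kernel and launch, showing cuda_arg() at a call site.
__global__ void my_kernel(float * data, unsigned int size);   // assumed to exist elsewhere

void launch_example(viennacl::vector<float> & v)
{
  my_kernel<<<128, 128>>>(viennacl::cuda_arg(v),
                          static_cast<unsigned int>(viennacl::traits::size(v)));
  VIENNACL_CUDA_LAST_ERROR_CHECK("my_kernel");
}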
+
+namespace linalg
+{
+namespace cuda
+{
+
+
+namespace detail
+{
+
+inline unsigned int make_options(vcl_size_t length, bool reciprocal, bool flip_sign)
+{
+ return static_cast<unsigned int>( ((length > 1) ? (static_cast<unsigned int>(length) << 2) : 0) + (reciprocal ? 2 : 0) + (flip_sign ? 1 : 0) );
+}
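make_options packs a vector length and two flags into one word; a sketch of the corresponding decoding, with the bit layout read directly off the expression above (illustration only):

// Unpacking the options word produced by make_options().
unsigned int options    = detail::make_options(16, true, false);
bool         flip_sign  = (options & 1u) != 0;   // bit 0
bool         reciprocal = (options & 2u) != 0;   // bit 1
unsigned int length     = options >> 2;          // remaining bits (0 if length <= 1)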
+
+inline void cuda_last_error_check(const char * message, const char * file, const int line )
+{
+ cudaError_t error_code = cudaGetLastError();
+
+ if (cudaSuccess != error_code)
+ {
+ std::stringstream ss;
+ ss << file << "(" << line << "): getLastCudaError() CUDA error " << error_code << ": " << cudaGetErrorString( error_code ) << " @ " << message << std::endl;
+ throw viennacl::backend::cuda::cuda_exception(ss.str(), error_code);
+ }
+}
+
+template<typename NumericT>
+struct type_to_type2;
+
+template<>
+struct type_to_type2<float> { typedef float2 type; };
+
+template<>
+struct type_to_type2<double> { typedef double2 type; };
+
+
+template<typename NumericT, typename OtherT>
+typename viennacl::backend::mem_handle::cuda_handle_type & arg_reference(viennacl::scalar<NumericT> & s, OtherT) { return s.handle().cuda_handle(); }
+
+template<typename NumericT, typename OtherT>
+typename viennacl::backend::mem_handle::cuda_handle_type const & arg_reference(viennacl::scalar<NumericT> const & s, OtherT) { return s.handle().cuda_handle(); }
+
+// all other cases where T is not a ViennaCL scalar
+template<typename ArgT>
+typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
+ char const &>::type
+arg_reference(ArgT, char const & val) { return val; }
+
+template<typename ArgT>
+typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
+ unsigned char const &>::type
+arg_reference(ArgT, unsigned char const & val) { return val; }
+
+template<typename ArgT>
+typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
+ short const &>::type
+arg_reference(ArgT, short const & val) { return val; }
+
+template<typename ArgT>
+typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
+ unsigned short const &>::type
+arg_reference(ArgT, unsigned short const & val) { return val; }
+
+template<typename ArgT>
+typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
+ int const &>::type
+arg_reference(ArgT, int const & val) { return val; }
+
+template<typename ArgT>
+typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
+ unsigned int const &>::type
+arg_reference(ArgT, unsigned int const & val) { return val; }
+
+template<typename ArgT>
+typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
+ long const &>::type
+arg_reference(ArgT, long const & val) { return val; }
+
+template<typename ArgT>
+typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
+ unsigned long const &>::type
+arg_reference(ArgT, unsigned long const & val) { return val; }
+
+template<typename ArgT>
+typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
+ float const &>::type
+arg_reference(ArgT, float const & val) { return val; }
+
+template<typename ArgT>
+typename viennacl::enable_if< viennacl::is_cpu_scalar<ArgT>::value,
+ double const &>::type
+arg_reference(ArgT, double const & val) { return val; }
+
+} //namespace detail
+} //namespace cuda
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/direct_solve.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/direct_solve.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/direct_solve.hpp
new file mode 100644
index 0000000..ae70f9a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/direct_solve.hpp
@@ -0,0 +1,412 @@
+#ifndef VIENNACL_LINALG_CUDA_DIRECT_SOLVE_HPP
+#define VIENNACL_LINALG_CUDA_DIRECT_SOLVE_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/direct_solve.hpp
+ @brief Implementations of dense direct solvers using CUDA are found here.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+
+
+#include "viennacl/linalg/cuda/common.hpp"
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
+template<typename NumericT>
+__global__ void matrix_matrix_upper_solve_kernel(
+ const NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+ bool row_major_A,
+
+ NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_size1, unsigned int B_size2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2,
+ bool row_major_B,
+
+ bool unit_diagonal)
+{
+ NumericT temp;
+ NumericT entry_A;
+
+ for (unsigned int row_cnt = 0; row_cnt < A_size1; ++row_cnt)
+ {
+ unsigned int row = A_size1 - 1 - row_cnt;
+
+ if (!unit_diagonal)
+ {
+ __syncthreads();
+
+ if (threadIdx.x == 0)
+ {
+ if (row_major_B)
+ B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
+ : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
+ else //if (!row_major_B)
+ B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
+ : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
+ }
+ }
+
+ __syncthreads();
+
+ if (row_major_B)
+ temp = B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)];
+ else //if (!row_major_B)
+ temp = B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1];
+
+ //eliminate column of op(A) with index 'row' in parallel
+ for (unsigned int elim = threadIdx.x; elim < row; elim += blockDim.x)
+ {
+ if (row_major_A)
+ entry_A = A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
+ else //if (!row_major_A)
+ entry_A = A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];
+
+ if (row_major_B)
+ B[(elim * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] -= temp * entry_A;
+ else //if (!row_major_B)
+ B[(elim * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;
+
+ }
+ }
+}
+
+
+
+template<typename NumericT>
+__global__ void matrix_matrix_lower_solve_kernel(
+ const NumericT * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+ bool row_major_A,
+
+ NumericT * B,
+ unsigned int B_start1, unsigned int B_start2,
+ unsigned int B_inc1, unsigned int B_inc2,
+ unsigned int B_size1, unsigned int B_size2,
+ unsigned int B_internal_size1, unsigned int B_internal_size2,
+ bool row_major_B,
+
+ bool unit_diagonal)
+{
+ NumericT temp;
+ NumericT entry_A;
+
+ for (unsigned int row = 0; row < A_size1; ++row)
+ {
+
+ if (!unit_diagonal)
+ {
+ __syncthreads();
+
+ if (threadIdx.x == 0)
+ {
+ if (row_major_B)
+ B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
+ : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
+ else //if (!row_major_B)
+ B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
+ : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
+ }
+ }
+
+ __syncthreads();
+
+ if (row_major_B)
+ temp = B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)];
+ else //if (!row_major_B)
+ temp = B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1];
+
+ //eliminate column of op(A) with index 'row' in parallel
+ for (unsigned int elim = row + threadIdx.x + 1; elim < A_size1; elim += blockDim.x)
+ {
+ if (row_major_A)
+ entry_A = A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
+ else //if (!row_major_A)
+ entry_A = A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];
+
+ if (row_major_B)
+ B[(elim * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] -= temp * entry_A;
+ else //if (!row_major_B)
+ B[(elim * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;
+
+ }
+ }
+}
+
+
+
+
+
+
+namespace detail
+{
+ template<typename TagT>
+ bool is_unit_solve(TagT const & tag) { return false; }
+
+ inline bool is_unit_solve(viennacl::linalg::unit_lower_tag) { return true; }
+ inline bool is_unit_solve(viennacl::linalg::unit_upper_tag) { return true; }
+
+ template<typename TagT>
+ bool is_upper_solve(TagT const & tag) { return false; }
+
+ inline bool is_upper_solve(viennacl::linalg::upper_tag) { return true; }
+ inline bool is_upper_solve(viennacl::linalg::unit_upper_tag) { return true; }
+
+ template<typename Matrix1T, typename Matrix2T, typename SolverTagT>
+ void inplace_solve_impl(Matrix1T const & A,
+ Matrix2T & B,
+ SolverTagT const & tag)
+ {
+ typedef typename viennacl::result_of::cpu_value_type<Matrix1T>::type value_type;
+
+ dim3 threads(128);
+ dim3 grid(B.size2());
+
+ if (is_upper_solve(tag))
+ {
+ matrix_matrix_upper_solve_kernel<<<grid,threads>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+ bool(A.row_major()),
+
+ viennacl::cuda_arg(B),
+ static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
+ static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
+ static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+ bool(B.row_major()),
+
+ is_unit_solve(tag)
+ );
+ }
+ else
+ {
+ matrix_matrix_lower_solve_kernel<<<grid,threads>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+ bool(A.row_major()),
+
+ viennacl::cuda_arg(B),
+ static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)),
+ static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)),
+ static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
+ bool(B.row_major()),
+
+ is_unit_solve(tag)
+ );
+ }
+
+ }
+}
+
+
+//
+// Note: By convention, all size checks are performed in the calling frontend. No need to double-check here.
+//
+
+////////////////// triangular solver //////////////////////////////////////
+/** @brief Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B (MATLAB notation).
+*
+* @param A The system matrix
+* @param B The matrix of row vectors, where the solution is directly written to
+* @param tag Solver tag for identifying the respective triangular solver
+*/
+template<typename NumericT, typename SolverTagT>
+void inplace_solve(matrix_base<NumericT> const & A,
+ matrix_base<NumericT> & B,
+ SolverTagT tag)
+{
+ detail::inplace_solve_impl(A, B, tag);
+}
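A hedged usage sketch (matrix sizes and fill are assumed for illustration); this CUDA backend is normally reached through the generic viennacl::linalg::inplace_solve frontend:

// Solve A * X = B in place for an upper-triangular A (B is overwritten with X).
viennacl::matrix<float> A(64, 64);
viennacl::matrix<float> B(64, 8);
// ... fill A (upper triangular, non-zero diagonal) and B ...
viennacl::linalg::inplace_solve(A, B, viennacl::linalg::upper_tag());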
+
+
+//
+// Solve on vector
+//
+
+template<typename NumericT>
+__global__ void triangular_substitute_inplace_row_kernel(
+ NumericT const * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+ NumericT * v,
+ unsigned int v_start,
+ unsigned int v_inc,
+ unsigned int v_size,
+
+ unsigned int options)
+{
+ NumericT temp;
+ unsigned int unit_diagonal_flag = (options & (1 << 0));
+
+ unsigned int is_lower_solve = (options & (1 << 2));
+ unsigned int row;
+ for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed) //Note: A required to be square
+ {
+ row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1);
+ if (!unit_diagonal_flag)
+ {
+ __syncthreads();
+ if (threadIdx.x == 0)
+ v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
+ }
+
+ __syncthreads();
+
+ temp = v[row * v_inc + v_start];
+
+ for (int elim = (is_lower_solve ? (row + threadIdx.x + 1) : threadIdx.x);
+ elim < (is_lower_solve ? A_size1 : row);
+ elim += blockDim.x)
+ v[elim * v_inc + v_start] -= temp * A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
+ }
+}
+
+
+template<typename NumericT>
+__global__ void triangular_substitute_inplace_col_kernel(
+ NumericT const * A,
+ unsigned int A_start1, unsigned int A_start2,
+ unsigned int A_inc1, unsigned int A_inc2,
+ unsigned int A_size1, unsigned int A_size2,
+ unsigned int A_internal_size1, unsigned int A_internal_size2,
+ NumericT * v,
+ unsigned int v_start,
+ unsigned int v_inc,
+ unsigned int v_size,
+ unsigned int options)
+{
+ NumericT temp;
+ unsigned int unit_diagonal_flag = (options & (1 << 0));
+
+ unsigned int is_lower_solve = (options & (1 << 2));
+ unsigned int row;
+ for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed) //Note: A required to be square
+ {
+ row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1);
+ if (!unit_diagonal_flag)
+ {
+ __syncthreads();
+ if (threadIdx.x == 0)
+ v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];
+ }
+
+ __syncthreads();
+
+ temp = v[row * v_inc + v_start];
+
+ for (int elim = (is_lower_solve ? (row + threadIdx.x + 1) : threadIdx.x);
+ elim < (is_lower_solve ? A_size1 : row);
+ elim += blockDim.x)
+ v[elim * v_inc + v_start] -= temp * A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];
+ }
+}
+
+
+namespace detail
+{
+ inline unsigned int get_option_for_solver_tag(viennacl::linalg::upper_tag) { return 0; }
+ inline unsigned int get_option_for_solver_tag(viennacl::linalg::unit_upper_tag) { return (1 << 0); }
+ inline unsigned int get_option_for_solver_tag(viennacl::linalg::lower_tag) { return (1 << 2); }
+ inline unsigned int get_option_for_solver_tag(viennacl::linalg::unit_lower_tag) { return (1 << 2) | (1 << 0); }
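The options word handed to the substitution kernels above packs the solver variant into individual bits; a rough sketch of the decoding the kernels perform, mirroring their (options & (1 << 0)) and (options & (1 << 2)) checks (illustration only):

// e.g. unit_lower_tag -> (1 << 2) | (1 << 0)
unsigned int options       = (1u << 2) | (1u << 0);
bool         unit_diagonal = (options & (1u << 0)) != 0;   // bit 0: unit diagonal
bool         lower_solve   = (options & (1u << 2)) != 0;   // bit 2: lower triangular solve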
+
+ template<typename MatrixT, typename VectorT>
+ void inplace_solve_vector_impl(MatrixT const & mat,
+ VectorT & vec,
+ unsigned int options)
+ {
+ typedef typename viennacl::result_of::cpu_value_type<MatrixT>::type value_type;
+
+ if (mat.row_major())
+ {
+ triangular_substitute_inplace_row_kernel<<<1, 128>>>(viennacl::cuda_arg(mat),
+ static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
+ static_cast<unsigned int>(viennacl::traits::size1(mat)), static_cast<unsigned int>(viennacl::traits::size2(mat)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
+ viennacl::cuda_arg(vec),
+ static_cast<unsigned int>(viennacl::traits::start(vec)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec)),
+ static_cast<unsigned int>(viennacl::traits::size(vec)),
+ options
+ );
+ }
+ else
+ {
+ triangular_substitute_inplace_col_kernel<<<1, 128>>>(viennacl::cuda_arg(mat),
+ static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
+ static_cast<unsigned int>(viennacl::traits::size1(mat)), static_cast<unsigned int>(viennacl::traits::size2(mat)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
+ viennacl::cuda_arg(vec),
+ static_cast<unsigned int>(viennacl::traits::start(vec)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec)),
+ static_cast<unsigned int>(viennacl::traits::size(vec)),
+ options
+ );
+ }
+ }
+
+}
+
+/** @brief Direct inplace solver for dense triangular systems (non-transposed version)
+*
+* @param mat The system matrix proxy
+* @param vec The load vector, where the solution is directly written to
+*/
+template<typename NumericT, typename SolverTagT>
+void inplace_solve(matrix_base<NumericT> const & mat,
+ vector_base<NumericT> & vec,
+ SolverTagT)
+{
+ unsigned int options = detail::get_option_for_solver_tag(SolverTagT());
+
+ detail::inplace_solve_vector_impl(mat, vec, options);
+}
+
+
+} // namespace cuda
+} // namespace linalg
+} // namespace viennacl
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/fft_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/fft_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/fft_operations.hpp
new file mode 100644
index 0000000..198ac31
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/fft_operations.hpp
@@ -0,0 +1,858 @@
+#ifndef VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/fft_operations.hpp
+ @brief Implementations of the fast Fourier transform (FFT) using CUDA
+*/
+#include <cmath>
+#include <viennacl/matrix.hpp>
+#include <viennacl/vector.hpp>
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/cuda/common.hpp"
+#include "viennacl/linalg/host_based/vector_operations.hpp"
+#include "viennacl/linalg/host_based/fft_operations.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+namespace detail
+{
+ namespace fft
+ {
+ const vcl_size_t MAX_LOCAL_POINTS_NUM = 512;
+
+ inline vcl_size_t num_bits(vcl_size_t size)
+ {
+ vcl_size_t bits_datasize = 0;
+ vcl_size_t ds = 1;
+
+ while (ds < size)
+ {
+ ds = ds << 1;
+ bits_datasize++;
+ }
+
+ return bits_datasize;
+ }
+
+ inline vcl_size_t next_power_2(vcl_size_t n)
+ {
+ n = n - 1;
+
+ vcl_size_t power = 1;
+
+ while (power < sizeof(vcl_size_t) * 8)
+ {
+ n = n | (n >> power);
+ power *= 2;
+ }
+
+ return n + 1;
+ }
+
+ } //namespace fft
+} //namespace detail
+
+// addition
+inline __host__ __device__ float2 operator+(float2 a, float2 b)
+{
+ return make_float2(a.x + b.x, a.y + b.y);
+}
+
+// subtract
+inline __host__ __device__ float2 operator-(float2 a, float2 b)
+{
+ return make_float2(a.x - b.x, a.y - b.y);
+}
+// division
+template<typename SCALARTYPE>
+inline __device__ float2 operator/(float2 a,SCALARTYPE b)
+{
+ return make_float2(a.x/b, a.y/b);
+}
+
+//multiplication
+inline __device__ float2 operator*(float2 in1, float2 in2)
+{
+ return make_float2(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x);
+}
+
+// addition
+inline __host__ __device__ double2 operator+(double2 a, double2 b)
+{
+ return make_double2(a.x + b.x, a.y + b.y);
+}
+
+// subtraction
+inline __host__ __device__ double2 operator-(double2 a, double2 b)
+{
+ return make_double2(a.x - b.x, a.y - b.y);
+}
+
+// division
+template<typename SCALARTYPE>
+inline __host__ __device__ double2 operator/(double2 a,SCALARTYPE b)
+{
+ return make_double2(a.x/b, a.y/b);
+}
+
+//multiplication
+inline __host__ __device__ double2 operator*(double2 in1, double2 in2)
+{
+ return make_double2(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x);
+}
+
+inline __device__ unsigned int get_reorder_num(unsigned int v, unsigned int bit_size)
+{
+ v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
+ v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
+ v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
+ v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
+ v = (v >> 16) | (v << 16);
+ v = v >> (32 - bit_size);
+ return v;
+}
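get_reorder_num reverses the lowest bit_size bits of an index; a slow but equivalent host-side sketch (hypothetical helper, for illustration only):

// reverse_bits(1, 3) == 4 (001 -> 100), reverse_bits(3, 3) == 6 (011 -> 110)
unsigned int reverse_bits(unsigned int v, unsigned int bit_size)
{
  unsigned int r = 0;
  for (unsigned int i = 0; i < bit_size; ++i)
  {
    r = (r << 1) | (v & 1u);
    v >>= 1;
  }
  return r;
}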
+
+template<typename Numeric2T, typename NumericT>
+__global__ void fft_direct(
+ const Numeric2T * input,
+ Numeric2T * output,
+ unsigned int size,
+ unsigned int stride,
+ unsigned int batch_num,
+ NumericT sign,
+ bool is_row_major)
+{
+
+ const NumericT NUM_PI(3.14159265358979323846);
+
+ for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
+ {
+ for (unsigned int k = blockIdx.x * blockDim.x + threadIdx.x; k < size; k += gridDim.x * blockDim.x)
+ {
+ Numeric2T f;
+ f.x = 0;
+ f.y = 0;
+
+ for (unsigned int n = 0; n < size; n++)
+ {
+ Numeric2T in;
+ if (!is_row_major)
+ in = input[batch_id * stride + n]; //input index here
+ else
+ in = input[n * stride + batch_id];//input index here
+
+ NumericT sn,cs;
+ NumericT arg = sign * 2 * NUM_PI * k / size * n;
+ sn = sin(arg);
+ cs = cos(arg);
+
+ Numeric2T ex;
+ ex.x = cs;
+ ex.y = sn;
+ Numeric2T tmp;
+ tmp.x = in.x * ex.x - in.y * ex.y;
+ tmp.y = in.x * ex.y + in.y * ex.x;
+ f = f + tmp;
+ }
+
+ if (!is_row_major)
+ output[batch_id * stride + k] = f; // output index here
+ else
+ output[k * stride + batch_id] = f;// output index here
+ }
+ }
+}
+
+/**
+ * @brief Direct 1D algorithm for computing Fourier transformation.
+ *
+ * Works for data of arbitrary size.
+ * The serial implementation has O(n^2) complexity.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void direct(viennacl::vector<NumericT, AlignmentV> const & in,
+ viennacl::vector<NumericT, AlignmentV> & out,
+ vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num,
+ NumericT sign = NumericT(-1),
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+ typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
+
+ fft_direct<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(in)),
+ reinterpret_cast< numeric2_type *>(viennacl::cuda_arg(out)),
+ static_cast<unsigned int>(size),
+ static_cast<unsigned int>(stride),
+ static_cast<unsigned int>(batch_num),
+ sign,
+ bool(data_order != viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("fft_direct");
+}
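A minimal host-side sketch of the O(n^2) transform that fft_direct parallelizes, using std::complex instead of the float2/double2 pairs above (illustration only, not part of the patch):

#include <complex>
#include <vector>

// X[k] = sum_n x[n] * exp(sign * 2*pi*i * k * n / N)
std::vector<std::complex<double> > dft_reference(std::vector<std::complex<double> > const & x,
                                                 double sign = -1.0)
{
  const double pi = 3.14159265358979323846;
  const std::size_t N = x.size();
  std::vector<std::complex<double> > X(N);
  for (std::size_t k = 0; k < N; ++k)
    for (std::size_t n = 0; n < N; ++n)
      X[k] += x[n] * std::exp(std::complex<double>(0.0, sign * 2.0 * pi * double(k) * double(n) / double(N)));
  return X;
}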
+
+/**
+ * @brief Direct 2D algorithm for computing Fourier transformation.
+ *
+ * Works for data of arbitrary size.
+ * The serial implementation has O(n^2) complexity.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void direct(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & in,
+ viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & out,
+ vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num,
+ NumericT sign = NumericT(-1),
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+ typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
+
+ fft_direct<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(in)),
+ reinterpret_cast< numeric2_type *>(viennacl::cuda_arg(out)),
+ static_cast<unsigned int>(size),
+ static_cast<unsigned int>(stride),
+ static_cast<unsigned int>(batch_num),
+ sign,
+ bool(data_order != viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("fft_direct");
+}
+
+template<typename NumericT>
+__global__ void fft_reorder(NumericT * input,
+ unsigned int bit_size,
+ unsigned int size,
+ unsigned int stride,
+ unsigned int batch_num,
+ bool is_row_major)
+{
+
+ unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
+ unsigned int glb_sz = gridDim.x * blockDim.x;
+
+ for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
+ {
+ for (unsigned int i = glb_id; i < size; i += glb_sz)
+ {
+ unsigned int v = get_reorder_num(i, bit_size);
+
+ if (i < v)
+ {
+ if (!is_row_major)
+ {
+ NumericT tmp = input[batch_id * stride + i]; // index
+ input[batch_id * stride + i] = input[batch_id * stride + v];//index
+ input[batch_id * stride + v] = tmp;//index
+ }
+ else
+ {
+ NumericT tmp = input[i * stride + batch_id];
+ input[i * stride + batch_id] = input[v * stride + batch_id];
+ input[v * stride + batch_id] = tmp;
+ }
+ }
+ }
+ }
+}
+
+/***
+ * This function reorders the input data: indices are sorted in bit-reversal order.
+ * Such reordering should be done before an in-place FFT.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void reorder(viennacl::vector<NumericT, AlignmentV> & in,
+ vcl_size_t size, vcl_size_t stride, vcl_size_t bits_datasize, vcl_size_t batch_num,
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+ typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
+
+ fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+ static_cast<unsigned int>(bits_datasize),
+ static_cast<unsigned int>(size),
+ static_cast<unsigned int>(stride),
+ static_cast<unsigned int>(batch_num),
+ static_cast<bool>(data_order));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder");
+}
+
+template<typename Numeric2T, typename NumericT>
+__global__ void fft_radix2_local(Numeric2T * input,
+ unsigned int bit_size,
+ unsigned int size,
+ unsigned int stride,
+ unsigned int batch_num,
+ NumericT sign,
+ bool is_row_major)
+{
+ __shared__ Numeric2T lcl_input[1024];
+ unsigned int grp_id = blockIdx.x;
+ unsigned int grp_num = gridDim.x;
+
+ unsigned int lcl_sz = blockDim.x;
+ unsigned int lcl_id = threadIdx.x;
+ const NumericT NUM_PI(3.14159265358979323846);
+
+ for (unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num)
+ {
+ for (unsigned int p = lcl_id; p < size; p += lcl_sz)
+ {
+ unsigned int v = get_reorder_num(p, bit_size);
+ if (!is_row_major)
+ lcl_input[v] = input[batch_id * stride + p];
+ else
+ lcl_input[v] = input[p * stride + batch_id];
+ }
+
+ __syncthreads();
+
+ //performs Cooley-Tukey FFT on the local array
+ for (unsigned int s = 0; s < bit_size; s++)
+ {
+ unsigned int ss = 1 << s;
+ NumericT cs, sn;
+ for (unsigned int tid = lcl_id; tid < size; tid += lcl_sz)
+ {
+ unsigned int group = (tid & (ss - 1));
+ unsigned int pos = ((tid >> s) << (s + 1)) + group;
+
+ Numeric2T in1 = lcl_input[pos];
+ Numeric2T in2 = lcl_input[pos + ss];
+
+ NumericT arg = group * sign * NUM_PI / ss;
+
+ sn = sin(arg);
+ cs = cos(arg);
+ Numeric2T ex;
+ ex.x = cs;
+ ex.y = sn;
+
+ Numeric2T tmp;
+ tmp.x = in2.x * ex.x - in2.y * ex.y;
+ tmp.y = in2.x * ex.y + in2.y * ex.x;
+
+ lcl_input[pos + ss] = in1 - tmp;
+ lcl_input[pos] = in1 + tmp;
+ }
+ __syncthreads();
+ }
+
+ //copy local array back to global memory
+ for (unsigned int p = lcl_id; p < size; p += lcl_sz)
+ {
+ if (!is_row_major)
+ input[batch_id * stride + p] = lcl_input[p]; //index
+ else
+ input[p * stride + batch_id] = lcl_input[p];
+ }
+
+ }
+}
+
+template<typename Numeric2T, typename NumericT>
+__global__ void fft_radix2(Numeric2T * input,
+ unsigned int s,
+ unsigned int bit_size,
+ unsigned int size,
+ unsigned int stride,
+ unsigned int batch_num,
+ NumericT sign,
+ bool is_row_major)
+{
+
+ unsigned int ss = 1 << s;
+ unsigned int half_size = size >> 1;
+
+ NumericT cs, sn;
+ const NumericT NUM_PI(3.14159265358979323846);
+
+ unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
+ unsigned int glb_sz = gridDim.x * blockDim.x;
+
+ for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
+ {
+ for (unsigned int tid = glb_id; tid < half_size; tid += glb_sz)
+ {
+ unsigned int group = (tid & (ss - 1));
+ unsigned int pos = ((tid >> s) << (s + 1)) + group;
+ Numeric2T in1;
+ Numeric2T in2;
+ unsigned int offset;
+ if (!is_row_major)
+ {
+ offset = batch_id * stride + pos;
+ in1 = input[offset]; //index
+ in2 = input[offset + ss];//index
+ }
+ else
+ {
+ offset = pos * stride + batch_id;
+ in1 = input[offset]; //index
+ in2 = input[offset + ss * stride];//index
+ }
+
+ NumericT arg = group * sign * NUM_PI / ss;
+
+ sn = sin(arg);
+ cs = cos(arg);
+
+ Numeric2T ex;
+ ex.x = cs;
+ ex.y = sn;
+
+ Numeric2T tmp;
+ tmp.x = in2.x * ex.x - in2.y * ex.y;
+ tmp.y = in2.x * ex.y + in2.y * ex.x;
+
+ if (!is_row_major)
+ input[offset + ss] = in1 - tmp; //index
+ else
+ input[offset + ss * stride] = in1 - tmp; //index
+ input[offset] = in1 + tmp; //index
+ }
+ }
+}
+
+/**
+ * @brief Radix-2 1D algorithm for computing the Fourier transform.
+ *
+ * Works only on power-of-two data sizes.
+ * The serial implementation has O(n log n) complexity.
+ * This is the Cooley-Tukey algorithm.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void radix2(viennacl::vector<NumericT, AlignmentV> & in,
+ vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+ typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
+
+ unsigned int bit_size = viennacl::linalg::cuda::detail::fft::num_bits(size);
+
+ if (size <= viennacl::linalg::cuda::detail::fft::MAX_LOCAL_POINTS_NUM)
+ {
+ fft_radix2_local<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+ static_cast<unsigned int>(bit_size),
+ static_cast<unsigned int>(size),
+ static_cast<unsigned int>(stride),
+ static_cast<unsigned int>(batch_num),
+ static_cast<NumericT>(sign),
+ static_cast<bool>(data_order));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2_local");
+ }
+ else
+ {
+ fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+ static_cast<unsigned int>(bit_size),
+ static_cast<unsigned int>(size),
+ static_cast<unsigned int>(stride),
+ static_cast<unsigned int>(batch_num),
+ static_cast<bool>(data_order));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder");
+
+ for (vcl_size_t step = 0; step < bit_size; step++)
+ {
+ fft_radix2<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+ static_cast<unsigned int>(step),
+ static_cast<unsigned int>(bit_size),
+ static_cast<unsigned int>(size),
+ static_cast<unsigned int>(stride),
+ static_cast<unsigned int>(batch_num),
+ sign,
+ static_cast<bool>(data_order));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2");
+ }
+ }
+}
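
A hedged usage sketch (editorial): these functions are internal to the CUDA backend and are normally reached through ViennaCL's public FFT interface, but a direct call for a single batch of 8 interleaved complex floats would look roughly like this (sizes are placeholders, VIENNACL_WITH_CUDA is assumed):

  viennacl::vector<float> v(2 * 8);                 // 8 complex points stored as (re, im) pairs
  // ... fill v with interleaved real/imaginary parts ...
  viennacl::linalg::cuda::radix2(v, /*size*/ 8, /*stride*/ 1, /*batch_num*/ 1, /*sign*/ -1.0f);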
+
+/**
+ * @brief Radix-2 2D algorithm for computing the Fourier transform.
+ *
+ * Works only on power-of-two data sizes.
+ * The serial implementation has O(n log n) complexity.
+ * This is the Cooley-Tukey algorithm.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void radix2(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& in,
+ vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+ typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
+
+ unsigned int bit_size = viennacl::linalg::cuda::detail::fft::num_bits(size);
+
+ if (size <= viennacl::linalg::cuda::detail::fft::MAX_LOCAL_POINTS_NUM)
+ {
+ fft_radix2_local<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+ static_cast<unsigned int>(bit_size),
+ static_cast<unsigned int>(size),
+ static_cast<unsigned int>(stride),
+ static_cast<unsigned int>(batch_num),
+ sign,
+ static_cast<bool>(data_order));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2_local");
+ }
+ else
+ {
+ fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+ static_cast<unsigned int>(bit_size),
+ static_cast<unsigned int>(size),
+ static_cast<unsigned int>(stride),
+ static_cast<unsigned int>(batch_num),
+ static_cast<bool>(data_order));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder");
+ for (vcl_size_t step = 0; step < bit_size; step++)
+ {
+ fft_radix2<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+ static_cast<unsigned int>(step),
+ static_cast<unsigned int>(bit_size),
+ static_cast<unsigned int>(size),
+ static_cast<unsigned int>(stride),
+ static_cast<unsigned int>(batch_num),
+ sign,
+ static_cast<bool>(data_order));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2");
+ }
+ }
+}
+
+template<typename Numeric2T, typename NumericT>
+__global__ void bluestein_post(Numeric2T * Z, Numeric2T * out, unsigned int size, NumericT sign)
+{
+ unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
+  unsigned int glb_sz = gridDim.x * blockDim.x;
+
+ unsigned int double_size = size << 1;
+ NumericT sn_a, cs_a;
+ const NumericT NUM_PI(3.14159265358979323846);
+
+ for (unsigned int i = glb_id; i < size; i += glb_sz)
+ {
+ unsigned int rm = i * i % (double_size);
+ NumericT angle = (NumericT)rm / size * (-NUM_PI);
+
+ sn_a = sin(angle);
+    cs_a = cos(angle);
+
+ Numeric2T b_i;
+ b_i.x = cs_a;
+ b_i.y = sn_a;
+ out[i].x = Z[i].x * b_i.x - Z[i].y * b_i.y;
+ out[i].y = Z[i].x * b_i.y + Z[i].y * b_i.x;
+ }
+}
+
+template<typename Numeric2T, typename NumericT>
+__global__ void bluestein_pre(Numeric2T * input, Numeric2T * A, Numeric2T * B,
+ unsigned int size, unsigned int ext_size, NumericT sign)
+{
+ unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
+ unsigned int glb_sz = gridDim.x * blockDim.x;
+
+ unsigned int double_size = size << 1;
+
+ NumericT sn_a, cs_a;
+ const NumericT NUM_PI(3.14159265358979323846);
+
+ for (unsigned int i = glb_id; i < size; i += glb_sz)
+ {
+ unsigned int rm = i * i % (double_size);
+ NumericT angle = (NumericT)rm / size * NUM_PI;
+
+ sn_a = sin(-angle);
+    cs_a = cos(-angle);
+
+    Numeric2T a_i;
+    a_i.x = cs_a;
+    a_i.y = sn_a;
+
+    Numeric2T b_i;
+    b_i.x = cs_a;
+    b_i.y = -sn_a;
+
+ A[i].x = input[i].x * a_i.x - input[i].y * a_i.y;
+ A[i].y = input[i].x * a_i.y + input[i].y * a_i.x;
+ B[i] = b_i;
+
+ // very bad instruction, to be fixed
+ if (i)
+ B[ext_size - i] = b_i;
+ }
+}
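
For reference, a scalar restatement (editorial) of the chirp factors computed by bluestein_pre: a_k = exp(-i*pi*(k*k mod 2n)/n) and b_k = conj(a_k), with B additionally mirrored so that B[ext_size - k] = B[k] for k > 0.

  #include <cmath>
  #include <complex>

  // chirp a_k as computed above, written with std::complex for clarity
  std::complex<double> chirp_a(unsigned int k, unsigned int n)
  {
    const double pi = 3.14159265358979323846;
    double angle = pi * double((1ull * k * k) % (2ull * n)) / double(n);
    return std::complex<double>(std::cos(-angle), std::sin(-angle));
  }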
+
+template<typename NumericT>
+__global__ void zero2(NumericT * input1, NumericT * input2, unsigned int size)
+{
+ for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+ {
+ input1[i].x = 0;
+ input1[i].y = 0;
+
+ input2[i].x = 0;
+ input2[i].y = 0;
+ }
+}
+
+/**
+ * @brief Bluestein's algorithm for computing the Fourier transform.
+ *
+ * Currently works only for input sizes smaller than 2^16.
+ * Uses a considerable amount of additional memory, but should be fast for any data size.
+ * The serial implementation has roughly O(n log n) complexity.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void bluestein(viennacl::vector<NumericT, AlignmentV> & in,
+ viennacl::vector<NumericT, AlignmentV> & out, vcl_size_t /*batch_num*/)
+{
+ typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
+
+ vcl_size_t size = in.size() >> 1;
+ vcl_size_t ext_size = viennacl::linalg::cuda::detail::fft::next_power_2(2 * size - 1);
+
+ viennacl::vector<NumericT, AlignmentV> A(ext_size << 1);
+ viennacl::vector<NumericT, AlignmentV> B(ext_size << 1);
+ viennacl::vector<NumericT, AlignmentV> Z(ext_size << 1);
+
+ zero2<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(A)),
+ reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(B)),
+ static_cast<unsigned int>(ext_size));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("zero2");
+
+ bluestein_pre<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
+ reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(A)),
+ reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(B)),
+ static_cast<unsigned int>(size),
+ static_cast<unsigned int>(ext_size),
+ NumericT(1));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("bluestein_pre");
+
+ viennacl::linalg::convolve_i(A, B, Z);
+
+ bluestein_post<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(Z)),
+ reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(out)),
+ static_cast<unsigned int>(size),
+ NumericT(1));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("bluestein_post");
+}
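
Put together, this is the standard Bluestein identity (an editorial restatement): with a_k = exp(-i*pi*k^2/n) and b_k = conj(a_k), the transform is X_k = a_k * sum_m (x_m * a_m) * b_(k-m), i.e. a pre-multiplication by the chirp (bluestein_pre), a circular convolution of length ext_size (convolve_i), and a post-multiplication by the chirp (bluestein_post).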
+
+template<typename NumericT>
+__global__ void fft_mult_vec(const NumericT * input1,
+ const NumericT * input2,
+ NumericT * output,
+ unsigned int size)
+{
+ for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+ {
+ NumericT in1 = input1[i];
+ NumericT in2 = input2[i];
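+    // note: multiply_complex() below instantiates NumericT with the complex pair type (float2/double2), so this product relies on the complex operator* overload for that type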
+ output[i] = in1 * in2;
+ }
+}
+
+/**
+ * @brief Multiply two complex vectors and store the result in output
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void multiply_complex(viennacl::vector<NumericT, AlignmentV> const & input1,
+ viennacl::vector<NumericT, AlignmentV> const & input2,
+ viennacl::vector<NumericT, AlignmentV> & output)
+{
+ typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
+
+ vcl_size_t size = input1.size() / 2;
+
+ fft_mult_vec<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(input1)),
+ reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(input2)),
+ reinterpret_cast< numeric2_type *>(viennacl::cuda_arg(output)),
+ static_cast<unsigned int>(size));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("fft_mult_vec");
+}
+
+template<typename Numeric2T, typename NumericT>
+__global__ void fft_div_vec_scalar(Numeric2T * input1, unsigned int size, NumericT factor)
+{
+ for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x*blockDim.x)
+ input1[i] = input1[i]/factor;
+}
+
+/**
+ * @brief Normalize the vector by its own size (the number of complex elements)
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void normalize(viennacl::vector<NumericT, AlignmentV> & input)
+{
+ typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
+
+ vcl_size_t size = input.size() >> 1;
+ NumericT norm_factor = static_cast<NumericT>(size);
+ fft_div_vec_scalar<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(input)),
+ static_cast<unsigned int>(size),
+ norm_factor);
+ VIENNACL_CUDA_LAST_ERROR_CHECK("fft_div_vec_scalar");
+}
+
+template<typename NumericT>
+__global__ void transpose(const NumericT * input,
+ NumericT * output,
+ unsigned int row_num,
+ unsigned int col_num)
+{
+ unsigned int size = row_num * col_num;
+  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+ {
+ unsigned int row = i / col_num;
+ unsigned int col = i - row*col_num;
+ unsigned int new_pos = col * row_num + row;
+ output[new_pos] = input[i];
+ }
+}
+
+/**
+ * @brief Transpose matrix
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & input,
+ viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & output)
+{
+ typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
+
+ transpose<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(input)),
+ reinterpret_cast< numeric2_type *>(viennacl::cuda_arg(output)),
+ static_cast<unsigned int>(input.internal_size1()>>1),
+ static_cast<unsigned int>(input.internal_size2()>>1));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("transpose");
+
+}
+
+template<typename NumericT>
+__global__ void transpose_inplace(
+ NumericT * input,
+ unsigned int row_num,
+ unsigned int col_num)
+{
+ unsigned int size = row_num * col_num;
+ for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i+= gridDim.x * blockDim.x)
+ {
+ unsigned int row = i / col_num;
+ unsigned int col = i - row*col_num;
+ unsigned int new_pos = col * row_num + row;
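+    // pairwise swap with the transposed position; this assumes the mapping i <-> new_pos is self-inverse, which holds for square matrices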
+ if (i < new_pos)
+ {
+ NumericT val = input[i];
+ input[i] = input[new_pos];
+ input[new_pos] = val;
+ }
+ }
+}
+
+/**
+ * @brief In-place transpose of a matrix
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & input)
+{
+ typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
+
+ transpose_inplace<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(input)),
+ static_cast<unsigned int>(input.internal_size1()>>1),
+ static_cast<unsigned int>(input.internal_size2() >> 1));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("transpose_inplace");
+
+}
+
+template<typename RealT,typename ComplexT>
+__global__ void real_to_complex(const RealT * in, ComplexT * out, unsigned int size)
+{
+ for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+ {
+ ComplexT val;
+ val.x = in[i];
+ val.y = 0;
+ out[i] = val;
+ }
+}
+
+/**
+ * @brief Create a complex vector from a real vector (in the output, even elements (2*k) hold the real part, odd elements (2*k+1) the imaginary part)
+ */
+template<typename NumericT>
+void real_to_complex(viennacl::vector_base<NumericT> const & in,
+ viennacl::vector_base<NumericT> & out, vcl_size_t size)
+{
+ typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
+
+ real_to_complex<<<128,128>>>(viennacl::cuda_arg(in),
+ reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(out)),
+ static_cast<unsigned int>(size));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("real_to_complex");
+}
+
+template<typename ComplexT,typename RealT>
+__global__ void complex_to_real(const ComplexT * in, RealT * out, unsigned int size)
+{
+ for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
+ out[i] = in[i].x;
+}
+
+/**
+ * @brief Create a real vector from a complex vector (in the input, even elements (2*k) hold the real part, odd elements (2*k+1) the imaginary part)
+ */
+template<typename NumericT>
+void complex_to_real(viennacl::vector_base<NumericT> const & in,
+ viennacl::vector_base<NumericT>& out, vcl_size_t size)
+{
+ typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;
+
+ complex_to_real<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(in)),
+ viennacl::cuda_arg(out),
+ static_cast<unsigned int>(size));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("complex_to_real");
+
+}
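
A round-trip sketch (editorial; the sample count is a placeholder):

  const vcl_size_t n = 16;   // number of real samples
  viennacl::vector<float> real_in(n), complex_buf(2 * n), real_out(n);
  // ... fill real_in ...
  viennacl::linalg::cuda::real_to_complex(real_in, complex_buf, n);   // imaginary parts are set to 0
  // ... e.g. run an FFT on complex_buf ...
  viennacl::linalg::cuda::complex_to_real(complex_buf, real_out, n);  // keeps only the real parts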
+
+template<typename NumericT>
+__global__ void reverse_inplace(NumericT * vec, unsigned int size)
+{
+  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < (size >> 1); i += gridDim.x * blockDim.x)
+ {
+ NumericT val1 = vec[i];
+ NumericT val2 = vec[size - i - 1];
+ vec[i] = val2;
+ vec[size - i - 1] = val1;
+ }
+}
+
+/**
+ * @brief Reverse the order of the vector's elements in place
+ */
+template<typename NumericT>
+void reverse(viennacl::vector_base<NumericT>& in)
+{
+ vcl_size_t size = in.size();
+ reverse_inplace<<<128,128>>>(viennacl::cuda_arg(in), static_cast<unsigned int>(size));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("reverse_inplace");
+}
+
+} //namespace cuda
+} //namespace linalg
+} //namespace viennacl
+
+#endif /* FFT_OPERATIONS_HPP_ */
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/ilu_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/ilu_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/ilu_operations.hpp
new file mode 100644
index 0000000..302a73c
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/ilu_operations.hpp
@@ -0,0 +1,666 @@
+#ifndef VIENNACL_LINALG_CUDA_ILU_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_ILU_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/ilu_operations.hpp
+ @brief Implementations of specialized routines for the Chow-Patel parallel ILU preconditioner using CUDA
+*/
+
+#include <cmath>
+#include <algorithm> //for std::max and std::min
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/linalg/vector_operations.hpp"
+#include "viennacl/traits/stride.hpp"
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
+template<typename IndexT> // to control external linkage
+__global__ void extract_L_kernel_1(
+ const IndexT * A_row_indices,
+ const IndexT * A_col_indices,
+ unsigned int A_size1,
+ unsigned int * L_row_indices)
+{
+ for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
+ row < A_size1;
+ row += gridDim.x * blockDim.x)
+ {
+ unsigned int row_begin = A_row_indices[row];
+ unsigned int row_end = A_row_indices[row+1];
+
+ unsigned int num_entries_L = 0;
+ for (unsigned int j=row_begin; j<row_end; ++j)
+ {
+ unsigned int col = A_col_indices[j];
+ if (col <= row)
+ ++num_entries_L;
+ }
+
+ L_row_indices[row] = num_entries_L;
+ }
+}
+
+template<typename NumericT>
+__global__ void extract_L_kernel_2(
+ unsigned int const *A_row_indices,
+ unsigned int const *A_col_indices,
+ NumericT const *A_elements,
+ unsigned int A_size1,
+
+ unsigned int const *L_row_indices,
+ unsigned int *L_col_indices,
+ NumericT *L_elements)
+{
+ for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
+ row < A_size1;
+ row += gridDim.x * blockDim.x)
+ {
+ unsigned int row_begin = A_row_indices[row];
+ unsigned int row_end = A_row_indices[row+1];
+
+ unsigned int index_L = L_row_indices[row];
+ for (unsigned int j = row_begin; j < row_end; ++j)
+ {
+ unsigned int col = A_col_indices[j];
+ NumericT value = A_elements[j];
+
+ if (col <= row)
+ {
+ L_col_indices[index_L] = col;
+ L_elements[index_L] = value;
+ ++index_L;
+ }
+ }
+ }
+}
+
+template<typename NumericT>
+void extract_L(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & L)
+{
+ //
+ // Step 1: Count elements in L and U:
+ //
+ extract_L_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ static_cast<unsigned int>(A.size1()),
+ viennacl::cuda_arg<unsigned int>(L.handle1())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("extract_L_kernel_1");
+
+ //
+ // Step 2: Exclusive scan on row_buffers:
+ //
+ viennacl::vector<unsigned int> wrapped_L_row_buffer(viennacl::cuda_arg<unsigned int>(L.handle1().cuda_handle()), viennacl::CUDA_MEMORY, A.size1() + 1);
+ viennacl::linalg::exclusive_scan(wrapped_L_row_buffer, wrapped_L_row_buffer);
+ L.reserve(wrapped_L_row_buffer[L.size1()], false);
+
+ //
+ // Step 3: Write entries
+ //
+ extract_L_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ viennacl::cuda_arg<NumericT>(A.handle()),
+ static_cast<unsigned int>(A.size1()),
+ viennacl::cuda_arg<unsigned int>(L.handle1()),
+ viennacl::cuda_arg<unsigned int>(L.handle2()),
+ viennacl::cuda_arg<NumericT>(L.handle())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("extract_L_kernel_2");
+
+ L.generate_row_block_information();
+
+} // extract_L
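
A small worked illustration (editorial) of the count / exclusive-scan / fill pattern used above: if extract_L_kernel_1 writes per-row counts {2, 1, 3}, the exclusive scan turns L.handle1() into the CSR row pointers {0, 2, 3, 6}, and the last value (6) is the number of nonzeros handed to L.reserve(). In plain C++:

  unsigned int counts[4] = {2, 1, 3, 0};   // one extra slot, as in L.handle1()
  unsigned int running = 0;
  for (int i = 0; i < 4; ++i)
  {
    unsigned int c = counts[i];
    counts[i] = running;                   // exclusive prefix sum
    running += c;
  }
  // counts is now {0, 2, 3, 6}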
+
+///////////////////////////////////////////////
+
+
+template<typename NumericT>
+__global__ void ilu_scale_kernel_1(
+ unsigned int const *A_row_indices,
+ unsigned int const *A_col_indices,
+ NumericT const *A_elements,
+ unsigned int A_size1,
+
+ NumericT *D_elements)
+{
+ for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
+ row < A_size1;
+ row += gridDim.x * blockDim.x)
+ {
+ unsigned int row_begin = A_row_indices[row];
+ unsigned int row_end = A_row_indices[row+1];
+
+ for (unsigned int j = row_begin; j < row_end; ++j)
+ {
+ unsigned int col = A_col_indices[j];
+ if (row == col)
+ {
+ D_elements[row] = NumericT(1) / sqrt(fabs(A_elements[j]));
+ break;
+ }
+ }
+ }
+}
+
+/** @brief Scales values in a matrix such that output = D * input * D, where D is a diagonal matrix (only the diagonal is provided) */
+template<typename NumericT>
+__global__ void ilu_scale_kernel_2(
+ unsigned int const *R_row_indices,
+ unsigned int const *R_col_indices,
+ NumericT *R_elements,
+ unsigned int R_size1,
+
+ NumericT *D_elements)
+{
+ for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
+ row < R_size1;
+ row += gridDim.x * blockDim.x)
+ {
+ unsigned int row_begin = R_row_indices[row];
+ unsigned int row_end = R_row_indices[row+1];
+
+ NumericT D_row = D_elements[row];
+
+ for (unsigned int j = row_begin; j < row_end; ++j)
+ R_elements[j] *= D_row * D_elements[R_col_indices[j]];
+ }
+}
+
+
+
+/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates the values from A in L accordingly. */
+template<typename NumericT>
+void icc_scale(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & L)
+{
+ viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A));
+
+ // fill D:
+ ilu_scale_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ viennacl::cuda_arg<NumericT>(A.handle()),
+ static_cast<unsigned int>(A.size1()),
+ viennacl::cuda_arg(D)
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_1");
+
+ // scale L:
+ ilu_scale_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.handle1()),
+ viennacl::cuda_arg<unsigned int>(L.handle2()),
+ viennacl::cuda_arg<NumericT>(L.handle()),
+ static_cast<unsigned int>(L.size1()),
+ viennacl::cuda_arg(D)
+ );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_2");
+}
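
In other words, D holds d_i = 1/sqrt(|a_ii|), so the scaled diagonal entries become d_i * a_ii * d_i = a_ii / |a_ii| = +/-1; for example a_ii = 4 gives d_i = 0.5 and a scaled value of 0.5 * 4 * 0.5 = 1.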
+
+/////////////////////////////////////
+
+/** @brief CUDA kernel for one Chow-Patel-ICC sweep */
+template<typename NumericT>
+__global__ void icc_chow_patel_sweep_kernel(
+ unsigned int const *L_row_indices,
+ unsigned int const *L_col_indices,
+ NumericT *L_elements,
+ NumericT const *L_backup,
+ unsigned int L_size1,
+ NumericT const *aij_L)
+{
+ for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
+ row < L_size1;
+ row += gridDim.x * blockDim.x)
+ {
+ //
+ // update L:
+ //
+ unsigned int row_Li_start = L_row_indices[row];
+ unsigned int row_Li_end = L_row_indices[row + 1];
+
+ for (unsigned int i = row_Li_start; i < row_Li_end; ++i)
+ {
+ unsigned int col = L_col_indices[i];
+
+ unsigned int row_Lj_start = L_row_indices[col];
+ unsigned int row_Lj_end = L_row_indices[col + 1];
+
+      // compute \sum_k l_ik * l_jk over the common sparsity pattern
+ unsigned int index_Lj = row_Lj_start;
+ unsigned int col_Lj = L_col_indices[index_Lj];
+ NumericT s = aij_L[i];
+ for (unsigned int index_Li = row_Li_start; index_Li < i; ++index_Li)
+ {
+ unsigned int col_Li = L_col_indices[index_Li];
+
+ // find element in U
+ while (col_Lj < col_Li)
+ {
+ ++index_Lj;
+ col_Lj = L_col_indices[index_Lj];
+ }
+
+ if (col_Lj == col_Li)
+ s -= L_backup[index_Li] * L_backup[index_Lj];
+ }
+
+ // update l_ij:
+      L_elements[i] = (row == col) ? sqrt(s) : (s / L_backup[row_Lj_end - 1]); // the diagonal element l_jj is the last entry in row 'col' of L
+ }
+
+ }
+}
+
+
+/** @brief Performs one nonlinear relaxation step of the Chow-Patel incomplete Cholesky (ICC) factorization using CUDA (cf. Algorithm 2 in the paper) */
+template<typename NumericT>
+void icc_chow_patel_sweep(compressed_matrix<NumericT> & L,
+ vector<NumericT> const & aij_L)
+{
+ viennacl::backend::mem_handle L_backup;
+ viennacl::backend::memory_create(L_backup, L.handle().raw_size(), viennacl::traits::context(L));
+ viennacl::backend::memory_copy(L.handle(), L_backup, 0, 0, L.handle().raw_size());
+
+ icc_chow_patel_sweep_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.handle1()),
+ viennacl::cuda_arg<unsigned int>(L.handle2()),
+ viennacl::cuda_arg<NumericT>(L.handle()),
+ viennacl::cuda_arg<NumericT>(L_backup),
+ static_cast<unsigned int>(L.size1()),
+
+ viennacl::cuda_arg<NumericT>(aij_L.handle())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("icc_chow_patel_sweep_kernel");
+
+}
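
A usage sketch (editorial): each call reads the values of the previous sweep through the L_backup copy, so updates within one sweep do not interfere, and the sweep is meant to be applied a small, fixed number of times, e.g.

  // assuming L and aij_L have been prepared by extract_L / icc_scale beforehand
  for (int sweep = 0; sweep < 3; ++sweep)
    viennacl::linalg::cuda::icc_chow_patel_sweep(L, aij_L);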
+
+
+////////////////////////////// ILU ///////////////////////////
+
+template<typename IndexT> // to control external linkage
+__global__ void extract_LU_kernel_1(
+ const IndexT * A_row_indices,
+ const IndexT * A_col_indices,
+ unsigned int A_size1,
+
+ unsigned int * L_row_indices,
+
+ unsigned int * U_row_indices)
+{
+ for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
+ row < A_size1;
+ row += gridDim.x * blockDim.x)
+ {
+ unsigned int row_begin = A_row_indices[row];
+ unsigned int row_end = A_row_indices[row+1];
+
+ unsigned int num_entries_L = 0;
+ unsigned int num_entries_U = 0;
+ for (unsigned int j=row_begin; j<row_end; ++j)
+ {
+ unsigned int col = A_col_indices[j];
+ if (col <= row)
+ ++num_entries_L;
+ if (col >= row)
+ ++num_entries_U;
+ }
+
+ L_row_indices[row] = num_entries_L;
+ U_row_indices[row] = num_entries_U;
+ }
+}
+
+template<typename NumericT>
+__global__ void extract_LU_kernel_2(
+ unsigned int const *A_row_indices,
+ unsigned int const *A_col_indices,
+ NumericT const *A_elements,
+ unsigned int A_size1,
+
+ unsigned int const *L_row_indices,
+ unsigned int *L_col_indices,
+ NumericT *L_elements,
+
+ unsigned int const *U_row_indices,
+ unsigned int *U_col_indices,
+ NumericT *U_elements)
+{
+ for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
+ row < A_size1;
+ row += gridDim.x * blockDim.x)
+ {
+ unsigned int row_begin = A_row_indices[row];
+ unsigned int row_end = A_row_indices[row+1];
+
+ unsigned int index_L = L_row_indices[row];
+ unsigned int index_U = U_row_indices[row];
+ for (unsigned int j = row_begin; j < row_end; ++j)
+ {
+ unsigned int col = A_col_indices[j];
+ NumericT value = A_elements[j];
+
+ if (col <= row)
+ {
+ L_col_indices[index_L] = col;
+ L_elements[index_L] = value;
+ ++index_L;
+ }
+
+ if (col >= row)
+ {
+ U_col_indices[index_U] = col;
+ U_elements[index_U] = value;
+ ++index_U;
+ }
+ }
+ }
+}
+
+template<typename NumericT>
+void extract_LU(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & L,
+ compressed_matrix<NumericT> & U)
+{
+ //
+ // Step 1: Count elements in L and U:
+ //
+ extract_LU_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ static_cast<unsigned int>(A.size1()),
+ viennacl::cuda_arg<unsigned int>(L.handle1()),
+ viennacl::cuda_arg<unsigned int>(U.handle1())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("extract_LU_kernel_1");
+
+ //
+ // Step 2: Exclusive scan on row_buffers:
+ //
+ viennacl::vector<unsigned int> wrapped_L_row_buffer(viennacl::cuda_arg<unsigned int>(L.handle1()), viennacl::CUDA_MEMORY, A.size1() + 1);
+ viennacl::linalg::exclusive_scan(wrapped_L_row_buffer, wrapped_L_row_buffer);
+ L.reserve(wrapped_L_row_buffer[L.size1()], false);
+
+ viennacl::vector<unsigned int> wrapped_U_row_buffer(viennacl::cuda_arg<unsigned int>(U.handle1()), viennacl::CUDA_MEMORY, A.size1() + 1);
+ viennacl::linalg::exclusive_scan(wrapped_U_row_buffer, wrapped_U_row_buffer);
+ U.reserve(wrapped_U_row_buffer[U.size1()], false);
+
+ //
+ // Step 3: Write entries
+ //
+ extract_LU_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ viennacl::cuda_arg<NumericT>(A.handle()),
+ static_cast<unsigned int>(A.size1()),
+ viennacl::cuda_arg<unsigned int>(L.handle1()),
+ viennacl::cuda_arg<unsigned int>(L.handle2()),
+ viennacl::cuda_arg<NumericT>(L.handle()),
+ viennacl::cuda_arg<unsigned int>(U.handle1()),
+ viennacl::cuda_arg<unsigned int>(U.handle2()),
+ viennacl::cuda_arg<NumericT>(U.handle())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("extract_LU_kernel_2");
+
+ L.generate_row_block_information();
+ // Note: block information for U will be generated after transposition
+
+} // extract_LU
+
+///////////////////////////////////////////////
+
+/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L and U accordingly. */
+template<typename NumericT>
+void ilu_scale(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & L,
+ compressed_matrix<NumericT> & U)
+{
+ viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A));
+
+ // fill D:
+ ilu_scale_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
+ viennacl::cuda_arg<unsigned int>(A.handle2()),
+ viennacl::cuda_arg<NumericT>(A.handle()),
+ static_cast<unsigned int>(A.size1()),
+ viennacl::cuda_arg<NumericT>(D.handle())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_1");
+
+ // scale L:
+ ilu_scale_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.handle1()),
+ viennacl::cuda_arg<unsigned int>(L.handle2()),
+ viennacl::cuda_arg<NumericT>(L.handle()),
+ static_cast<unsigned int>(L.size1()),
+ viennacl::cuda_arg<NumericT>(D.handle())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_2");
+
+ // scale U:
+ ilu_scale_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(U.handle1()),
+ viennacl::cuda_arg<unsigned int>(U.handle2()),
+ viennacl::cuda_arg<NumericT>(U.handle()),
+ static_cast<unsigned int>(U.size1()),
+ viennacl::cuda_arg<NumericT>(D.handle())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_2");
+}
+
+/////////////////////////////////////
+
+/** @brief CUDA kernel for one Chow-Patel-ILU sweep */
+template<typename NumericT>
+__global__ void ilu_chow_patel_sweep_kernel(
+ unsigned int const *L_row_indices,
+ unsigned int const *L_col_indices,
+ NumericT *L_elements,
+ NumericT const *L_backup,
+ unsigned int L_size1,
+
+ NumericT const *aij_L,
+
+ unsigned int const *U_trans_row_indices,
+ unsigned int const *U_trans_col_indices,
+ NumericT *U_trans_elements,
+ NumericT const *U_trans_backup,
+
+ NumericT const *aij_U_trans)
+{
+ for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
+ row < L_size1;
+ row += gridDim.x * blockDim.x)
+ {
+ //
+ // update L:
+ //
+ unsigned int row_L_start = L_row_indices[row];
+ unsigned int row_L_end = L_row_indices[row + 1];
+
+ for (unsigned int j = row_L_start; j < row_L_end; ++j)
+ {
+ unsigned int col = L_col_indices[j];
+
+ if (col == row)
+ continue;
+
+ unsigned int row_U_start = U_trans_row_indices[col];
+ unsigned int row_U_end = U_trans_row_indices[col + 1];
+
+ // compute \sum_{k=1}^{j-1} l_ik u_kj
+ unsigned int index_U = row_U_start;
+ unsigned int col_U = (index_U < row_U_end) ? U_trans_col_indices[index_U] : L_size1;
+ NumericT sum = 0;
+ for (unsigned int k = row_L_start; k < j; ++k)
+ {
+ unsigned int col_L = L_col_indices[k];
+
+ // find element in U
+ while (col_U < col_L)
+ {
+ ++index_U;
+ col_U = U_trans_col_indices[index_U];
+ }
+
+ if (col_U == col_L)
+ sum += L_backup[k] * U_trans_backup[index_U];
+ }
+
+ // update l_ij:
+ L_elements[j] = (aij_L[j] - sum) / U_trans_backup[row_U_end - 1]; // diagonal element is last entry in U
+ }
+
+
+ //
+ // update U:
+ //
+ unsigned int row_U_start = U_trans_row_indices[row];
+ unsigned int row_U_end = U_trans_row_indices[row + 1];
+ for (unsigned int j = row_U_start; j < row_U_end; ++j)
+ {
+ unsigned int col = U_trans_col_indices[j];
+
+ row_L_start = L_row_indices[col];
+ row_L_end = L_row_indices[col + 1];
+
+ // compute \sum_{k=1}^{j-1} l_ik u_kj
+ unsigned int index_L = row_L_start;
+ unsigned int col_L = (index_L < row_L_end) ? L_col_indices[index_L] : L_size1;
+ NumericT sum = 0;
+ for (unsigned int k = row_U_start; k < j; ++k)
+ {
+ unsigned int col_U = U_trans_col_indices[k];
+
+ // find element in L
+ while (col_L < col_U)
+ {
+ ++index_L;
+ col_L = L_col_indices[index_L];
+ }
+
+ if (col_U == col_L)
+ sum += L_backup[index_L] * U_trans_backup[k];
+ }
+
+ // update u_ij:
+ U_trans_elements[j] = aij_U_trans[j] - sum;
+ }
+ }
+}
+
+
+/** @brief Performs one nonlinear relaxation step of the Chow-Patel ILU factorization using CUDA (cf. Algorithm 2 in the paper) */
+template<typename NumericT>
+void ilu_chow_patel_sweep(compressed_matrix<NumericT> & L,
+ vector<NumericT> const & aij_L,
+ compressed_matrix<NumericT> & U_trans,
+ vector<NumericT> const & aij_U_trans)
+{
+ viennacl::backend::mem_handle L_backup;
+ viennacl::backend::memory_create(L_backup, L.handle().raw_size(), viennacl::traits::context(L));
+ viennacl::backend::memory_copy(L.handle(), L_backup, 0, 0, L.handle().raw_size());
+
+ viennacl::backend::mem_handle U_backup;
+ viennacl::backend::memory_create(U_backup, U_trans.handle().raw_size(), viennacl::traits::context(U_trans));
+ viennacl::backend::memory_copy(U_trans.handle(), U_backup, 0, 0, U_trans.handle().raw_size());
+
+ ilu_chow_patel_sweep_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.handle1()),
+ viennacl::cuda_arg<unsigned int>(L.handle2()),
+ viennacl::cuda_arg<NumericT>(L.handle()),
+ viennacl::cuda_arg<NumericT>(L_backup),
+ static_cast<unsigned int>(L.size1()),
+
+ viennacl::cuda_arg<NumericT>(aij_L.handle()),
+
+ viennacl::cuda_arg<unsigned int>(U_trans.handle1()),
+ viennacl::cuda_arg<unsigned int>(U_trans.handle2()),
+ viennacl::cuda_arg<NumericT>(U_trans.handle()),
+ viennacl::cuda_arg<NumericT>(U_backup),
+
+ viennacl::cuda_arg<NumericT>(aij_U_trans.handle())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_chow_patel_sweep_kernel");
+
+}
+
+//////////////////////////////////////
+
+template<typename NumericT>
+__global__ void ilu_form_neumann_matrix_kernel(
+ unsigned int const *R_row_indices,
+ unsigned int const *R_col_indices,
+ NumericT *R_elements,
+ unsigned int R_size1,
+
+ NumericT *D_elements)
+{
+ for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
+ row < R_size1;
+ row += gridDim.x * blockDim.x)
+ {
+ unsigned int row_begin = R_row_indices[row];
+ unsigned int row_end = R_row_indices[row+1];
+
+ // part 1: extract diagonal entry
+ NumericT diag = 0;
+ for (unsigned int j = row_begin; j < row_end; ++j)
+ {
+ unsigned int col = R_col_indices[j];
+ if (col == row)
+ {
+ diag = R_elements[j];
+ R_elements[j] = 0; // (I - D^{-1}R)
+ break;
+ }
+ }
+ D_elements[row] = diag;
+
+ // part2: scale
+ for (unsigned int j = row_begin; j < row_end; ++j)
+ R_elements[j] /= -diag;
+ }
+}
+
+
+
+template<typename NumericT>
+void ilu_form_neumann_matrix(compressed_matrix<NumericT> & R,
+ vector<NumericT> & diag_R)
+{
+ ilu_form_neumann_matrix_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(R.handle1()),
+ viennacl::cuda_arg<unsigned int>(R.handle2()),
+ viennacl::cuda_arg<NumericT>(R.handle()),
+ static_cast<unsigned int>(R.size1()),
+ viennacl::cuda_arg<NumericT>(diag_R.handle())
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_form_neumann_matrix_kernel");
+}
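
For context (editorial restatement): with N = I - D^{-1}R as formed above and D the extracted diagonal, R = D(I - N), so the triangular solve with R can be approximated by the truncated Neumann series R^{-1} ~ (I + N + N^2 + ...) D^{-1}; the kernel prepares N and D for such an expansion.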
+
+} //namespace cuda
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif