You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ap...@apache.org on 2016/06/08 21:40:32 UTC
[35/51] [partial] mahout git commit: (nojira) add native-viennaCL
module to codebase. closes apache/mahout#241
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations.hpp
new file mode 100644
index 0000000..912d24d
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations.hpp
@@ -0,0 +1,2725 @@
+#ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/matrix_operations.hpp
+ @brief Implementations of dense matrix related operations, including matrix-vector products, using CUDA.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+
+#include "viennacl/linalg/cuda/common.hpp"
+
+#include "viennacl/linalg/cuda/vector_operations.hpp"
+#include "viennacl/linalg/cuda/matrix_operations_row.hpp"
+#include "viennacl/linalg/cuda/matrix_operations_col.hpp"
+#include "viennacl/linalg/cuda/matrix_operations_prod.hpp"
+#include "viennacl/linalg/cuda/matrix_operations_prod.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+//
+// Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here!
+//
+
+/** @brief Copies the entries of mat2 into mat1, converting the numeric type
+* (e.g. double -> float) on the fly via convert_row_kernel / convert_col_kernel.
+*
+* @param mat1  Destination matrix of type DestNumericT
+* @param mat2  Source matrix of type SrcNumericT; must use the same layout (row- vs. column-major) as mat1
+*/
+template<typename DestNumericT, typename SrcNumericT>
+void convert(matrix_base<DestNumericT> & mat1, matrix_base<SrcNumericT> const & mat2)
+{
+ // Fixed copy-pasted assertion message: this routine converts, it does not add/subtract.
+ assert(mat1.row_major() == mat2.row_major() && bool("Conversion on mixed matrix layouts not supported yet!"));
+
+ if (mat1.row_major())
+ {
+ convert_row_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+ static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+ viennacl::cuda_arg(mat2),
+ static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("convert_row_kernel");
+ }
+ else
+ {
+ convert_col_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+ static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+ viennacl::cuda_arg(mat2),
+ static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("convert_col_kernel");
+ }
+}
+
+/** @brief Writes the transpose of the matrix wrapped by 'proxy' into 'temp_trans' (out-of-place transposition).
+*
+* @param proxy       Expression A^T; proxy.lhs() is the matrix being transposed
+* @param temp_trans  Destination matrix receiving the transposed entries
+*/
+template<typename NumericT, typename SizeT, typename DistanceT>
+void trans(matrix_expression<const matrix_base<NumericT, SizeT, DistanceT>,const matrix_base<NumericT, SizeT, DistanceT>, op_trans> const & proxy,
+ matrix_base<NumericT> & temp_trans)
+{
+ // NOTE(review): the argument order here is (start, internal_size, size, stride),
+ // unlike the (start, stride, size, internal_size) order used by the other kernel
+ // launches in this file -- it must match the trans_kernel signature; confirm
+ // against the kernel declaration before reordering.
+ trans_kernel<<<128,128>>>(viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(proxy.lhs().start1()), static_cast<unsigned int>(proxy.lhs().start2()),
+ static_cast<unsigned int>(proxy.lhs().internal_size1()), static_cast<unsigned int>(proxy.lhs().internal_size2()),
+ static_cast<unsigned int>(proxy.lhs().size1()), static_cast<unsigned int>(proxy.lhs().size2()),
+ static_cast<unsigned int>(proxy.lhs().stride1()), static_cast<unsigned int>(proxy.lhs().stride2()),
+
+ viennacl::cuda_arg(temp_trans),
+ static_cast<unsigned int>(temp_trans.start1()), static_cast<unsigned int>(temp_trans.start2()),
+ static_cast<unsigned int>(temp_trans.internal_size1()), static_cast<unsigned int>(temp_trans.internal_size2()),
+ static_cast<unsigned int>(temp_trans.stride1()), static_cast<unsigned int>(temp_trans.stride2()),
+ static_cast<bool>(proxy.lhs().row_major()));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("trans_kernel");
+}
+
+
+/** @brief Scaled matrix assignment: writes mat2 scaled by alpha into mat1.
+*
+* The reciprocal/sign treatment of alpha is encoded into an option bitfield via
+* detail::make_options() and applied inside am_row_kernel / am_col_kernel.
+*
+* @param mat1             Destination matrix
+* @param mat2             Source matrix; must use the same layout as mat1
+* @param alpha            Scaling factor (CPU scalar or viennacl::scalar)
+* @param len_alpha        Length entry folded into the option bitfield
+* @param reciprocal_alpha If true, the kernel uses 1/alpha
+* @param flip_sign_alpha  If true, the kernel flips the sign of alpha
+*/
+template<typename NumericT, typename ScalarT>
+void am(matrix_base<NumericT> & mat1,
+ matrix_base<NumericT> const & mat2, ScalarT const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
+{
+ assert(mat1.row_major() == mat2.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type;
+
+ unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+ // CPU scalars are copied to a stack temporary so that a valid address can be
+ // handed to the kernel via detail::arg_reference().
+ value_type temporary_alpha = 0;
+ if (viennacl::is_cpu_scalar<ScalarT>::value)
+ temporary_alpha = alpha;
+
+ if (mat1.row_major())
+ {
+ am_row_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+ static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+ viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+ options_alpha,
+ viennacl::cuda_arg(mat2),
+ static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("am_row_kernel");
+ }
+ else
+ {
+ am_col_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+ static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+ viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+ options_alpha,
+ viennacl::cuda_arg(mat2),
+ static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("am_col_kernel");
+ }
+}
+
+
+/** @brief Linear combination of two matrices: writes the combination of mat2
+* scaled by alpha and mat3 scaled by beta into mat1 (exact accumulation
+* semantics implemented by ambm_row_kernel / ambm_col_kernel).
+*
+* Reciprocal/sign handling for each factor is encoded into an option bitfield
+* via detail::make_options() and applied inside the kernel.
+*
+* @param mat1  Destination matrix
+* @param mat2  First source matrix (scaled by alpha); same layout as mat1
+* @param mat3  Second source matrix (scaled by beta); same layout as mat1
+*/
+template<typename NumericT, typename ScalarT1, typename ScalarT2>
+void ambm(matrix_base<NumericT> & mat1,
+ matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+ matrix_base<NumericT> const & mat3, ScalarT2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
+{
+ assert(mat1.row_major() == mat2.row_major() && mat1.row_major() == mat3.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type;
+
+ unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+ // CPU scalars are copied to stack temporaries so that a valid address can be
+ // handed to the kernel via detail::arg_reference().
+ value_type temporary_alpha = 0;
+ if (viennacl::is_cpu_scalar<ScalarT1>::value)
+ temporary_alpha = alpha;
+
+
+ unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);
+
+ value_type temporary_beta = 0;
+ if (viennacl::is_cpu_scalar<ScalarT2>::value)
+ temporary_beta = beta;
+
+
+ if (mat1.row_major())
+ {
+ ambm_row_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+ static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+ viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+ options_alpha,
+ viennacl::cuda_arg(mat2),
+ static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
+
+ viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+ options_beta,
+ viennacl::cuda_arg(mat3),
+ static_cast<unsigned int>(viennacl::traits::start1(mat3)), static_cast<unsigned int>(viennacl::traits::start2(mat3)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat3)), static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_row_kernel");
+ }
+ else
+ {
+ ambm_col_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+ static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+ viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+ options_alpha,
+ viennacl::cuda_arg(mat2),
+ static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
+
+ viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+ options_beta,
+ viennacl::cuda_arg(mat3),
+ static_cast<unsigned int>(viennacl::traits::start1(mat3)), static_cast<unsigned int>(viennacl::traits::start2(mat3)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat3)), static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_col_kernel");
+ }
+
+}
+
+
+/** @brief In-place variant of ambm(): updates mat1 with the combination of mat2
+* scaled by alpha and mat3 scaled by beta (the "_m" accumulation semantics are
+* implemented by ambm_m_row_kernel / ambm_m_col_kernel).
+*
+* Reciprocal/sign handling for each factor is encoded into an option bitfield
+* via detail::make_options() and applied inside the kernel.
+*
+* @param mat1  Matrix being updated
+* @param mat2  First source matrix (scaled by alpha); same layout as mat1
+* @param mat3  Second source matrix (scaled by beta); same layout as mat1
+*/
+template<typename NumericT, typename ScalarT1, typename ScalarT2>
+void ambm_m(matrix_base<NumericT> & mat1,
+ matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+ matrix_base<NumericT> const & mat3, ScalarT2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
+{
+ assert(mat1.row_major() == mat2.row_major() && mat1.row_major() == mat3.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type;
+
+ unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+ // CPU scalars are copied to stack temporaries so that a valid address can be
+ // handed to the kernel via detail::arg_reference().
+ value_type temporary_alpha = 0;
+ if (viennacl::is_cpu_scalar<ScalarT1>::value)
+ temporary_alpha = alpha;
+
+
+ unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);
+
+ value_type temporary_beta = 0;
+ if (viennacl::is_cpu_scalar<ScalarT2>::value)
+ temporary_beta = beta;
+
+
+ if (mat1.row_major())
+ {
+ ambm_m_row_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+ static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+ viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+ options_alpha,
+ viennacl::cuda_arg(mat2),
+ static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
+
+ viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+ options_beta,
+ viennacl::cuda_arg(mat3),
+ static_cast<unsigned int>(viennacl::traits::start1(mat3)), static_cast<unsigned int>(viennacl::traits::start2(mat3)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat3)), static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_m_row_kernel");
+ }
+ else
+ {
+ ambm_m_col_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+ static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+ viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+ options_alpha,
+ viennacl::cuda_arg(mat2),
+ static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
+
+ viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+ options_beta,
+ viennacl::cuda_arg(mat3),
+ static_cast<unsigned int>(viennacl::traits::start1(mat3)), static_cast<unsigned int>(viennacl::traits::start2(mat3)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat3)), static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_m_col_kernel");
+ }
+
+}
+
+
+
+
+/** @brief Assigns the scalar value 's' to all entries of 'mat'.
+*
+* @param mat    Matrix to fill
+* @param s      Value written to every entry
+* @param clear  If true, the padding region is overwritten as well (the kernel
+*               runs over the internal sizes instead of the logical sizes)
+*/
+template<typename NumericT>
+void matrix_assign(matrix_base<NumericT> & mat, NumericT s, bool clear = false)
+{
+ typedef NumericT value_type;
+ value_type alpha = s;
+
+ // Explicit casts match the convention used for every other kernel argument in
+ // this file and avoid implicit vcl_size_t -> unsigned int narrowing warnings.
+ unsigned int s1 = clear ? static_cast<unsigned int>(viennacl::traits::internal_size1(mat)) : static_cast<unsigned int>(viennacl::traits::size1(mat));
+ unsigned int s2 = clear ? static_cast<unsigned int>(viennacl::traits::internal_size2(mat)) : static_cast<unsigned int>(viennacl::traits::size2(mat));
+
+ if (mat.row_major())
+ {
+
+ matrix_row_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
+ static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
+ s1, s2,
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
+ alpha);
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_assign_kernel");
+ }
+ else
+ {
+ matrix_col_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
+ static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
+ s1, s2,
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
+ alpha);
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_assign_kernel");
+ }
+}
+
+/** @brief Assigns the scalar value 's' to the diagonal entries of 'mat';
+* off-diagonal entries are left untouched.
+*
+* @param mat  Matrix whose diagonal is written
+* @param s    Value written to each diagonal entry
+*/
+template<typename NumericT>
+void matrix_diagonal_assign(matrix_base<NumericT> & mat, NumericT s)
+{
+ typedef NumericT value_type;
+ value_type alpha = s;
+
+ if (mat.row_major())
+ {
+ matrix_row_diagonal_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
+ static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
+ static_cast<unsigned int>(viennacl::traits::size1(mat)), static_cast<unsigned int>(viennacl::traits::size2(mat)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
+ alpha);
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_diagonal_assign_kernel");
+ }
+ else
+ {
+ matrix_col_diagonal_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
+ static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)),
+ static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)),
+ static_cast<unsigned int>(viennacl::traits::size1(mat)), static_cast<unsigned int>(viennacl::traits::size2(mat)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
+ alpha);
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_diagonal_assign_kernel");
+ }
+}
+
+
+/** @brief Zeros 'mat' and writes 'vec' onto its k-th diagonal.
+*
+* k = 0 selects the main diagonal; k < 0 shifts the first written entry down by
+* -k rows (subdiagonal), k > 0 shifts it right by k columns (superdiagonal).
+* The diagonal forms a strided 1D range in memory, so the generic vector kernel
+* av_kernel is reused to perform the copy.
+*
+* @param vec  Vector supplying the diagonal entries
+* @param k    Diagonal offset
+* @param mat  Destination matrix (fully overwritten)
+*/
+template<typename NumericT>
+void matrix_diag_from_vector(const vector_base<NumericT> & vec, int k, matrix_base<NumericT> & mat)
+{
+ typedef NumericT value_type;
+
+ // Step 1: assign zero matrix:
+ matrix_assign(mat, NumericT(0));
+
+ // Step 2: Assign diagonal:
+ unsigned int options_alpha = 0;
+
+ vcl_size_t mat_start = 0;
+ vcl_size_t mat_stride = 0;
+ vcl_size_t mat_size = viennacl::traits::size(vec);
+ if (mat.row_major())
+ {
+ vcl_size_t first_row_index = 0;
+ vcl_size_t first_col_index = 0;
+ if (k < 0)
+ first_row_index = vcl_size_t(-k);
+ else
+ first_col_index = vcl_size_t(k);
+ // Linear offset of the first diagonal element; stepping along the diagonal
+ // advances one row (stride1 * internal_size2) plus one column (stride2).
+ mat_start = (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat)
+ + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat);
+ mat_stride = viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat);
+ }
+ else
+ {
+ vcl_size_t first_row_index = 0;
+ vcl_size_t first_col_index = 0;
+ if (k < 0)
+ first_row_index = vcl_size_t(-k);
+ else
+ first_col_index = vcl_size_t(k);
+ // Column-major mirror image: one column is internal_size1 elements apart.
+ mat_start = viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)
+ + (viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat);
+ mat_stride = viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat);
+ }
+
+ av_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
+ static_cast<unsigned int>(mat_start),
+ static_cast<unsigned int>(mat_stride),
+ static_cast<unsigned int>(mat_size),
+
+ viennacl::cuda_arg<value_type>(NumericT(1)),
+ options_alpha,
+ viennacl::cuda_arg(vec),
+ static_cast<unsigned int>(viennacl::traits::start(vec)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec)) );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
+}
+
+/** @brief Extracts the k-th diagonal of 'mat' into 'vec'.
+*
+* Same diagonal-offset convention as matrix_diag_from_vector: k < 0 starts -k
+* rows down (subdiagonal), k > 0 starts k columns right (superdiagonal). The
+* diagonal is addressed as a strided 1D range, so the generic av_kernel copies
+* it into the vector.
+*
+* @param mat  Source matrix
+* @param k    Diagonal offset
+* @param vec  Destination vector (length checked by the dispatcher frontend)
+*/
+template<typename NumericT>
+void matrix_diag_to_vector(matrix_base<NumericT> const & mat, int k, vector_base<NumericT> & vec)
+{
+ typedef NumericT value_type;
+
+ unsigned int options_alpha = 0;
+
+ vcl_size_t mat_start = 0;
+ vcl_size_t mat_stride = 0;
+ if (mat.row_major())
+ {
+ vcl_size_t first_row_index = 0;
+ vcl_size_t first_col_index = 0;
+ if (k < 0)
+ first_row_index = vcl_size_t(-k);
+ else
+ first_col_index = vcl_size_t(k);
+ // Stepping along the diagonal advances one row (stride1 * internal_size2)
+ // plus one column (stride2).
+ mat_start = (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat)
+ + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat);
+ mat_stride = viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat);
+ }
+ else
+ {
+ vcl_size_t first_row_index = 0;
+ vcl_size_t first_col_index = 0;
+ if (k < 0)
+ first_row_index = vcl_size_t(-k);
+ else
+ first_col_index = vcl_size_t(k);
+ // Column-major mirror image: one column is internal_size1 elements apart.
+ mat_start = viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)
+ + (viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat);
+ mat_stride = viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat);
+ }
+
+ av_kernel<<<128, 128>>>(viennacl::cuda_arg(vec),
+ static_cast<unsigned int>(viennacl::traits::start(vec)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec)),
+ static_cast<unsigned int>(viennacl::traits::size(vec)),
+
+ viennacl::cuda_arg<value_type>(NumericT(1)),
+ options_alpha,
+ viennacl::cuda_arg(mat),
+ static_cast<unsigned int>(mat_start),
+ static_cast<unsigned int>(mat_stride));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
+}
+
+/** @brief Extracts row 'i' of matrix 'mat' into vector 'vec'.
+*
+* The row is addressed as a strided 1D range so the generic av_kernel can be
+* reused for the copy.
+*
+* @param mat  Source matrix
+* @param i    Zero-based row index
+* @param vec  Destination vector (length checked by the dispatcher frontend)
+*/
+template<typename NumericT>
+void matrix_row(matrix_base<NumericT> const & mat, unsigned int i, vector_base<NumericT> & vec)
+{
+ typedef NumericT value_type;
+
+ unsigned int options_alpha = 0;
+
+ vcl_size_t mat_start = 0;
+ vcl_size_t mat_stride = 0;
+ if (mat.row_major())
+ {
+ // Row-major: elements of a row are stride2 apart.
+ mat_start = (viennacl::traits::start1(mat) + i * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat) + viennacl::traits::start2(mat);
+ mat_stride = viennacl::traits::stride2(mat);
+ }
+ else
+ {
+ // Column-major: consecutive columns of a fixed row are stride2 * internal_size1 apart.
+ mat_start = viennacl::traits::start1(mat) + i * viennacl::traits::stride1(mat) + viennacl::traits::start2(mat) * viennacl::traits::internal_size1(mat);
+ mat_stride = viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat);
+ }
+
+ av_kernel<<<128, 128>>>(viennacl::cuda_arg(vec),
+ static_cast<unsigned int>(viennacl::traits::start(vec)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec)),
+ static_cast<unsigned int>(viennacl::traits::size(vec)),
+
+ viennacl::cuda_arg<value_type>(NumericT(1)),
+ options_alpha,
+ viennacl::cuda_arg(mat),
+ static_cast<unsigned int>(mat_start),
+ static_cast<unsigned int>(mat_stride));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
+}
+
+/** @brief Extracts column 'j' of matrix 'mat' into vector 'vec'.
+*
+* The column is addressed as a strided 1D range so the generic av_kernel can be
+* reused for the copy.
+*
+* @param mat  Source matrix
+* @param j    Zero-based column index
+* @param vec  Destination vector (length checked by the dispatcher frontend)
+*/
+template<typename NumericT>
+void matrix_column(const matrix_base<NumericT> & mat, unsigned int j, vector_base<NumericT> & vec)
+{
+ typedef NumericT value_type;
+
+ unsigned int options_alpha = 0;
+
+ vcl_size_t mat_start = 0;
+ vcl_size_t mat_stride = 0;
+ if (mat.row_major())
+ {
+ // Element (i, j) lives at (start1 + i*stride1)*internal_size2 + start2 + j*stride2,
+ // so consecutive rows of a fixed column are stride1 * internal_size2 apart.
+ // (Fixed: previously used stride2 here, which steps along a row, not a column.)
+ mat_start = viennacl::traits::start1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::start2(mat) + j * viennacl::traits::stride2(mat);
+ mat_stride = viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat);
+ }
+ else
+ {
+ // Element (i, j) lives at start1 + i*stride1 + (start2 + j*stride2)*internal_size1,
+ // so consecutive rows of a fixed column are stride1 apart.
+ // (Fixed: previously used stride2 here.)
+ mat_start = viennacl::traits::start1(mat) + (viennacl::traits::start2(mat) + j * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat);
+ mat_stride = viennacl::traits::stride1(mat);
+ }
+
+ av_kernel<<<128, 128>>>(viennacl::cuda_arg(vec),
+ static_cast<unsigned int>(viennacl::traits::start(vec)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec)),
+ static_cast<unsigned int>(viennacl::traits::size(vec)),
+
+ viennacl::cuda_arg<value_type>(NumericT(1)),
+ options_alpha,
+ viennacl::cuda_arg(mat),
+ static_cast<unsigned int>(mat_start),
+ static_cast<unsigned int>(mat_stride));
+ VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
+}
+
+
+//
+///////////////////////// binary element-wise operations /////////////////////////////////
+//
+
+
+/** @brief Elementwise binary operation A = lhs OP rhs, where OP is product,
+* division or power, dispatched to the integer kernel variants
+* element_op_int_row_kernel / element_op_int_col_kernel. (A separate overload
+* below handles the floating-point kernels.)
+*
+* @param A      Destination matrix
+* @param proxy  Expression holding lhs, rhs and the operation tag OpT
+*/
+template<typename NumericT, typename SizeT, typename OpT>
+void element_op(matrix_base<NumericT, SizeT> & A,
+ matrix_expression<const matrix_base<NumericT, SizeT>, const matrix_base<NumericT, SizeT>, op_element_binary<OpT> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type;
+
+ unsigned int op_type = 2; //0: product, 1: division, 2: power
+ if (viennacl::is_division<OpT>::value)
+ op_type = 1;
+ else if (viennacl::is_product<OpT>::value)
+ op_type = 0;
+
+ if (A.row_major())
+ {
+ element_op_int_row_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
+
+ viennacl::cuda_arg(proxy.rhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
+
+ op_type
+ );
+ // Fixed: error-check label now names the kernel actually launched.
+ VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_int_row_kernel");
+ }
+ else
+ {
+ element_op_int_col_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
+
+ viennacl::cuda_arg(proxy.rhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
+
+ op_type
+ );
+ // Fixed: error-check label now names the kernel actually launched.
+ VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_int_col_kernel");
+ }
+}
+
+/** @brief Element-wise binary operation A = B .op C for float matrices, where .op is
+*   element-wise product, division, or power (selected at compile time via the OpT tag).
+*   Dispatches to a row-major or column-major CUDA kernel depending on the layout of A;
+*   mixed layouts are rejected by the assert. Fixed launch configuration: 128 blocks x 128 threads.
+*
+* @param A      The result matrix (its layout selects the kernel flavor)
+* @param proxy  Expression wrapper holding the operands B (proxy.lhs()) and C (proxy.rhs())
+*/
+template<typename SizeT, typename OpT>
+void element_op(matrix_base<float, SizeT> & A,
+ matrix_expression<const matrix_base<float, SizeT>, const matrix_base<float, SizeT>, op_element_binary<OpT> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef float value_type; // NOTE(review): unused local typedef, kept for consistency with sibling overloads
+
+ unsigned int op_type = 2; //0: product, 1: division, 2: power
+ if (viennacl::is_division<OpT>::value)
+ op_type = 1;
+ else if (viennacl::is_product<OpT>::value)
+ op_type = 0;
+
+ if (A.row_major())
+ {
+ // Each operand is described by its start/stride offsets (sub-matrix support) and
+ // internal (padded) sizes; only A additionally passes its logical size1/size2.
+ element_op_row_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
+
+ viennacl::cuda_arg(proxy.rhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
+
+ op_type
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel");
+ }
+ else
+ {
+ element_op_col_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
+
+ viennacl::cuda_arg(proxy.rhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
+
+ op_type
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel");
+ }
+}
+
+/** @brief Element-wise binary operation A = B .op C for double matrices, where .op is
+*   element-wise product, division, or power (selected at compile time via the OpT tag).
+*   Dispatches to a row-major or column-major CUDA kernel depending on the layout of A;
+*   mixed layouts are rejected by the assert. Fixed launch configuration: 128 blocks x 128 threads.
+*
+* @param A      The result matrix (its layout selects the kernel flavor)
+* @param proxy  Expression wrapper holding the operands B (proxy.lhs()) and C (proxy.rhs())
+*/
+template<typename SizeT, typename OpT>
+void element_op(matrix_base<double, SizeT> & A,
+ matrix_expression<const matrix_base<double, SizeT>, const matrix_base<double, SizeT>, op_element_binary<OpT> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef double value_type; // NOTE(review): unused local typedef, kept for consistency with sibling overloads
+
+ unsigned int op_type = 2; //0: product, 1: division, 2: power
+ if (viennacl::is_division<OpT>::value)
+ op_type = 1;
+ else if (viennacl::is_product<OpT>::value)
+ op_type = 0;
+
+ if (A.row_major())
+ {
+ // Each operand is described by its start/stride offsets (sub-matrix support) and
+ // internal (padded) sizes; only A additionally passes its logical size1/size2.
+ element_op_row_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
+
+ viennacl::cuda_arg(proxy.rhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
+
+ op_type
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel");
+ }
+ else
+ {
+ element_op_col_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
+
+ viennacl::cuda_arg(proxy.rhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
+
+ op_type
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel");
+ }
+}
+
+//
+///////////////////////// unary element-wise operations /////////////////////////////////
+//
+
+// Note: Due to CUDA vs C-proprocessor interference (concatenation seems to be broken in at least CUDA 4.2),
+// we could not find a more 'automatic' way of generating the overloads below...
+
+// abs
+/** @brief Element-wise absolute value: A = abs(B), with B = proxy.lhs(). Dispatches to a
+*   row- or column-major kernel based on A's layout (mixed layouts rejected by the assert);
+*   fixed launch configuration of 128 blocks x 128 threads. proxy.rhs() is only read by the assert.
+*/
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_abs> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type; // NOTE(review): unused local typedef, kept for consistency with sibling overloads
+
+ if (A.row_major())
+ {
+ matrix_row_element_abs_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_abs_kernel");
+ }
+ else
+ {
+ matrix_col_element_abs_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_abs_kernel");
+ }
+}
+
+
+// acos
+/** @brief Element-wise arc cosine: A = acos(B), with B = proxy.lhs(). Dispatches to a
+*   row- or column-major kernel based on A's layout (mixed layouts rejected by the assert);
+*   fixed launch configuration of 128 blocks x 128 threads. proxy.rhs() is only read by the assert.
+*/
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_acos> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type; // NOTE(review): unused local typedef, kept for consistency with sibling overloads
+
+ if (A.row_major())
+ {
+ matrix_row_element_acos_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_acos_kernel");
+ }
+ else
+ {
+ matrix_col_element_acos_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_acos_kernel");
+ }
+}
+
+
+// asin
+/** @brief Element-wise arc sine: A = asin(B), with B = proxy.lhs(). Dispatches to a
+*   row- or column-major kernel based on A's layout (mixed layouts rejected by the assert);
+*   fixed launch configuration of 128 blocks x 128 threads. proxy.rhs() is only read by the assert.
+*/
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_asin> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type; // NOTE(review): unused local typedef, kept for consistency with sibling overloads
+
+ if (A.row_major())
+ {
+ matrix_row_element_asin_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_asin_kernel");
+ }
+ else
+ {
+ matrix_col_element_asin_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ // Fixed: error check previously reported "matrix_col_element_sin_kernel", which is a
+ // different kernel than the one launched above — failures were attributed to the wrong kernel.
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_asin_kernel");
+ }
+}
+
+
+// atan
+/** @brief Element-wise arc tangent: A = atan(B), with B = proxy.lhs(). Dispatches to a
+*   row- or column-major kernel based on A's layout (mixed layouts rejected by the assert);
+*   fixed launch configuration of 128 blocks x 128 threads. proxy.rhs() is only read by the assert.
+*/
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_atan> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type; // NOTE(review): unused local typedef, kept for consistency with sibling overloads
+
+ if (A.row_major())
+ {
+ matrix_row_element_atan_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_atan_kernel");
+ }
+ else
+ {
+ matrix_col_element_atan_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_atan_kernel");
+ }
+}
+
+
+// ceil
+/** @brief Element-wise ceiling: A = ceil(B), with B = proxy.lhs(). Dispatches to a
+*   row- or column-major kernel based on A's layout (mixed layouts rejected by the assert);
+*   fixed launch configuration of 128 blocks x 128 threads. proxy.rhs() is only read by the assert.
+*/
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_ceil> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type; // NOTE(review): unused local typedef, kept for consistency with sibling overloads
+
+ if (A.row_major())
+ {
+ matrix_row_element_ceil_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_ceil_kernel");
+ }
+ else
+ {
+ matrix_col_element_ceil_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_ceil_kernel");
+ }
+}
+
+
+// cos
+/** @brief Element-wise cosine: A = cos(B), with B = proxy.lhs(). Dispatches to a
+*   row- or column-major kernel based on A's layout (mixed layouts rejected by the assert);
+*   fixed launch configuration of 128 blocks x 128 threads. proxy.rhs() is only read by the assert.
+*/
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_cos> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type; // NOTE(review): unused local typedef, kept for consistency with sibling overloads
+
+ if (A.row_major())
+ {
+ matrix_row_element_cos_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_cos_kernel");
+ }
+ else
+ {
+ matrix_col_element_cos_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_cos_kernel");
+ }
+}
+
+
+// cosh
+/** @brief Element-wise hyperbolic cosine: A = cosh(B), with B = proxy.lhs(). Dispatches to a
+*   row- or column-major kernel based on A's layout (mixed layouts rejected by the assert);
+*   fixed launch configuration of 128 blocks x 128 threads. proxy.rhs() is only read by the assert.
+*/
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_cosh> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type; // NOTE(review): unused local typedef, kept for consistency with sibling overloads
+
+ if (A.row_major())
+ {
+ matrix_row_element_cosh_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_cosh_kernel");
+ }
+ else
+ {
+ matrix_col_element_cosh_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_cosh_kernel");
+ }
+}
+
+
+// exp
+/** @brief Element-wise exponential: A = exp(B), with B = proxy.lhs(). Dispatches to a
+*   row- or column-major kernel based on A's layout (mixed layouts rejected by the assert);
+*   fixed launch configuration of 128 blocks x 128 threads. proxy.rhs() is only read by the assert.
+*/
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_exp> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type; // NOTE(review): unused local typedef, kept for consistency with sibling overloads
+
+ if (A.row_major())
+ {
+ matrix_row_element_exp_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_exp_kernel");
+ }
+ else
+ {
+ matrix_col_element_exp_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_exp_kernel");
+ }
+}
+
+
+// fabs
+/** @brief Element-wise floating-point absolute value: A = fabs(B), with B = proxy.lhs().
+*   Dispatches to a row- or column-major kernel based on A's layout (mixed layouts rejected by
+*   the assert); fixed launch configuration of 128 blocks x 128 threads. proxy.rhs() is only read by the assert.
+*/
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_fabs> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type; // NOTE(review): unused local typedef, kept for consistency with sibling overloads
+
+ if (A.row_major())
+ {
+ matrix_row_element_fabs_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_fabs_kernel");
+ }
+ else
+ {
+ matrix_col_element_fabs_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_fabs_kernel");
+ }
+}
+
+
+// floor
+/** @brief Element-wise floor: A = floor(B), with B = proxy.lhs(). Dispatches to a
+*   row- or column-major kernel based on A's layout (mixed layouts rejected by the assert);
+*   fixed launch configuration of 128 blocks x 128 threads. proxy.rhs() is only read by the assert.
+*/
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_floor> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type; // NOTE(review): unused local typedef, kept for consistency with sibling overloads
+
+ if (A.row_major())
+ {
+ matrix_row_element_floor_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_floor_kernel");
+ }
+ else
+ {
+ matrix_col_element_floor_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_floor_kernel");
+ }
+}
+
+
+// log
+/** @brief Assigns to A the elementwise logarithm of the matrix held by @p proxy (A = log(B)).
+*
+* Both operands must use the same memory layout (row- or column-major); mixed
+* layouts are rejected by the assertion below. Destination and source are each
+* addressed through their own start/stride/internal-size description, so
+* submatrix views are supported.
+*
+* @param A      The destination matrix.
+* @param proxy  Unary expression wrapper holding the source matrix B as lhs().
+*/
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_log> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ // Dispatch on layout; the fixed <<<128, 128>>> launch configuration matches
+ // the convention used by the other element-wise kernels in this file.
+ if (A.row_major())
+ {
+ matrix_row_element_log_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_log_kernel");
+ }
+ else
+ {
+ matrix_col_element_log_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_log_kernel");
+ }
+}
+
+
+// log10
+/** @brief Assigns to A the elementwise base-10 logarithm of the matrix held by @p proxy (A = log10(B)).
+*
+* Both operands must use the same memory layout (row- or column-major); mixed
+* layouts are rejected by the assertion below. Destination and source are each
+* addressed through their own start/stride/internal-size description, so
+* submatrix views are supported.
+*
+* @param A      The destination matrix.
+* @param proxy  Unary expression wrapper holding the source matrix B as lhs().
+*/
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_log10> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ // Dispatch on layout; the fixed <<<128, 128>>> launch configuration matches
+ // the convention used by the other element-wise kernels in this file.
+ if (A.row_major())
+ {
+ matrix_row_element_log10_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_log10_kernel");
+ }
+ else
+ {
+ matrix_col_element_log10_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_log10_kernel");
+ }
+}
+
+
+// sin
+/** @brief Assigns to A the elementwise sine of the matrix held by @p proxy (A = sin(B)).
+*
+* Both operands must use the same memory layout (row- or column-major); mixed
+* layouts are rejected by the assertion below. Destination and source are each
+* addressed through their own start/stride/internal-size description, so
+* submatrix views are supported.
+*
+* @param A      The destination matrix.
+* @param proxy  Unary expression wrapper holding the source matrix B as lhs().
+*/
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_sin> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ // Dispatch on layout; the fixed <<<128, 128>>> launch configuration matches
+ // the convention used by the other element-wise kernels in this file.
+ if (A.row_major())
+ {
+ matrix_row_element_sin_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_sin_kernel");
+ }
+ else
+ {
+ matrix_col_element_sin_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sin_kernel");
+ }
+}
+
+
+// sinh
+/** @brief Assigns to A the elementwise hyperbolic sine of the matrix held by @p proxy (A = sinh(B)).
+*
+* Both operands must use the same memory layout (row- or column-major); mixed
+* layouts are rejected by the assertion below. Destination and source are each
+* addressed through their own start/stride/internal-size description, so
+* submatrix views are supported.
+*
+* @param A      The destination matrix.
+* @param proxy  Unary expression wrapper holding the source matrix B as lhs().
+*/
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_sinh> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ // Dispatch on layout; the fixed <<<128, 128>>> launch configuration matches
+ // the convention used by the other element-wise kernels in this file.
+ if (A.row_major())
+ {
+ matrix_row_element_sinh_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_sinh_kernel");
+ }
+ else
+ {
+ matrix_col_element_sinh_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sinh_kernel");
+ }
+}
+
+
+// sqrt
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_sqrt> > const & proxy)
+{
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type;
+
+ if (A.row_major())
+ {
+ matrix_row_element_sqrt_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+ static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)),
+ static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)),
+ static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)),
+ static_cast<unsigned i
<TRUNCATED>