You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ap...@apache.org on 2016/06/08 21:40:32 UTC

[35/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations.hpp
new file mode 100644
index 0000000..912d24d
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations.hpp
@@ -0,0 +1,2725 @@
+#ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file  viennacl/linalg/cuda/matrix_operations.hpp
+    @brief Implementations of dense matrix related operations, including matrix-vector products, using CUDA.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+
+#include "viennacl/linalg/cuda/common.hpp"
+
+#include "viennacl/linalg/cuda/vector_operations.hpp"
+#include "viennacl/linalg/cuda/matrix_operations_row.hpp"
+#include "viennacl/linalg/cuda/matrix_operations_col.hpp"
+#include "viennacl/linalg/cuda/matrix_operations_prod.hpp"
+#include "viennacl/linalg/cuda/matrix_operations_prod.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+//
+// Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here!
+//
+
+template<typename DestNumericT, typename SrcNumericT>
+void convert(matrix_base<DestNumericT> & mat1, matrix_base<SrcNumericT> const & mat2)
+{
+  assert(mat1.row_major() == mat2.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
+
+  if (mat1.row_major())
+  {
+    convert_row_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+                                    static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+                                    static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+                                    static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+                                    viennacl::cuda_arg(mat2),
+                                    static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+                                    static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
+                                  );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("convert_row_kernel");
+  }
+  else
+  {
+    convert_col_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+                                    static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+                                    static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+                                    static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+                                    viennacl::cuda_arg(mat2),
+                                    static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+                                    static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
+                                  );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("convert_col_kernel");
+  }
+}
+
+template<typename NumericT, typename SizeT, typename DistanceT>
+void trans(matrix_expression<const matrix_base<NumericT, SizeT, DistanceT>,const matrix_base<NumericT, SizeT, DistanceT>, op_trans> const & proxy,
+           matrix_base<NumericT> & temp_trans)
+{
+  trans_kernel<<<128,128>>>(viennacl::cuda_arg(proxy.lhs()),
+                            static_cast<unsigned int>(proxy.lhs().start1()),          static_cast<unsigned int>(proxy.lhs().start2()),
+                            static_cast<unsigned int>(proxy.lhs().internal_size1()),  static_cast<unsigned int>(proxy.lhs().internal_size2()),
+                            static_cast<unsigned int>(proxy.lhs().size1()),           static_cast<unsigned int>(proxy.lhs().size2()),
+                            static_cast<unsigned int>(proxy.lhs().stride1()),         static_cast<unsigned int>(proxy.lhs().stride2()),
+
+                            viennacl::cuda_arg(temp_trans),
+                            static_cast<unsigned int>(temp_trans.start1()),            static_cast<unsigned int>(temp_trans.start2()),
+                            static_cast<unsigned int>(temp_trans.internal_size1()),    static_cast<unsigned int>(temp_trans.internal_size2()),
+                            static_cast<unsigned int>(temp_trans.stride1()),           static_cast<unsigned int>(temp_trans.stride2()),
+                            static_cast<bool>(proxy.lhs().row_major()));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("trans_kernel");
+}
+
+
+template<typename NumericT, typename ScalarT>
+void am(matrix_base<NumericT> & mat1,
+        matrix_base<NumericT> const & mat2, ScalarT const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
+{
+  assert(mat1.row_major() == mat2.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT        value_type;
+
+  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+  value_type temporary_alpha = 0;
+  if (viennacl::is_cpu_scalar<ScalarT>::value)
+    temporary_alpha = alpha;
+
+  if (mat1.row_major())
+  {
+    am_row_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+                                static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+                                static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+                                static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+                                static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+                                viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                                options_alpha,
+                                viennacl::cuda_arg(mat2),
+                                static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+                                static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+                                static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
+                              );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("am_row_kernel");
+  }
+  else
+  {
+    am_col_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+                                static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+                                static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+                                static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+                                static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+                                viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                                options_alpha,
+                                viennacl::cuda_arg(mat2),
+                                static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+                                static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+                                static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
+                              );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("am_col_kernel");
+  }
+}
+
+
+template<typename NumericT, typename ScalarT1, typename ScalarT2>
+void ambm(matrix_base<NumericT> & mat1,
+          matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+          matrix_base<NumericT> const & mat3, ScalarT2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+{
+  assert(mat1.row_major() == mat2.row_major() && mat1.row_major() == mat3.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT        value_type;
+
+  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+  value_type temporary_alpha = 0;
+  if (viennacl::is_cpu_scalar<ScalarT1>::value)
+    temporary_alpha = alpha;
+
+
+  unsigned int options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
+
+  value_type temporary_beta = 0;
+  if (viennacl::is_cpu_scalar<ScalarT2>::value)
+    temporary_beta = beta;
+
+
+  if (mat1.row_major())
+  {
+    ambm_row_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+                                  static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+                                  static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+                                  static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+                                  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+                                  viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                                  options_alpha,
+                                  viennacl::cuda_arg(mat2),
+                                  static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+                                  static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+                                  static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
+
+                                  viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+                                  options_beta,
+                                  viennacl::cuda_arg(mat3),
+                                  static_cast<unsigned int>(viennacl::traits::start1(mat3)),           static_cast<unsigned int>(viennacl::traits::start2(mat3)),
+                                  static_cast<unsigned int>(viennacl::traits::stride1(mat3)),          static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
+                                  static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
+                                );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_row_kernel");
+  }
+  else
+  {
+    ambm_col_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+                                  static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+                                  static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+                                  static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+                                  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+                                  viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                                  options_alpha,
+                                  viennacl::cuda_arg(mat2),
+                                  static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+                                  static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+                                  static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
+
+                                  viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+                                  options_beta,
+                                  viennacl::cuda_arg(mat3),
+                                  static_cast<unsigned int>(viennacl::traits::start1(mat3)),           static_cast<unsigned int>(viennacl::traits::start2(mat3)),
+                                  static_cast<unsigned int>(viennacl::traits::stride1(mat3)),          static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
+                                  static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
+                                );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_col_kernel");
+  }
+
+}
+
+
+template<typename NumericT, typename ScalarT1, typename ScalarT2>
+void ambm_m(matrix_base<NumericT> & mat1,
+            matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+            matrix_base<NumericT> const & mat3, ScalarT2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+{
+  assert(mat1.row_major() == mat2.row_major() && mat1.row_major() == mat3.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT        value_type;
+
+  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+  value_type temporary_alpha = 0;
+  if (viennacl::is_cpu_scalar<ScalarT1>::value)
+    temporary_alpha = alpha;
+
+
+  unsigned int options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
+
+  value_type temporary_beta = 0;
+  if (viennacl::is_cpu_scalar<ScalarT2>::value)
+    temporary_beta = beta;
+
+
+  if (mat1.row_major())
+  {
+    ambm_m_row_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+                                    static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+                                    static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+                                    static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+                                    viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                                    options_alpha,
+                                    viennacl::cuda_arg(mat2),
+                                    static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+                                    static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
+
+                                    viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+                                    options_beta,
+                                    viennacl::cuda_arg(mat3),
+                                    static_cast<unsigned int>(viennacl::traits::start1(mat3)),           static_cast<unsigned int>(viennacl::traits::start2(mat3)),
+                                    static_cast<unsigned int>(viennacl::traits::stride1(mat3)),          static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
+                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
+                                  );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_m_row_kernel");
+  }
+  else
+  {
+    ambm_m_col_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
+                                    static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
+                                    static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
+                                    static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
+                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
+
+                                    viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                                    options_alpha,
+                                    viennacl::cuda_arg(mat2),
+                                    static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
+                                    static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
+                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
+
+                                    viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+                                    options_beta,
+                                    viennacl::cuda_arg(mat3),
+                                    static_cast<unsigned int>(viennacl::traits::start1(mat3)),           static_cast<unsigned int>(viennacl::traits::start2(mat3)),
+                                    static_cast<unsigned int>(viennacl::traits::stride1(mat3)),          static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
+                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
+                                  );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_m_col_kernel");
+  }
+
+}
+
+
+
+
+template<typename NumericT>
+void matrix_assign(matrix_base<NumericT> & mat, NumericT s, bool clear = false)
+{
+  typedef NumericT        value_type;
+  value_type alpha = s;
+
+  unsigned int s1  = clear ? viennacl::traits::internal_size1(mat) : viennacl::traits::size1(mat);
+  unsigned int s2  = clear ? viennacl::traits::internal_size2(mat) : viennacl::traits::size2(mat);
+
+  if (mat.row_major())
+  {
+
+    matrix_row_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
+                                           static_cast<unsigned int>(viennacl::traits::start1(mat)),           static_cast<unsigned int>(viennacl::traits::start2(mat)),
+                                           static_cast<unsigned int>(viennacl::traits::stride1(mat)),          static_cast<unsigned int>(viennacl::traits::stride2(mat)),
+                                           s1,                                                                 s2,
+                                           static_cast<unsigned int>(viennacl::traits::internal_size1(mat)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
+                                           alpha);
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_assign_kernel");
+  }
+  else
+  {
+    matrix_col_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
+                                            static_cast<unsigned int>(viennacl::traits::start1(mat)),           static_cast<unsigned int>(viennacl::traits::start2(mat)),
+                                            static_cast<unsigned int>(viennacl::traits::stride1(mat)),          static_cast<unsigned int>(viennacl::traits::stride2(mat)),
+                                            s1,                                                                 s2,
+                                            static_cast<unsigned int>(viennacl::traits::internal_size1(mat)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
+                                            alpha);
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_assign_kernel");
+  }
+}
+
+template<typename NumericT>
+void matrix_diagonal_assign(matrix_base<NumericT> & mat, NumericT s)
+{
+  typedef NumericT        value_type;
+  value_type alpha = s;
+
+  if (mat.row_major())
+  {
+    matrix_row_diagonal_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
+                                                    static_cast<unsigned int>(viennacl::traits::start1(mat)),           static_cast<unsigned int>(viennacl::traits::start2(mat)),
+                                                    static_cast<unsigned int>(viennacl::traits::stride1(mat)),          static_cast<unsigned int>(viennacl::traits::stride2(mat)),
+                                                    static_cast<unsigned int>(viennacl::traits::size1(mat)),            static_cast<unsigned int>(viennacl::traits::size2(mat)),
+                                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
+                                                    alpha);
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_diagonal_assign_kernel");
+  }
+  else
+  {
+    matrix_col_diagonal_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
+                                                    static_cast<unsigned int>(viennacl::traits::start1(mat)),           static_cast<unsigned int>(viennacl::traits::start2(mat)),
+                                                    static_cast<unsigned int>(viennacl::traits::stride1(mat)),          static_cast<unsigned int>(viennacl::traits::stride2(mat)),
+                                                    static_cast<unsigned int>(viennacl::traits::size1(mat)),            static_cast<unsigned int>(viennacl::traits::size2(mat)),
+                                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
+                                                    alpha);
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_diagonal_assign_kernel");
+  }
+}
+
+
+template<typename NumericT>
+void matrix_diag_from_vector(const vector_base<NumericT> & vec, int k, matrix_base<NumericT> & mat)
+{
+  typedef NumericT        value_type;
+
+  // Step 1: assign zero matrix:
+  matrix_assign(mat, NumericT(0));
+
+  // Step 2: Assign diagonal:
+  unsigned int options_alpha = 0;
+
+  vcl_size_t mat_start = 0;
+  vcl_size_t mat_stride = 0;
+  vcl_size_t mat_size = viennacl::traits::size(vec);
+  if (mat.row_major())
+  {
+    vcl_size_t first_row_index = 0;
+    vcl_size_t first_col_index = 0;
+    if (k < 0)
+      first_row_index = vcl_size_t(-k);
+    else
+      first_col_index = vcl_size_t(k);
+    mat_start  =  (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat)
+                 + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat);
+    mat_stride = viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat);
+  }
+  else
+  {
+    vcl_size_t first_row_index = 0;
+    vcl_size_t first_col_index = 0;
+    if (k < 0)
+      first_row_index = vcl_size_t(-k);
+    else
+      first_col_index = vcl_size_t(k);
+    mat_start  =    viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)
+                 + (viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat);
+    mat_stride = viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat);
+  }
+
+  av_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
+                          static_cast<unsigned int>(mat_start),
+                          static_cast<unsigned int>(mat_stride),
+                          static_cast<unsigned int>(mat_size),
+
+                          viennacl::cuda_arg<value_type>(NumericT(1)),
+                          options_alpha,
+                          viennacl::cuda_arg(vec),
+                          static_cast<unsigned int>(viennacl::traits::start(vec)),
+                          static_cast<unsigned int>(viennacl::traits::stride(vec)) );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
+}
+
+template<typename NumericT>
+void matrix_diag_to_vector(matrix_base<NumericT> const & mat, int k, vector_base<NumericT> & vec)
+{
+  typedef NumericT        value_type;
+
+  unsigned int options_alpha = 0;
+
+  vcl_size_t mat_start = 0;
+  vcl_size_t mat_stride = 0;
+  if (mat.row_major())
+  {
+    vcl_size_t first_row_index = 0;
+    vcl_size_t first_col_index = 0;
+    if (k < 0)
+      first_row_index = vcl_size_t(-k);
+    else
+      first_col_index = vcl_size_t(k);
+    mat_start  =  (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat)
+                 + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat);
+    mat_stride = viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat);
+  }
+  else
+  {
+    vcl_size_t first_row_index = 0;
+    vcl_size_t first_col_index = 0;
+    if (k < 0)
+      first_row_index = vcl_size_t(-k);
+    else
+      first_col_index = vcl_size_t(k);
+    mat_start  =    viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)
+                 + (viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat);
+    mat_stride = viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat);
+  }
+
+  av_kernel<<<128, 128>>>(viennacl::cuda_arg(vec),
+                          static_cast<unsigned int>(viennacl::traits::start(vec)),
+                          static_cast<unsigned int>(viennacl::traits::stride(vec)),
+                          static_cast<unsigned int>(viennacl::traits::size(vec)),
+
+                          viennacl::cuda_arg<value_type>(NumericT(1)),
+                          options_alpha,
+                          viennacl::cuda_arg(mat),
+                          static_cast<unsigned int>(mat_start),
+                          static_cast<unsigned int>(mat_stride));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
+}
+
+template<typename NumericT>
+void matrix_row(matrix_base<NumericT> const & mat, unsigned int i, vector_base<NumericT> & vec)
+{
+  typedef NumericT        value_type;
+
+  unsigned int options_alpha = 0;
+
+  vcl_size_t mat_start = 0;
+  vcl_size_t mat_stride = 0;
+  if (mat.row_major())
+  {
+    mat_start  = (viennacl::traits::start1(mat) + i * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat) + viennacl::traits::start2(mat);
+    mat_stride = viennacl::traits::stride2(mat);
+  }
+  else
+  {
+    mat_start  = viennacl::traits::start1(mat) + i * viennacl::traits::stride1(mat) + viennacl::traits::start2(mat) * viennacl::traits::internal_size1(mat);
+    mat_stride = viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat);
+  }
+
+  av_kernel<<<128, 128>>>(viennacl::cuda_arg(vec),
+                          static_cast<unsigned int>(viennacl::traits::start(vec)),
+                          static_cast<unsigned int>(viennacl::traits::stride(vec)),
+                          static_cast<unsigned int>(viennacl::traits::size(vec)),
+
+                          viennacl::cuda_arg<value_type>(NumericT(1)),
+                          options_alpha,
+                          viennacl::cuda_arg(mat),
+                          static_cast<unsigned int>(mat_start),
+                          static_cast<unsigned int>(mat_stride));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
+}
+
+template<typename NumericT>
+void matrix_column(const matrix_base<NumericT> & mat, unsigned int j, vector_base<NumericT> & vec)
+{
+  typedef NumericT        value_type;
+
+  unsigned int options_alpha = 0;
+
+  vcl_size_t mat_start = 0;
+  vcl_size_t mat_stride = 0;
+  if (mat.row_major())
+  {
+    mat_start  = viennacl::traits::start1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::start2(mat) + j * viennacl::traits::stride2(mat);
+    mat_stride = viennacl::traits::stride2(mat) * viennacl::traits::internal_size2(mat);
+  }
+  else
+  {
+    mat_start  = viennacl::traits::start1(mat) + (viennacl::traits::start2(mat) + j * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat);
+    mat_stride = viennacl::traits::stride2(mat);
+  }
+
+  av_kernel<<<128, 128>>>(viennacl::cuda_arg(vec),
+                          static_cast<unsigned int>(viennacl::traits::start(vec)),
+                          static_cast<unsigned int>(viennacl::traits::stride(vec)),
+                          static_cast<unsigned int>(viennacl::traits::size(vec)),
+
+                          viennacl::cuda_arg<value_type>(NumericT(1)),
+                          options_alpha,
+                          viennacl::cuda_arg(mat),
+                          static_cast<unsigned int>(mat_start),
+                          static_cast<unsigned int>(mat_stride));
+  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
+}
+
+
+//
+/////////////////////////   binary element-wise operations    /////////////////////////////////
+//
+
+
+template<typename NumericT, typename SizeT, typename OpT>
+void element_op(matrix_base<NumericT, SizeT> & A,
+                matrix_expression<const matrix_base<NumericT, SizeT>, const matrix_base<NumericT, SizeT>, op_element_binary<OpT> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT      value_type;
+
+  unsigned int op_type = 2; //0: product, 1: division, 2: power
+  if (viennacl::is_division<OpT>::value)
+    op_type = 1;
+  else if (viennacl::is_product<OpT>::value)
+    op_type = 0;
+
+  if (A.row_major())
+  {
+    element_op_int_row_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+                                        static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
+
+                                        viennacl::cuda_arg(proxy.rhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
+
+                                        op_type
+                                      );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel");
+  }
+  else
+  {
+    element_op_int_col_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+                                        static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
+
+                                        viennacl::cuda_arg(proxy.rhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
+
+                                        op_type
+                                      );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel");
+  }
+}
+
+template<typename SizeT, typename OpT>
+void element_op(matrix_base<float, SizeT> & A,
+                matrix_expression<const matrix_base<float, SizeT>, const matrix_base<float, SizeT>, op_element_binary<OpT> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef float        value_type;
+
+  unsigned int op_type = 2; //0: product, 1: division, 2: power
+  if (viennacl::is_division<OpT>::value)
+    op_type = 1;
+  else if (viennacl::is_product<OpT>::value)
+    op_type = 0;
+
+  if (A.row_major())
+  {
+    element_op_row_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+                                        static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
+
+                                        viennacl::cuda_arg(proxy.rhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
+
+                                        op_type
+                                      );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel");
+  }
+  else
+  {
+    element_op_col_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+                                        static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
+
+                                        viennacl::cuda_arg(proxy.rhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
+
+                                        op_type
+                                      );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel");
+  }
+}
+
+template<typename SizeT, typename OpT>
+void element_op(matrix_base<double, SizeT> & A,
+                matrix_expression<const matrix_base<double, SizeT>, const matrix_base<double, SizeT>, op_element_binary<OpT> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef double        value_type;
+
+  unsigned int op_type = 2; //0: product, 1: division, 2: power
+  if (viennacl::is_division<OpT>::value)
+    op_type = 1;
+  else if (viennacl::is_product<OpT>::value)
+    op_type = 0;
+
+  if (A.row_major())
+  {
+    element_op_row_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+                                        static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
+
+                                        viennacl::cuda_arg(proxy.rhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
+
+                                        op_type
+                                      );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel");
+  }
+  else
+  {
+    element_op_col_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+                                        static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
+
+                                        viennacl::cuda_arg(proxy.rhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
+                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
+
+                                        op_type
+                                      );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel");
+  }
+}
+
+//
+/////////////////////////   unary element-wise operations    /////////////////////////////////
+//
+
+// Note: Due to CUDA vs C-proprocessor interference (concatenation seems to be broken in at least CUDA 4.2),
+//       we could not find a more 'automatic' way of generating the overloads below...
+
+// abs
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_abs> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT value_type;
+
+  if (A.row_major())
+  {
+    matrix_row_element_abs_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+      viennacl::cuda_arg(proxy.lhs()),
+      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_abs_kernel");
+  }
+  else
+  {
+    matrix_col_element_abs_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+      viennacl::cuda_arg(proxy.lhs()),
+      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_abs_kernel");
+  }
+}
+
+
+// acos
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_acos> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT    value_type;
+
+  if (A.row_major())
+  {
+    matrix_row_element_acos_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+     viennacl::cuda_arg(proxy.lhs()),
+     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_acos_kernel");
+  }
+  else
+  {
+    matrix_col_element_acos_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+     viennacl::cuda_arg(proxy.lhs()),
+     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_acos_kernel");
+  }
+}
+
+
+// asin
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_asin> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT    value_type;
+
+  if (A.row_major())
+  {
+    matrix_row_element_asin_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+     viennacl::cuda_arg(proxy.lhs()),
+     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_asin_kernel");
+  }
+  else
+  {
+    matrix_col_element_asin_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+     viennacl::cuda_arg(proxy.lhs()),
+     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sin_kernel");
+  }
+}
+
+
+// atan
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_atan> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT   value_type;
+
+  if (A.row_major())
+  {
+    matrix_row_element_atan_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+     viennacl::cuda_arg(proxy.lhs()),
+     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_atan_kernel");
+  }
+  else
+  {
+    matrix_col_element_atan_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+     viennacl::cuda_arg(proxy.lhs()),
+     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_atan_kernel");
+  }
+}
+
+
+// ceil
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_ceil> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT   value_type;
+
+  if (A.row_major())
+  {
+    matrix_row_element_ceil_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+     viennacl::cuda_arg(proxy.lhs()),
+     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_ceil_kernel");
+  }
+  else
+  {
+    matrix_col_element_ceil_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+     viennacl::cuda_arg(proxy.lhs()),
+     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_ceil_kernel");
+  }
+}
+
+
+// cos
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_cos> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT   value_type;
+
+  if (A.row_major())
+  {
+    matrix_row_element_cos_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+      viennacl::cuda_arg(proxy.lhs()),
+      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_cos_kernel");
+  }
+  else
+  {
+    matrix_col_element_cos_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+      viennacl::cuda_arg(proxy.lhs()),
+      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_cos_kernel");
+  }
+}
+
+
+// cosh
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_cosh> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT  value_type;
+
+  if (A.row_major())
+  {
+    matrix_row_element_cosh_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+     viennacl::cuda_arg(proxy.lhs()),
+     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_cosh_kernel");
+  }
+  else
+  {
+    matrix_col_element_cosh_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+     viennacl::cuda_arg(proxy.lhs()),
+     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_cosh_kernel");
+  }
+}
+
+
+// exp
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_exp> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT  value_type;
+
+  if (A.row_major())
+  {
+    matrix_row_element_exp_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+      viennacl::cuda_arg(proxy.lhs()),
+      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_exp_kernel");
+  }
+  else
+  {
+    matrix_col_element_exp_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+      viennacl::cuda_arg(proxy.lhs()),
+      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_exp_kernel");
+  }
+}
+
+
+// fabs
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_fabs> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT   value_type;
+
+  if (A.row_major())
+  {
+    matrix_row_element_fabs_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+     viennacl::cuda_arg(proxy.lhs()),
+     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_fabs_kernel");
+  }
+  else
+  {
+    matrix_col_element_fabs_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+     viennacl::cuda_arg(proxy.lhs()),
+     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_fabs_kernel");
+  }
+}
+
+
+// floor
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_floor> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT    value_type;
+
+  if (A.row_major())
+  {
+    matrix_row_element_floor_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+      viennacl::cuda_arg(proxy.lhs()),
+      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_floor_kernel");
+  }
+  else
+  {
+    matrix_col_element_floor_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+      viennacl::cuda_arg(proxy.lhs()),
+      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_floor_kernel");
+  }
+}
+
+
+// log
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_log> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT  value_type;
+
+  if (A.row_major())
+  {
+    matrix_row_element_log_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+      viennacl::cuda_arg(proxy.lhs()),
+      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_log_kernel");
+  }
+  else
+  {
+    matrix_col_element_log_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+      viennacl::cuda_arg(proxy.lhs()),
+      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_log_kernel");
+  }
+}
+
+
+// log10
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_log10> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT   value_type;
+
+  if (A.row_major())
+  {
+    matrix_row_element_log10_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+      viennacl::cuda_arg(proxy.lhs()),
+      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_log10_kernel");
+  }
+  else
+  {
+    matrix_col_element_log10_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+      viennacl::cuda_arg(proxy.lhs()),
+      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_log10_kernel");
+  }
+}
+
+
+// sin
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_sin> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT  value_type;
+
+  if (A.row_major())
+  {
+    matrix_row_element_sin_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+      viennacl::cuda_arg(proxy.lhs()),
+      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_sin_kernel");
+  }
+  else
+  {
+    matrix_col_element_sin_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+      static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+      static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+      static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+      viennacl::cuda_arg(proxy.lhs()),
+      static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+      static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sin_kernel");
+  }
+}
+
+
+// sinh
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_sinh> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT   value_type;
+
+  if (A.row_major())
+  {
+    matrix_row_element_sinh_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+     viennacl::cuda_arg(proxy.lhs()),
+     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_sinh_kernel");
+  }
+  else
+  {
+    matrix_col_element_sinh_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
+
+     viennacl::cuda_arg(proxy.lhs()),
+     static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
+     static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
+    );
+    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sinh_kernel");
+  }
+}
+
+
+// sqrt
+template<typename NumericT>
+void element_op(matrix_base<NumericT> & A,
+                matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_sqrt> > const & proxy)
+{
+  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT   value_type;
+
+  if (A.row_major())
+  {
+    matrix_row_element_sqrt_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
+     static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
+     static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
+     static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
+     static_cast<unsigned i

<TRUNCATED>