You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ap...@apache.org on 2016/06/08 21:40:17 UTC
[20/51] [partial] mahout git commit: (nojira) add native-viennaCL
module to codebase. closes apache/mahout#241
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/matrix_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/matrix_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/matrix_operations.hpp
new file mode 100644
index 0000000..f23223f
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/matrix_operations.hpp
@@ -0,0 +1,2052 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_HOST_BASED_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/matrix_operations.hpp
+ @brief Implementations of dense matrix related operations, including matrix-vector products, using a plain single-threaded or OpenMP-enabled execution on CPU.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+#include "viennacl/linalg/detail/op_applier.hpp"
+#include "viennacl/linalg/host_based/common.hpp"
+#include "viennacl/linalg/prod.hpp"
+
+// Minimum Matrix size(size1*size2) for using OpenMP on matrix operations:
+#ifndef VIENNACL_OPENMP_MATRIX_MIN_SIZE
+ #define VIENNACL_OPENMP_MATRIX_MIN_SIZE 5000
+#endif
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+
+//
+// Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here!
+//
+
+/** @brief Copies the entries of mat2 into mat1, converting each entry from SrcNumericT to DestNumericT via static_cast.
+*
+* Both matrices must share the same storage layout. The outer loop runs over the major
+* dimension (rows for row-major, columns for column-major) so memory is traversed
+* contiguously, and is OpenMP-parallelized for matrices larger than VIENNACL_OPENMP_MATRIX_MIN_SIZE entries.
+*/
+template<typename DestNumericT, typename SrcNumericT>
+void convert(matrix_base<DestNumericT> & mat1, matrix_base<SrcNumericT> const & mat2)
+{
+  // NOTE(review): the assertion message is reused from the add/sub kernels; the check itself guards against mixed layouts.
+  assert(mat1.row_major() == mat2.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
+
+  DestNumericT * data_A = detail::extract_raw_pointer<DestNumericT>(mat1);
+  SrcNumericT const * data_B = detail::extract_raw_pointer<SrcNumericT>(mat2);
+
+  // Sub-matrix view parameters of the destination: offsets, strides, logical and padded (internal) sizes.
+  vcl_size_t A_start1 = viennacl::traits::start1(mat1);
+  vcl_size_t A_start2 = viennacl::traits::start2(mat1);
+  vcl_size_t A_inc1 = viennacl::traits::stride1(mat1);
+  vcl_size_t A_inc2 = viennacl::traits::stride2(mat1);
+  vcl_size_t A_size1 = viennacl::traits::size1(mat1);
+  vcl_size_t A_size2 = viennacl::traits::size2(mat1);
+  vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat1);
+  vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat1);
+
+  // Sub-matrix view parameters of the source (sizes are taken from the destination; dimensions were checked by the dispatcher).
+  vcl_size_t B_start1 = viennacl::traits::start1(mat2);
+  vcl_size_t B_start2 = viennacl::traits::start2(mat2);
+  vcl_size_t B_inc1 = viennacl::traits::stride1(mat2);
+  vcl_size_t B_inc2 = viennacl::traits::stride2(mat2);
+  vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(mat2);
+  vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(mat2);
+
+  if (mat1.row_major())
+  {
+    detail::matrix_array_wrapper<DestNumericT, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::matrix_array_wrapper<SrcNumericT const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+    // Loop index is 'long' (not vcl_size_t) because OpenMP requires a signed loop variable here.
+    for (long row = 0; row < static_cast<long>(A_size1); ++row)
+      for (vcl_size_t col = 0; col < A_size2; ++col)
+        wrapper_A(row, col) = static_cast<DestNumericT>(wrapper_B(row, col));
+  }
+  else
+  {
+    detail::matrix_array_wrapper<DestNumericT, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::matrix_array_wrapper<SrcNumericT const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+    // Column-major: iterate columns in the outer loop for contiguous memory access.
+    for (long col = 0; col < static_cast<long>(A_size2); ++col)
+      for (vcl_size_t row = 0; row < A_size1; ++row)
+        wrapper_A(row, col) = static_cast<DestNumericT>(wrapper_B(row, col));
+  }
+}
+
+
+
+/** @brief Writes the transpose of the matrix in proxy.lhs() into temp_trans (out-of-place transposition).
+*
+* The matrix is processed in 64x64 tiles for better cache behavior: the tiled main part is
+* OpenMP-parallelized, then two sequential remainder passes handle the columns on the right
+* (width A_size2 % 64, full height) and the rows at the bottom (height A_size1 % 64, the
+* width not already covered by the right-side pass), so no element is written twice.
+*
+* @param proxy       Matrix expression wrapping the matrix to be transposed (op_trans)
+* @param temp_trans  Destination matrix receiving the transpose
+*/
+template<typename NumericT,
+         typename SizeT, typename DistanceT>
+void trans(const matrix_expression<const matrix_base<NumericT, SizeT, DistanceT>,
+           const matrix_base<NumericT, SizeT, DistanceT>, op_trans> & proxy, matrix_base<NumericT> & temp_trans)
+{
+  typedef NumericT value_type;
+  const value_type * data_A = detail::extract_raw_pointer<value_type>(proxy.lhs());
+  value_type * data_B = detail::extract_raw_pointer<value_type>(temp_trans);
+
+  // Source (A) sub-matrix view parameters: offsets, padded sizes, strides, logical sizes.
+  vcl_size_t A_start1 = viennacl::traits::start1(proxy.lhs());
+  vcl_size_t A_start2 = viennacl::traits::start2(proxy.lhs());
+  vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(proxy.lhs());
+  vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(proxy.lhs());
+  vcl_size_t A_inc1 = viennacl::traits::stride1(proxy.lhs());
+  vcl_size_t A_inc2 = viennacl::traits::stride2(proxy.lhs());
+  vcl_size_t A_size1 = viennacl::traits::size1(proxy.lhs());
+  vcl_size_t A_size2 = viennacl::traits::size2(proxy.lhs());
+
+  // Destination (B) sub-matrix view parameters.
+  vcl_size_t B_start1 = viennacl::traits::start1(temp_trans);
+  vcl_size_t B_start2 = viennacl::traits::start2(temp_trans);
+  vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(temp_trans);
+  vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(temp_trans);
+  vcl_size_t B_inc1 = viennacl::traits::stride1(temp_trans);
+  vcl_size_t B_inc2 = viennacl::traits::stride2(temp_trans);
+
+  const vcl_size_t sub_mat_size = 64; //The matrix will be divided into sub-matrices for better storage access.
+
+  // Number of full 64x64 tiles in each direction ...
+  vcl_size_t row_count = A_size1 / sub_mat_size;
+  vcl_size_t col_count = A_size2 / sub_mat_size;
+
+  // ... and the leftover rows/columns not covered by full tiles.
+  vcl_size_t row_count_remainder = A_size1 % sub_mat_size;
+  vcl_size_t col_count_remainder = A_size2 % sub_mat_size;
+
+  if (proxy.lhs().row_major())
+  {
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+    for(long i = 0; i < static_cast<long>(row_count*col_count); ++i)//This is the main part of the transposition
+    {
+      // Flattened tile index i -> (tile row, tile column); each iteration transposes one 64x64 tile independently.
+      vcl_size_t row = vcl_size_t(i) / col_count;
+      vcl_size_t col = vcl_size_t(i) % col_count;
+
+      // wrapper_A views the source tile at (row, col); wrapper_B views the destination tile at the mirrored position (col, row).
+      detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1 + A_inc1 * (row * sub_mat_size)
+                                                                                 , A_start2 + A_inc2 * (col * sub_mat_size), A_inc1
+                                                                                 , A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type , row_major, false> wrapper_B(data_B, B_start1 + B_inc1 * (col * sub_mat_size)
+                                                                            , B_start2 + B_inc2 * (row * sub_mat_size), B_inc1
+                                                                            , B_inc2, B_internal_size1, B_internal_size2);
+      for(vcl_size_t j = 0; j < (sub_mat_size); ++j)
+        for(vcl_size_t k = 0; k < (sub_mat_size); ++k)
+          wrapper_B(j, k) = wrapper_A(k, j);
+    }
+    { //This is the transposition of the remainder on the right side of the matrix
+      // Covers the rightmost (A_size2 % 64) columns over the FULL height A_size1, including the bottom-right corner.
+      detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1
+                                                                                 , A_start2 + A_inc2 * (col_count * sub_mat_size), A_inc1
+                                                                                 , A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type , row_major, false> wrapper_B(data_B, B_start1 + B_inc1 * (col_count * sub_mat_size)
+                                                                            , B_start2, B_inc1
+                                                                            , B_inc2, B_internal_size1, B_internal_size2);
+      for(vcl_size_t j = 0; j < col_count_remainder; ++j)
+        for(vcl_size_t k = 0 ; k < A_size1; ++k)
+          wrapper_B(j, k) = wrapper_A(k, j);
+    }
+    { //This is the transposition of the remainder on the bottom side of the matrix
+      // Covers the bottom (A_size1 % 64) rows; width is limited to (A_size2 - col_count_remainder)
+      // because the bottom-right corner was already handled by the right-side pass above.
+      detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1 + A_inc1 * (row_count * sub_mat_size)
+                                                                                 , A_start2, A_inc1
+                                                                                 , A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type , row_major, false> wrapper_B(data_B,B_start1
+                                                                            , B_start2 + B_inc2 * (row_count * sub_mat_size), B_inc1
+                                                                            , B_inc2, B_internal_size1, B_internal_size2);
+      for(vcl_size_t j = 0; j < row_count_remainder; ++j)
+        for(vcl_size_t k = 0; k < (A_size2 - col_count_remainder); ++k)
+          wrapper_B(k, j) = wrapper_A(j, k);
+    }
+  }
+  else
+  {
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+    for(long i = 0; i < static_cast<long>(row_count*col_count); ++i)//This is the main part of the transposition
+    {
+      // Same tiling as the row-major branch, with column-major wrappers.
+      vcl_size_t row = vcl_size_t(i) / col_count;
+      vcl_size_t col = vcl_size_t(i) % col_count;
+
+      detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1 + A_inc1 * (row * sub_mat_size)
+                                                                                    , A_start2 + A_inc2 * (col * sub_mat_size), A_inc1
+                                                                                    , A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type , column_major, false> wrapper_B(data_B, B_start1 + B_inc1 * (col * sub_mat_size)
+                                                                               , B_start2 + B_inc2 * (row * sub_mat_size), B_inc1
+                                                                               , B_inc2, B_internal_size1, B_internal_size2);
+      for(vcl_size_t j = 0; j < (sub_mat_size); ++j)
+        for(vcl_size_t k = 0; k < (sub_mat_size); ++k)
+          wrapper_B(k, j)=wrapper_A(j, k);
+    }
+    { //This is the transposition of the remainder on the right side of the matrix
+      detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1
+                                                                                    , A_start2 + A_inc2 * (col_count * sub_mat_size), A_inc1
+                                                                                    , A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type , column_major, false> wrapper_B(data_B,B_start1 + B_inc1 * (col_count * sub_mat_size)
+                                                                               , B_start2, B_inc1
+                                                                               , B_inc2, B_internal_size1, B_internal_size2);
+      for(vcl_size_t j = 0; j < col_count_remainder; ++j)
+        for(vcl_size_t k = 0; k < A_size1; ++k)
+          wrapper_B(j, k)=wrapper_A(k, j);
+    }
+    { //This is the transposition of the remainder on the bottom side of the matrix
+      detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1 + A_inc1 * (row_count * sub_mat_size)
+                                                                                    , A_start2, A_inc1
+                                                                                    , A_inc2, A_internal_size1, A_internal_size2);
+      detail::matrix_array_wrapper<value_type , column_major, false> wrapper_B(data_B, B_start1
+                                                                               , B_start2 + B_inc2 * (row_count * sub_mat_size), B_inc1
+                                                                               , B_inc2, B_internal_size1, B_internal_size2);
+      for(vcl_size_t j = 0; j < row_count_remainder; ++j)
+        for(vcl_size_t k = 0; k < (A_size2 - col_count_remainder); ++k)
+          wrapper_B(k, j)=wrapper_A(j, k);
+    }
+  }
+}
+
+/** @brief Computes mat1 = alpha * mat2 (scaled matrix assignment).
+*
+* The scalar is pre-processed on the host: it is negated if flip_sign_alpha is set, and
+* applied as a divisor instead of a factor if reciprocal_alpha is set. The reciprocal/
+* multiply decision is hoisted out of the loops so each loop body stays branch-free.
+*
+* @param mat1             Destination matrix
+* @param mat2             Source matrix
+* @param alpha            Scaling coefficient
+* @param reciprocal_alpha If true, divide by alpha instead of multiplying
+* @param flip_sign_alpha  If true, use -alpha
+*/
+template<typename NumericT, typename ScalarT1>
+void am(matrix_base<NumericT> & mat1,
+        matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha)
+{
+  assert(mat1.row_major() == mat2.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT value_type;
+
+  value_type * data_A = detail::extract_raw_pointer<value_type>(mat1);
+  value_type const * data_B = detail::extract_raw_pointer<value_type>(mat2);
+
+  // Materialize the (possibly device-resident) scalar as a host value and fold in the sign flip.
+  value_type data_alpha = alpha;
+  if (flip_sign_alpha)
+    data_alpha = -data_alpha;
+
+  vcl_size_t A_start1 = viennacl::traits::start1(mat1);
+  vcl_size_t A_start2 = viennacl::traits::start2(mat1);
+  vcl_size_t A_inc1 = viennacl::traits::stride1(mat1);
+  vcl_size_t A_inc2 = viennacl::traits::stride2(mat1);
+  vcl_size_t A_size1 = viennacl::traits::size1(mat1);
+  vcl_size_t A_size2 = viennacl::traits::size2(mat1);
+  vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat1);
+  vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat1);
+
+  vcl_size_t B_start1 = viennacl::traits::start1(mat2);
+  vcl_size_t B_start2 = viennacl::traits::start2(mat2);
+  vcl_size_t B_inc1 = viennacl::traits::stride1(mat2);
+  vcl_size_t B_inc2 = viennacl::traits::stride2(mat2);
+  vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(mat2);
+  vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(mat2);
+
+  if (mat1.row_major())
+  {
+    detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+    if (reciprocal_alpha)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long row = 0; row < static_cast<long>(A_size1); ++row)
+        for (vcl_size_t col = 0; col < A_size2; ++col)
+          wrapper_A(row, col) = wrapper_B(row, col) / data_alpha;
+    }
+    else
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long row = 0; row < static_cast<long>(A_size1); ++row)
+        for (vcl_size_t col = 0; col < A_size2; ++col)
+          wrapper_A(row, col) = wrapper_B(row, col) * data_alpha;
+    }
+  }
+  else
+  {
+    detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+    if (reciprocal_alpha)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long col = 0; col < static_cast<long>(A_size2); ++col)
+        for (vcl_size_t row = 0; row < A_size1; ++row)
+          wrapper_A(row, col) = wrapper_B(row, col) / data_alpha;
+    }
+    else
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long col = 0; col < static_cast<long>(A_size2); ++col)
+        for (vcl_size_t row = 0; row < A_size1; ++row)
+          wrapper_A(row, col) = wrapper_B(row, col) * data_alpha;
+    }
+  }
+}
+
+
+/** @brief Computes mat1 = alpha * mat2 + beta * mat3 (scaled matrix-matrix addition, overwriting mat1).
+*
+* Each scalar may independently be negated (flip_sign_*) and applied as a divisor
+* (reciprocal_*). The four reciprocal combinations are dispatched once, outside the
+* loops, so the OpenMP-parallelized loop bodies remain branch-free.
+*/
+template<typename NumericT,
+         typename ScalarT1, typename ScalarT2>
+void ambm(matrix_base<NumericT> & mat1,
+          matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha,
+          matrix_base<NumericT> const & mat3, ScalarT2 const & beta, vcl_size_t /*len_beta*/, bool reciprocal_beta, bool flip_sign_beta)
+{
+  assert(mat1.row_major() == mat2.row_major() && mat1.row_major() == mat3.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT value_type;
+
+  value_type * data_A = detail::extract_raw_pointer<value_type>(mat1);
+  value_type const * data_B = detail::extract_raw_pointer<value_type>(mat2);
+  value_type const * data_C = detail::extract_raw_pointer<value_type>(mat3);
+
+  // Fold sign flips into host-side scalar copies up front.
+  value_type data_alpha = alpha;
+  if (flip_sign_alpha)
+    data_alpha = -data_alpha;
+
+  value_type data_beta = beta;
+  if (flip_sign_beta)
+    data_beta = -data_beta;
+
+  vcl_size_t A_start1 = viennacl::traits::start1(mat1);
+  vcl_size_t A_start2 = viennacl::traits::start2(mat1);
+  vcl_size_t A_inc1 = viennacl::traits::stride1(mat1);
+  vcl_size_t A_inc2 = viennacl::traits::stride2(mat1);
+  vcl_size_t A_size1 = viennacl::traits::size1(mat1);
+  vcl_size_t A_size2 = viennacl::traits::size2(mat1);
+  vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat1);
+  vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat1);
+
+  vcl_size_t B_start1 = viennacl::traits::start1(mat2);
+  vcl_size_t B_start2 = viennacl::traits::start2(mat2);
+  vcl_size_t B_inc1 = viennacl::traits::stride1(mat2);
+  vcl_size_t B_inc2 = viennacl::traits::stride2(mat2);
+  vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(mat2);
+  vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(mat2);
+
+  vcl_size_t C_start1 = viennacl::traits::start1(mat3);
+  vcl_size_t C_start2 = viennacl::traits::start2(mat3);
+  vcl_size_t C_inc1 = viennacl::traits::stride1(mat3);
+  vcl_size_t C_inc2 = viennacl::traits::stride2(mat3);
+  vcl_size_t C_internal_size1 = viennacl::traits::internal_size1(mat3);
+  vcl_size_t C_internal_size2 = viennacl::traits::internal_size2(mat3);
+
+  if (mat1.row_major())
+  {
+    detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+    detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+    // One specialized loop nest per (reciprocal_alpha, reciprocal_beta) combination.
+    if (reciprocal_alpha && reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long row = 0; row < static_cast<long>(A_size1); ++row)
+        for (vcl_size_t col = 0; col < A_size2; ++col)
+          wrapper_A(row, col) = wrapper_B(row, col) / data_alpha + wrapper_C(row, col) / data_beta;
+    }
+    else if (reciprocal_alpha && !reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long row = 0; row < static_cast<long>(A_size1); ++row)
+        for (vcl_size_t col = 0; col < A_size2; ++col)
+          wrapper_A(row, col) = wrapper_B(row, col) / data_alpha + wrapper_C(row, col) * data_beta;
+    }
+    else if (!reciprocal_alpha && reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long row = 0; row < static_cast<long>(A_size1); ++row)
+        for (vcl_size_t col = 0; col < A_size2; ++col)
+          wrapper_A(row, col) = wrapper_B(row, col) * data_alpha + wrapper_C(row, col) / data_beta;
+    }
+    else if (!reciprocal_alpha && !reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long row = 0; row < static_cast<long>(A_size1); ++row)
+        for (vcl_size_t col = 0; col < A_size2; ++col)
+          wrapper_A(row, col) = wrapper_B(row, col) * data_alpha + wrapper_C(row, col) * data_beta;
+    }
+  }
+  else
+  {
+    detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+    detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+    if (reciprocal_alpha && reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long col = 0; col < static_cast<long>(A_size2); ++col)
+        for (vcl_size_t row = 0; row < A_size1; ++row)
+          wrapper_A(row, col) = wrapper_B(row, col) / data_alpha + wrapper_C(row, col) / data_beta;
+    }
+    else if (reciprocal_alpha && !reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long col = 0; col < static_cast<long>(A_size2); ++col)
+        for (vcl_size_t row = 0; row < A_size1; ++row)
+          wrapper_A(row, col) = wrapper_B(row, col) / data_alpha + wrapper_C(row, col) * data_beta;
+    }
+    else if (!reciprocal_alpha && reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long col = 0; col < static_cast<long>(A_size2); ++col)
+        for (vcl_size_t row = 0; row < A_size1; ++row)
+          wrapper_A(row, col) = wrapper_B(row, col) * data_alpha + wrapper_C(row, col) / data_beta;
+    }
+    else if (!reciprocal_alpha && !reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long col = 0; col < static_cast<long>(A_size2); ++col)
+        for (vcl_size_t row = 0; row < A_size1; ++row)
+          wrapper_A(row, col) = wrapper_B(row, col) * data_alpha + wrapper_C(row, col) * data_beta;
+    }
+  }
+
+}
+
+
+/** @brief Computes mat1 += alpha * mat2 + beta * mat3 (scaled matrix-matrix addition, accumulating into mat1).
+*
+* Identical structure to ambm() except that results are ADDED to mat1 (operator +=)
+* rather than assigned. Each scalar may be negated (flip_sign_*) and applied as a
+* divisor (reciprocal_*); the four reciprocal combinations are dispatched outside the loops.
+*/
+template<typename NumericT,
+         typename ScalarT1, typename ScalarT2>
+void ambm_m(matrix_base<NumericT> & mat1,
+            matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha,
+            matrix_base<NumericT> const & mat3, ScalarT2 const & beta, vcl_size_t /*len_beta*/, bool reciprocal_beta, bool flip_sign_beta)
+{
+  assert(mat1.row_major() == mat2.row_major() && mat1.row_major() == mat3.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));
+
+  typedef NumericT value_type;
+
+  value_type * data_A = detail::extract_raw_pointer<value_type>(mat1);
+  value_type const * data_B = detail::extract_raw_pointer<value_type>(mat2);
+  value_type const * data_C = detail::extract_raw_pointer<value_type>(mat3);
+
+  // Fold sign flips into host-side scalar copies up front.
+  value_type data_alpha = alpha;
+  if (flip_sign_alpha)
+    data_alpha = -data_alpha;
+
+  value_type data_beta = beta;
+  if (flip_sign_beta)
+    data_beta = -data_beta;
+
+  vcl_size_t A_start1 = viennacl::traits::start1(mat1);
+  vcl_size_t A_start2 = viennacl::traits::start2(mat1);
+  vcl_size_t A_inc1 = viennacl::traits::stride1(mat1);
+  vcl_size_t A_inc2 = viennacl::traits::stride2(mat1);
+  vcl_size_t A_size1 = viennacl::traits::size1(mat1);
+  vcl_size_t A_size2 = viennacl::traits::size2(mat1);
+  vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat1);
+  vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat1);
+
+  vcl_size_t B_start1 = viennacl::traits::start1(mat2);
+  vcl_size_t B_start2 = viennacl::traits::start2(mat2);
+  vcl_size_t B_inc1 = viennacl::traits::stride1(mat2);
+  vcl_size_t B_inc2 = viennacl::traits::stride2(mat2);
+  vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(mat2);
+  vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(mat2);
+
+  vcl_size_t C_start1 = viennacl::traits::start1(mat3);
+  vcl_size_t C_start2 = viennacl::traits::start2(mat3);
+  vcl_size_t C_inc1 = viennacl::traits::stride1(mat3);
+  vcl_size_t C_inc2 = viennacl::traits::stride2(mat3);
+  vcl_size_t C_internal_size1 = viennacl::traits::internal_size1(mat3);
+  vcl_size_t C_internal_size2 = viennacl::traits::internal_size2(mat3);
+
+  if (mat1.row_major())
+  {
+    detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+    detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+    if (reciprocal_alpha && reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long row = 0; row < static_cast<long>(A_size1); ++row)
+        for (vcl_size_t col = 0; col < A_size2; ++col)
+          wrapper_A(row, col) += wrapper_B(row, col) / data_alpha + wrapper_C(row, col) / data_beta;
+    }
+    else if (reciprocal_alpha && !reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long row = 0; row < static_cast<long>(A_size1); ++row)
+        for (vcl_size_t col = 0; col < A_size2; ++col)
+          wrapper_A(row, col) += wrapper_B(row, col) / data_alpha + wrapper_C(row, col) * data_beta;
+    }
+    else if (!reciprocal_alpha && reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long row = 0; row < static_cast<long>(A_size1); ++row)
+        for (vcl_size_t col = 0; col < A_size2; ++col)
+          wrapper_A(row, col) += wrapper_B(row, col) * data_alpha + wrapper_C(row, col) / data_beta;
+    }
+    else if (!reciprocal_alpha && !reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long row = 0; row < static_cast<long>(A_size1); ++row)
+        for (vcl_size_t col = 0; col < A_size2; ++col)
+          wrapper_A(row, col) += wrapper_B(row, col) * data_alpha + wrapper_C(row, col) * data_beta;
+    }
+  }
+  else
+  {
+    detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+    detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+    detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+    if (reciprocal_alpha && reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long col = 0; col < static_cast<long>(A_size2); ++col)
+        for (vcl_size_t row = 0; row < A_size1; ++row)
+          wrapper_A(row, col) += wrapper_B(row, col) / data_alpha + wrapper_C(row, col) / data_beta;
+    }
+    else if (reciprocal_alpha && !reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long col = 0; col < static_cast<long>(A_size2); ++col)
+        for (vcl_size_t row = 0; row < A_size1; ++row)
+          wrapper_A(row, col) += wrapper_B(row, col) / data_alpha + wrapper_C(row, col) * data_beta;
+    }
+    else if (!reciprocal_alpha && reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long col = 0; col < static_cast<long>(A_size2); ++col)
+        for (vcl_size_t row = 0; row < A_size1; ++row)
+          wrapper_A(row, col) += wrapper_B(row, col) * data_alpha + wrapper_C(row, col) / data_beta;
+    }
+    else if (!reciprocal_alpha && !reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+      for (long col = 0; col < static_cast<long>(A_size2); ++col)
+        for (vcl_size_t row = 0; row < A_size1; ++row)
+          wrapper_A(row, col) += wrapper_B(row, col) * data_alpha + wrapper_C(row, col) * data_beta;
+    }
+  }
+
+}
+
+
+
+
+/** @brief Assigns the scalar value s to every entry of mat.
+*
+* @param mat   Matrix to fill
+* @param s     Value written to each entry
+* @param clear If true, the PADDED buffer (internal_size1 x internal_size2) is filled
+*              instead of just the logical size1 x size2 region, so padding entries
+*              are overwritten as well (used for clearing a matrix).
+*/
+template<typename NumericT>
+void matrix_assign(matrix_base<NumericT> & mat, NumericT s, bool clear = false)
+{
+  typedef NumericT value_type;
+
+  value_type * data_A = detail::extract_raw_pointer<value_type>(mat);
+  value_type alpha = static_cast<value_type>(s);
+
+  vcl_size_t A_start1 = viennacl::traits::start1(mat);
+  vcl_size_t A_start2 = viennacl::traits::start2(mat);
+  vcl_size_t A_inc1 = viennacl::traits::stride1(mat);
+  vcl_size_t A_inc2 = viennacl::traits::stride2(mat);
+  // With clear == true the loop bounds extend to the internal (padded) sizes.
+  vcl_size_t A_size1 = clear ? viennacl::traits::internal_size1(mat) : viennacl::traits::size1(mat);
+  vcl_size_t A_size2 = clear ? viennacl::traits::internal_size2(mat) : viennacl::traits::size2(mat);
+  vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat);
+  vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat);
+
+  if (mat.row_major())
+  {
+    detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+    for (long row = 0; row < static_cast<long>(A_size1); ++row)
+      for (vcl_size_t col = 0; col < A_size2; ++col)
+        wrapper_A(static_cast<vcl_size_t>(row), col) = alpha;
+        //data_A[index_generator_A::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)]
+        // = data_B[index_generator_B::mem_index(row * B_inc1 + B_start1, col * B_inc2 + B_start2, B_internal_size1, B_internal_size2)] * alpha;
+  }
+  else
+  {
+    detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+    for (long col = 0; col < static_cast<long>(A_size2); ++col)
+      for (vcl_size_t row = 0; row < A_size1; ++row)
+        wrapper_A(row, static_cast<vcl_size_t>(col)) = alpha;
+        //data_A[index_generator_A::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)]
+        // = data_B[index_generator_B::mem_index(row * B_inc1 + B_start1, col * B_inc2 + B_start2, B_internal_size1, B_internal_size2)] * alpha;
+  }
+}
+
+
+
+/** @brief Assigns the scalar value s to the main-diagonal entries mat(i,i), 0 <= i < size1(mat).
+*
+* Off-diagonal entries are left untouched. Both layout branches perform the same
+* single loop over the diagonal; only the wrapper's index-to-memory mapping differs.
+*
+* @param mat Matrix whose diagonal is overwritten
+* @param s   Value written to each diagonal entry
+*/
+template<typename NumericT>
+void matrix_diagonal_assign(matrix_base<NumericT> & mat, NumericT s)
+{
+  typedef NumericT value_type;
+
+  value_type * data_A = detail::extract_raw_pointer<value_type>(mat);
+  value_type alpha = static_cast<value_type>(s);
+
+  vcl_size_t A_start1 = viennacl::traits::start1(mat);
+  vcl_size_t A_start2 = viennacl::traits::start2(mat);
+  vcl_size_t A_inc1 = viennacl::traits::stride1(mat);
+  vcl_size_t A_inc2 = viennacl::traits::stride2(mat);
+  vcl_size_t A_size1 = viennacl::traits::size1(mat);
+  //vcl_size_t A_size2 = viennacl::traits::size2(mat);
+  vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat);
+  vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat);
+
+  if (mat.row_major())
+  {
+    detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if ((A_size1*A_size1) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+    for (long row = 0; row < static_cast<long>(A_size1); ++row)
+      wrapper_A(row, row) = alpha;
+  }
+  else
+  {
+    detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if ((A_size1*A_size1) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+    for (long row = 0; row < static_cast<long>(A_size1); ++row)
+      wrapper_A(row, row) = alpha;
+  }
+}
+
+/** @brief Builds a matrix whose k-th diagonal holds the entries of vec; all other entries are set to zero.
+*
+* @param vec Vector providing the diagonal entries
+* @param k   Diagonal offset: k >= 0 selects the k-th superdiagonal (starting at column k),
+*            k < 0 selects the (-k)-th subdiagonal (starting at row -k)
+* @param mat Destination matrix; zeroed via matrix_assign() before the diagonal is written
+*/
+template<typename NumericT>
+void matrix_diag_from_vector(const vector_base<NumericT> & vec, int k, matrix_base<NumericT> & mat)
+{
+  typedef NumericT value_type;
+
+  value_type *data_A = detail::extract_raw_pointer<value_type>(mat);
+  value_type const *data_vec = detail::extract_raw_pointer<value_type>(vec);
+
+  vcl_size_t A_start1 = viennacl::traits::start1(mat);
+  vcl_size_t A_start2 = viennacl::traits::start2(mat);
+  vcl_size_t A_inc1 = viennacl::traits::stride1(mat);
+  vcl_size_t A_inc2 = viennacl::traits::stride2(mat);
+  //vcl_size_t A_size1 = viennacl::traits::size1(mat);
+  //vcl_size_t A_size2 = viennacl::traits::size2(mat);
+  vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat);
+  vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat);
+
+  vcl_size_t v_start = viennacl::traits::start(vec);
+  vcl_size_t v_inc = viennacl::traits::stride(vec);
+  vcl_size_t v_size = viennacl::traits::size(vec);
+
+  // The k-th diagonal starts at (row_start, col_start): (0, k) for k >= 0, (-k, 0) for k < 0.
+  vcl_size_t row_start = 0;
+  vcl_size_t col_start = 0;
+
+  if (k >= 0)
+    col_start = static_cast<vcl_size_t>(k);
+  else
+    row_start = static_cast<vcl_size_t>(-k);
+
+  // Zero the full matrix first; the selected diagonal is then overwritten below.
+  matrix_assign(mat, NumericT(0));
+
+  if (mat.row_major())
+  {
+    detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+    for (vcl_size_t i = 0; i < v_size; ++i)
+      wrapper_A(row_start + i, col_start + i) = data_vec[v_start + i * v_inc];
+  }
+  else
+  {
+    detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+
+    for (vcl_size_t i = 0; i < v_size; ++i)
+      wrapper_A(row_start + i, col_start + i) = data_vec[v_start + i * v_inc];
+  }
+}
+
+template<typename NumericT>
+void matrix_diag_to_vector(const matrix_base<NumericT> & mat, int k, vector_base<NumericT> & vec)
+{
+  typedef NumericT value_type;
+
+  // Copies the k-th diagonal of 'mat' into 'vec' (k > 0: above the main
+  // diagonal, k < 0: below it).
+
+  value_type const * data_mat = detail::extract_raw_pointer<value_type>(mat);
+  value_type       * data_v   = detail::extract_raw_pointer<value_type>(vec);
+
+  // memory layout of the source matrix:
+  vcl_size_t mat_start1         = viennacl::traits::start1(mat);
+  vcl_size_t mat_start2         = viennacl::traits::start2(mat);
+  vcl_size_t mat_inc1           = viennacl::traits::stride1(mat);
+  vcl_size_t mat_inc2           = viennacl::traits::stride2(mat);
+  vcl_size_t mat_internal_size1 = viennacl::traits::internal_size1(mat);
+  vcl_size_t mat_internal_size2 = viennacl::traits::internal_size2(mat);
+
+  // memory layout of the destination vector:
+  vcl_size_t v_start = viennacl::traits::start(vec);
+  vcl_size_t v_inc   = viennacl::traits::stride(vec);
+  vcl_size_t v_size  = viennacl::traits::size(vec);
+
+  // A non-negative k selects a diagonal shifted right, a negative k one shifted down:
+  vcl_size_t row_offset = (k < 0) ? static_cast<vcl_size_t>(-k) : 0;
+  vcl_size_t col_offset = (k < 0) ? 0 : static_cast<vcl_size_t>(k);
+
+  if (mat.row_major())
+  {
+    detail::matrix_array_wrapper<value_type const, row_major, false> acc(data_mat, mat_start1, mat_start2, mat_inc1, mat_inc2, mat_internal_size1, mat_internal_size2);
+
+    for (vcl_size_t idx = 0; idx < v_size; ++idx)
+      data_v[v_start + idx * v_inc] = acc(row_offset + idx, col_offset + idx);
+  }
+  else
+  {
+    detail::matrix_array_wrapper<value_type const, column_major, false> acc(data_mat, mat_start1, mat_start2, mat_inc1, mat_inc2, mat_internal_size1, mat_internal_size2);
+
+    for (vcl_size_t idx = 0; idx < v_size; ++idx)
+      data_v[v_start + idx * v_inc] = acc(row_offset + idx, col_offset + idx);
+  }
+}
+
+template<typename NumericT>
+void matrix_row(const matrix_base<NumericT> & mat, unsigned int i, vector_base<NumericT> & vec)
+{
+  typedef NumericT value_type;
+
+  // Copies row 'i' of 'mat' into 'vec'.
+
+  value_type const * data_mat = detail::extract_raw_pointer<value_type>(mat);
+  value_type       * data_v   = detail::extract_raw_pointer<value_type>(vec);
+
+  // memory layout of the source matrix:
+  vcl_size_t mat_start1         = viennacl::traits::start1(mat);
+  vcl_size_t mat_start2         = viennacl::traits::start2(mat);
+  vcl_size_t mat_inc1           = viennacl::traits::stride1(mat);
+  vcl_size_t mat_inc2           = viennacl::traits::stride2(mat);
+  vcl_size_t mat_internal_size1 = viennacl::traits::internal_size1(mat);
+  vcl_size_t mat_internal_size2 = viennacl::traits::internal_size2(mat);
+
+  // memory layout of the destination vector:
+  vcl_size_t v_start = viennacl::traits::start(vec);
+  vcl_size_t v_inc   = viennacl::traits::stride(vec);
+  vcl_size_t v_size  = viennacl::traits::size(vec);
+
+  vcl_size_t the_row = static_cast<vcl_size_t>(i);
+
+  if (mat.row_major())
+  {
+    detail::matrix_array_wrapper<value_type const, row_major, false> acc(data_mat, mat_start1, mat_start2, mat_inc1, mat_inc2, mat_internal_size1, mat_internal_size2);
+
+    for (vcl_size_t col = 0; col < v_size; ++col)
+      data_v[v_start + col * v_inc] = acc(the_row, col);
+  }
+  else
+  {
+    detail::matrix_array_wrapper<value_type const, column_major, false> acc(data_mat, mat_start1, mat_start2, mat_inc1, mat_inc2, mat_internal_size1, mat_internal_size2);
+
+    for (vcl_size_t col = 0; col < v_size; ++col)
+      data_v[v_start + col * v_inc] = acc(the_row, col);
+  }
+}
+
+template<typename NumericT>
+void matrix_column(const matrix_base<NumericT> & mat, unsigned int j, vector_base<NumericT> & vec)
+{
+  typedef NumericT value_type;
+
+  // Copies column 'j' of 'mat' into 'vec'.
+
+  value_type const * data_mat = detail::extract_raw_pointer<value_type>(mat);
+  value_type       * data_v   = detail::extract_raw_pointer<value_type>(vec);
+
+  // memory layout of the source matrix:
+  vcl_size_t mat_start1         = viennacl::traits::start1(mat);
+  vcl_size_t mat_start2         = viennacl::traits::start2(mat);
+  vcl_size_t mat_inc1           = viennacl::traits::stride1(mat);
+  vcl_size_t mat_inc2           = viennacl::traits::stride2(mat);
+  vcl_size_t mat_internal_size1 = viennacl::traits::internal_size1(mat);
+  vcl_size_t mat_internal_size2 = viennacl::traits::internal_size2(mat);
+
+  // memory layout of the destination vector:
+  vcl_size_t v_start = viennacl::traits::start(vec);
+  vcl_size_t v_inc   = viennacl::traits::stride(vec);
+  vcl_size_t v_size  = viennacl::traits::size(vec);
+
+  vcl_size_t the_col = static_cast<vcl_size_t>(j);
+
+  if (mat.row_major())
+  {
+    detail::matrix_array_wrapper<value_type const, row_major, false> acc(data_mat, mat_start1, mat_start2, mat_inc1, mat_inc2, mat_internal_size1, mat_internal_size2);
+
+    for (vcl_size_t row = 0; row < v_size; ++row)
+      data_v[v_start + row * v_inc] = acc(row, the_col);
+  }
+  else
+  {
+    detail::matrix_array_wrapper<value_type const, column_major, false> acc(data_mat, mat_start1, mat_start2, mat_inc1, mat_inc2, mat_internal_size1, mat_internal_size2);
+
+    for (vcl_size_t row = 0; row < v_size; ++row)
+      data_v[v_start + row * v_inc] = acc(row, the_col);
+  }
+}
+
+//
+///////////////////////// Element-wise operation //////////////////////////////////
+//
+
+// Binary operations A = B .* C and A = B ./ C
+
+/** @brief Implementation of the element-wise operations A = B .* C and A = B ./ C (using MATLAB syntax)
+*
+* @param A The result matrix (or -range, or -slice)
+* @param proxy The proxy object holding B, C, and the operation
+*/
+template<typename NumericT, typename OpT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_binary<OpT> > const & proxy)
+{
+ // All three operands must share the same storage layout (all row-major or all column-major):
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type;
+ // OpFunctor::apply(a, b, c) performs a = b OP c for the concrete binary operation OpT:
+ typedef viennacl::linalg::detail::op_applier<op_element_binary<OpT> > OpFunctor;
+
+ // Raw host pointers: A is the destination, B and C the two operands of the proxy.
+ value_type * data_A = detail::extract_raw_pointer<value_type>(A);
+ value_type const * data_B = detail::extract_raw_pointer<value_type>(proxy.lhs());
+ value_type const * data_C = detail::extract_raw_pointer<value_type>(proxy.rhs());
+
+ // Offsets, strides, active sizes, and padded ("internal") sizes of the destination:
+ vcl_size_t A_start1 = viennacl::traits::start1(A);
+ vcl_size_t A_start2 = viennacl::traits::start2(A);
+ vcl_size_t A_inc1 = viennacl::traits::stride1(A);
+ vcl_size_t A_inc2 = viennacl::traits::stride2(A);
+ vcl_size_t A_size1 = viennacl::traits::size1(A);
+ vcl_size_t A_size2 = viennacl::traits::size2(A);
+ vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(A);
+ vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(A);
+
+ // Layout description of operand B:
+ vcl_size_t B_start1 = viennacl::traits::start1(proxy.lhs());
+ vcl_size_t B_start2 = viennacl::traits::start2(proxy.lhs());
+ vcl_size_t B_inc1 = viennacl::traits::stride1(proxy.lhs());
+ vcl_size_t B_inc2 = viennacl::traits::stride2(proxy.lhs());
+ vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(proxy.lhs());
+ vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(proxy.lhs());
+
+ // Layout description of operand C:
+ vcl_size_t C_start1 = viennacl::traits::start1(proxy.rhs());
+ vcl_size_t C_start2 = viennacl::traits::start2(proxy.rhs());
+ vcl_size_t C_inc1 = viennacl::traits::stride1(proxy.rhs());
+ vcl_size_t C_inc2 = viennacl::traits::stride2(proxy.rhs());
+ vcl_size_t C_internal_size1 = viennacl::traits::internal_size1(proxy.rhs());
+ vcl_size_t C_internal_size2 = viennacl::traits::internal_size2(proxy.rhs());
+
+ if (A.row_major())
+ {
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ // Row-major storage: parallelize over rows so the inner column loop walks contiguous memory.
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A_size1); ++row)
+ for (vcl_size_t col = 0; col < A_size2; ++col)
+ OpFunctor::apply(wrapper_A(row, col), wrapper_B(row, col), wrapper_C(row, col));
+ //data_A[index_generator_A::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)]
+ // = data_B[index_generator_B::mem_index(row * B_inc1 + B_start1, col * B_inc2 + B_start2, B_internal_size1, B_internal_size2)] * alpha
+ // + data_C[index_generator_C::mem_index(row * C_inc1 + C_start1, col * C_inc2 + C_start2, C_internal_size1, C_internal_size2)] * beta;
+ }
+ else
+ {
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ // Column-major storage: parallelize over columns instead, for the same locality reason.
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long col = 0; col < static_cast<long>(A_size2); ++col)
+ for (vcl_size_t row = 0; row < A_size1; ++row)
+ OpFunctor::apply(wrapper_A(row, col), wrapper_B(row, col), wrapper_C(row, col));
+
+ //data_A[index_generator_A::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)]
+ // = data_B[index_generator_B::mem_index(row * B_inc1 + B_start1, col * B_inc2 + B_start2, B_internal_size1, B_internal_size2)] * alpha
+ // + data_C[index_generator_C::mem_index(row * C_inc1 + C_start1, col * C_inc2 + C_start2, C_internal_size1, C_internal_size2)] * beta;
+ }
+}
+
+// Unary operations
+
+// A = op(B)
+/** @brief Implementation of the element-wise operation A = op(B) for unary operations (e.g. A = element_exp(B))
+*
+* @param A The result matrix (or -range, or -slice)
+* @param proxy The proxy object holding B and the unary operation
+*/
+template<typename NumericT, typename OpT>
+void element_op(matrix_base<NumericT> & A,
+ matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<OpT> > const & proxy)
+{
+ // Layouts must agree; note that only proxy.lhs() is read below — for unary expressions
+ // lhs and rhs presumably refer to the same operand (TODO confirm against matrix_expression).
+ assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));
+
+ typedef NumericT value_type;
+ // OpFunctor::apply(a, b) performs a = OP(b) for the concrete unary operation OpT:
+ typedef viennacl::linalg::detail::op_applier<op_element_unary<OpT> > OpFunctor;
+
+ // Raw host pointers: A is the destination, B the single operand.
+ value_type * data_A = detail::extract_raw_pointer<value_type>(A);
+ value_type const * data_B = detail::extract_raw_pointer<value_type>(proxy.lhs());
+
+ // Offsets, strides, active sizes, and padded ("internal") sizes of the destination:
+ vcl_size_t A_start1 = viennacl::traits::start1(A);
+ vcl_size_t A_start2 = viennacl::traits::start2(A);
+ vcl_size_t A_inc1 = viennacl::traits::stride1(A);
+ vcl_size_t A_inc2 = viennacl::traits::stride2(A);
+ vcl_size_t A_size1 = viennacl::traits::size1(A);
+ vcl_size_t A_size2 = viennacl::traits::size2(A);
+ vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(A);
+ vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(A);
+
+ // Layout description of operand B:
+ vcl_size_t B_start1 = viennacl::traits::start1(proxy.lhs());
+ vcl_size_t B_start2 = viennacl::traits::start2(proxy.lhs());
+ vcl_size_t B_inc1 = viennacl::traits::stride1(proxy.lhs());
+ vcl_size_t B_inc2 = viennacl::traits::stride2(proxy.lhs());
+ vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(proxy.lhs());
+ vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(proxy.lhs());
+
+ if (A.row_major())
+ {
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+ // Row-major storage: parallelize over rows; the inner column loop walks contiguous memory.
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A_size1); ++row)
+ for (vcl_size_t col = 0; col < A_size2; ++col)
+ OpFunctor::apply(wrapper_A(row, col), wrapper_B(row, col));
+ }
+ else
+ {
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+
+ // Column-major storage: parallelize over columns instead, for the same locality reason.
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long col = 0; col < static_cast<long>(A_size2); ++col)
+ for (vcl_size_t row = 0; row < A_size1; ++row)
+ OpFunctor::apply(wrapper_A(row, col), wrapper_B(row, col));
+ }
+}
+
+
+
+//
+///////////////////////// matrix-vector products /////////////////////////////////
+//
+
+// A * x
+
+/** @brief Carries out matrix-vector multiplication
+*
+* Implementation of the convenience expression result = prod(mat, vec);
+*
+* @param mat The matrix
+* @param trans Flag whether mat is to be transposed
+* @param vec The vector
+* @param result The result vector
+*/
+template<typename NumericT>
+void prod_impl(const matrix_base<NumericT> & mat, bool trans,
+ const vector_base<NumericT> & vec,
+ vector_base<NumericT> & result)
+{
+ typedef NumericT value_type;
+
+ // Raw host pointers for the matrix, the input vector, and the output vector:
+ value_type const * data_A = detail::extract_raw_pointer<value_type>(mat);
+ value_type const * data_x = detail::extract_raw_pointer<value_type>(vec);
+ value_type * data_result = detail::extract_raw_pointer<value_type>(result);
+
+ // Offsets, strides, active sizes, and padded ("internal") sizes of the matrix:
+ vcl_size_t A_start1 = viennacl::traits::start1(mat);
+ vcl_size_t A_start2 = viennacl::traits::start2(mat);
+ vcl_size_t A_inc1 = viennacl::traits::stride1(mat);
+ vcl_size_t A_inc2 = viennacl::traits::stride2(mat);
+ vcl_size_t A_size1 = viennacl::traits::size1(mat);
+ vcl_size_t A_size2 = viennacl::traits::size2(mat);
+ vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat);
+ vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat);
+
+ // start1/inc1 describe the input vector, start2/inc2 the result vector:
+ vcl_size_t start1 = viennacl::traits::start(vec);
+ vcl_size_t inc1 = viennacl::traits::stride(vec);
+
+ vcl_size_t start2 = viennacl::traits::start(result);
+ vcl_size_t inc2 = viennacl::traits::stride(result);
+
+ // Four cases below. In two of them (row-major & transposed, column-major &
+ // non-transposed) a plain per-output-element loop would stride badly through
+ // memory, so the matrix is instead streamed sequentially and partial results
+ // are scattered into per-thread buffers that are reduced at the end.
+ if (mat.row_major())
+ {
+ if (trans)
+ {
+ vcl_size_t thread_count = 1;
+#ifdef VIENNACL_WITH_OPENMP
+ if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+ thread_count = omp_get_max_threads();
+#endif
+ // One partial-result slice of length A_size2 per thread (zero-initialized):
+ std::vector<value_type> temp_array(A_size2*thread_count, 0);
+ detail::vector_array_wrapper<value_type> wrapper_res(data_result, start2, inc2);
+
+ // result = A^T * x overwrites the result vector, so clear it first:
+ for (vcl_size_t col = 0; col < A_size2; ++col)
+ wrapper_res(col) = 0;
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ {
+ vcl_size_t id = 0;
+#ifdef VIENNACL_WITH_OPENMP
+ if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+ id = omp_get_thread_num();
+ #endif
+ // Each thread processes a contiguous slab of rows [begin, end):
+ vcl_size_t begin = (A_size1 * id) / thread_count;
+ vcl_size_t end = (A_size1 * (id + 1)) / thread_count;
+
+ // Wrappers are pre-shifted to this thread's first row / vector entry:
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_mat(data_A, A_start1 + A_inc1 * begin, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::vector_array_wrapper<value_type const> wrapper_vec(data_x, start1 + inc1 * begin, inc1);
+
+ for (vcl_size_t row = 0; row < (end - begin); ++row) //run through matrix sequentially
+ {
+ value_type temp = wrapper_vec(row);
+ for (vcl_size_t col = 0; col < A_size2; ++col)
+ temp_array[A_size2 * id + col] += wrapper_mat(row , col) * temp;
+ }
+ }
+ // Serial reduction of the per-thread partial results into the output vector:
+ for (vcl_size_t id = 0; id < thread_count; ++id)
+ for (vcl_size_t col = 0; col < A_size2; ++col)
+ wrapper_res(col) += temp_array[A_size2 * id + col];
+ }
+
+ else
+ {
+ // Non-transposed row-major: each output entry is an independent dot product
+ // of one contiguous matrix row with x, so rows parallelize directly.
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A_size1); ++row)
+ {
+ value_type temp = 0;
+ for (vcl_size_t col = 0; col < A_size2; ++col)
+ temp += data_A[viennacl::row_major::mem_index(static_cast<vcl_size_t>(row) * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] * data_x[col * inc1 + start1];
+
+ data_result[static_cast<vcl_size_t>(row) * inc2 + start2] = temp;
+ }
+ }
+ }
+ else
+ {
+ if (!trans)
+ {
+ // Non-transposed column-major: mirror of the row-major transposed case,
+ // streaming columns and accumulating per-thread partial results of length A_size1.
+ vcl_size_t thread_count = 1;
+#ifdef VIENNACL_WITH_OPENMP
+ if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+ thread_count = omp_get_max_threads();
+#endif
+ std::vector<value_type> temp_array(A_size1*thread_count, 0);
+ detail::vector_array_wrapper<value_type> wrapper_res(data_result, start2, inc2);
+
+ for (vcl_size_t row = 0; row < A_size1; ++row)
+ wrapper_res(row) = 0;
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ {
+ vcl_size_t id = 0;
+#ifdef VIENNACL_WITH_OPENMP
+ if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+ id = omp_get_thread_num();
+ #endif
+ // Each thread processes a contiguous slab of columns [begin, end):
+ vcl_size_t begin = (A_size2 * id) / thread_count;
+ vcl_size_t end = (A_size2 * (id + 1)) / thread_count;
+
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_mat(data_A, A_start1, A_start2 + A_inc2 * begin, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::vector_array_wrapper<value_type const> wrapper_vec(data_x, start1 + inc1 * begin, inc1);
+
+ for (vcl_size_t col = 0; col < (end - begin); ++col) //run through matrix sequentially
+ {
+ value_type temp = wrapper_vec(col);
+ for (vcl_size_t row = 0; row < A_size1; ++row)
+ temp_array[A_size1 * id + row] += wrapper_mat(row , col) * temp;
+ }
+ }
+ // Serial reduction of the per-thread partial results into the output vector:
+ for (vcl_size_t id = 0; id < thread_count; ++id)
+ for (vcl_size_t row = 0; row < A_size1; ++row)
+ wrapper_res(row) += temp_array[A_size1 * id + row];
+ }
+ else
+ {
+ // Transposed column-major: each output entry is a dot product of one
+ // contiguous matrix column with x, so columns parallelize directly.
+ // (Note: 'row' here indexes the OUTPUT, i.e. a column of A.)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A_size2); ++row)
+ {
+ value_type temp = 0;
+ for (vcl_size_t col = 0; col < A_size1; ++col)
+ temp += data_A[viennacl::column_major::mem_index(col * A_inc1 + A_start1, static_cast<vcl_size_t>(row) * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] * data_x[col * inc1 + start1];
+
+ data_result[static_cast<vcl_size_t>(row) * inc2 + start2] = temp;
+ }
+ }
+ }
+}
+
+
+
+//
+///////////////////////// matrix-matrix products /////////////////////////////////
+//
+
+namespace detail
+{
+ /** @brief Blocked GEMM kernel computing C = beta*C + alpha*A*B on 64x64 tiles.
+ *
+ * A, B, C are accessor functors (matrix_array_wrapper instances); any transposition
+ * or storage layout is absorbed by the accessors, so this kernel always indexes
+ * logically as A(i,k), B(k,j), C(i,j).
+ */
+ template<typename MatrixAccT1, typename MatrixAccT2, typename MatrixAccT3, typename NumericT>
+ void prod(MatrixAccT1 & A, MatrixAccT2 & B, MatrixAccT3 & C,
+ vcl_size_t C_size1, vcl_size_t C_size2, vcl_size_t A_size2,
+ NumericT alpha, NumericT beta)
+ {
+ // Nothing to do for empty operands:
+ if (C_size1 == 0 || C_size2 == 0 || A_size2 == 0)
+ return;
+
+ static const vcl_size_t blocksize = 64;
+
+ // Number of tiles per dimension, rounded up:
+ vcl_size_t num_blocks_C1 = (C_size1 - 1) / blocksize + 1;
+ vcl_size_t num_blocks_C2 = (C_size2 - 1) / blocksize + 1;
+ vcl_size_t num_blocks_A2 = (A_size2 - 1) / blocksize + 1;
+
+ //
+ // outer loop pair: Run over all blocks with indices (block_idx_i, block_idx_j) of the result matrix C:
+ //
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((C_size1*C_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long block_idx_i2=0; block_idx_i2<static_cast<long>(num_blocks_C1); ++block_idx_i2)
+ {
+ // thread-local auxiliary buffers (allocated inside the parallel loop on purpose)
+ std::vector<NumericT> buffer_A(blocksize * blocksize); // row-major
+ std::vector<NumericT> buffer_B(blocksize * blocksize); // column-major
+ std::vector<NumericT> buffer_C(blocksize * blocksize); // row-major
+
+ vcl_size_t block_idx_i = static_cast<vcl_size_t>(block_idx_i2);
+ for (vcl_size_t block_idx_j=0; block_idx_j<num_blocks_C2; ++block_idx_j)
+ {
+ // Reset block matrix:
+ std::fill(buffer_C.begin(), buffer_C.end(), NumericT(0));
+
+ vcl_size_t offset_i = block_idx_i*blocksize;
+ vcl_size_t offset_j = block_idx_j*blocksize;
+
+ // C(block_idx_i, block_idx_j) += A(block_idx_i, block_idx_k) * B(block_idx_k, block_idx_j)
+ for (vcl_size_t block_idx_k=0; block_idx_k<num_blocks_A2; ++block_idx_k)
+ {
+ // flush buffers (zero-padding makes the full blocksize multiply below
+ // correct even for partial edge tiles):
+ std::fill(buffer_A.begin(), buffer_A.end(), NumericT(0));
+ std::fill(buffer_B.begin(), buffer_B.end(), NumericT(0));
+
+ vcl_size_t offset_k = block_idx_k*blocksize;
+
+ // load current data (A tile row-major, B tile column-major, so the
+ // inner product below reads both buffers contiguously):
+ for (vcl_size_t i = offset_i; i < std::min(offset_i + blocksize, C_size1); ++i)
+ for (vcl_size_t k = offset_k; k < std::min(offset_k + blocksize, A_size2); ++k)
+ buffer_A[(i - offset_i) * blocksize + (k - offset_k)] = A(i, k);
+
+ for (vcl_size_t j = offset_j; j < std::min(offset_j + blocksize, C_size2); ++j)
+ for (vcl_size_t k = offset_k; k < std::min(offset_k + blocksize, A_size2); ++k)
+ buffer_B[(k - offset_k) + (j - offset_j) * blocksize] = B(k, j);
+
+ // multiply (this is the hot spot in terms of flops)
+ for (vcl_size_t i = 0; i < blocksize; ++i)
+ {
+ NumericT const * ptrA = &(buffer_A[i*blocksize]);
+ for (vcl_size_t j = 0; j < blocksize; ++j)
+ {
+ NumericT const * ptrB = &(buffer_B[j*blocksize]);
+
+ NumericT temp = NumericT(0);
+ for (vcl_size_t k = 0; k < blocksize; ++k)
+ temp += ptrA[k] * ptrB[k]; // buffer_A[i*blocksize + k] * buffer_B[k + j*blocksize];
+
+ buffer_C[i*blocksize + j] += temp;
+ }
+ }
+ }
+
+ // write result: the (beta != 0) test is spelled out as two inequalities to
+ // avoid a direct floating-point equality comparison; for beta == 0 the old
+ // contents of C are ignored entirely (so uninitialized C is safe).
+ if (beta > 0 || beta < 0)
+ {
+ for (vcl_size_t i = offset_i; i < std::min(offset_i + blocksize, C_size1); ++i)
+ for (vcl_size_t j = offset_j; j < std::min(offset_j + blocksize, C_size2); ++j)
+ C(i,j) = beta * C(i,j) + alpha * buffer_C[(i - offset_i) * blocksize + (j - offset_j)];
+ }
+ else
+ {
+ for (vcl_size_t i = offset_i; i < std::min(offset_i + blocksize, C_size1); ++i)
+ for (vcl_size_t j = offset_j; j < std::min(offset_j + blocksize, C_size2); ++j)
+ C(i,j) = alpha * buffer_C[(i - offset_i) * blocksize + (j - offset_j)];
+ }
+
+ } // for block j
+ } // for block i
+
+ } // prod()
+
+} // namespace detail
+
+/** @brief Carries out matrix-matrix multiplication
+*
+* Implementation of C = prod(A, B);
+*
+*/
+template<typename NumericT, typename ScalarT1, typename ScalarT2 >
+void prod_impl(const matrix_base<NumericT> & A, bool trans_A,
+ const matrix_base<NumericT> & B, bool trans_B,
+ matrix_base<NumericT> & C,
+ ScalarT1 alpha,
+ ScalarT2 beta)
+{
+ typedef NumericT value_type;
+
+ value_type const * data_A = detail::extract_raw_pointer<value_type>(A);
+ value_type const * data_B = detail::extract_raw_pointer<value_type>(B);
+ value_type * data_C = detail::extract_raw_pointer<value_type>(C);
+
+ vcl_size_t A_start1 = viennacl::traits::start1(A);
+ vcl_size_t A_start2 = viennacl::traits::start2(A);
+ vcl_size_t A_inc1 = viennacl::traits::stride1(A);
+ vcl_size_t A_inc2 = viennacl::traits::stride2(A);
+ vcl_size_t A_size1 = viennacl::traits::size1(A);
+ vcl_size_t A_size2 = viennacl::traits::size2(A);
+ vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(A);
+ vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(A);
+
+ vcl_size_t B_start1 = viennacl::traits::start1(B);
+ vcl_size_t B_start2 = viennacl::traits::start2(B);
+ vcl_size_t B_inc1 = viennacl::traits::stride1(B);
+ vcl_size_t B_inc2 = viennacl::traits::stride2(B);
+ vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(B);
+ vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(B);
+
+ vcl_size_t C_start1 = viennacl::traits::start1(C);
+ vcl_size_t C_start2 = viennacl::traits::start2(C);
+ vcl_size_t C_inc1 = viennacl::traits::stride1(C);
+ vcl_size_t C_inc2 = viennacl::traits::stride2(C);
+ vcl_size_t C_size1 = viennacl::traits::size1(C);
+ vcl_size_t C_size2 = viennacl::traits::size2(C);
+ vcl_size_t C_internal_size1 = viennacl::traits::internal_size1(C);
+ vcl_size_t C_internal_size2 = viennacl::traits::internal_size2(C);
+
+ if (!trans_A && !trans_B)
+ {
+ if (A.row_major() && B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (A.row_major() && B.row_major() && !C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (A.row_major() && !B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (A.row_major() && !B.row_major() && !C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (!A.row_major() && B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (!A.row_major() && B.row_major() && !C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (!A.row_major() && !B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ }
+ else if (!trans_A && trans_B)
+ {
+ if (A.row_major() && B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (A.row_major() && B.row_major() && !C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (A.row_major() && !B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (A.row_major() && !B.row_major() && !C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (!A.row_major() && B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (!A.row_major() && B.row_major() && !C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (!A.row_major() && !B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ }
+ else if (trans_A && !trans_B)
+ {
+ if (A.row_major() && B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (A.row_major() && B.row_major() && !C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (A.row_major() && !B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (A.row_major() && !B.row_major() && !C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (!A.row_major() && B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (!A.row_major() && B.row_major() && !C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (!A.row_major() && !B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ }
+ else if (trans_A && trans_B)
+ {
+ if (A.row_major() && B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (A.row_major() && B.row_major() && !C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (A.row_major() && !B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (A.row_major() && !B.row_major() && !C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (!A.row_major() && B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (!A.row_major() && B.row_major() && !C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else if (!A.row_major() && !B.row_major() && C.row_major())
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ else
+ {
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
+ detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
+ detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);
+
+ detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta));
+ }
+ }
+}
+
+
+
+
+//
+///////////////////////// miscellaneous operations /////////////////////////////////
+//
+
+
+/** @brief The implementation of the operation mat1 += alpha * vec1 * vec2^T, i.e. a scaled rank-1 update
+*
+* Implementation of the convenience expression result += alpha * outer_prod(vec1, vec2);
+*
+* @param mat1 The matrix to be updated
+* @param alpha The scaling factor (either a viennacl::scalar<>, float, or double)
+* @param reciprocal_alpha Use 1/alpha instead of alpha (the reciprocal is taken after any sign flip requested via flip_sign_alpha)
+* @param flip_sign_alpha Use -alpha instead of alpha
+* @param vec1 The first vector
+* @param vec2 The second vector
+*/
+template<typename NumericT, typename ScalarT>
+void scaled_rank_1_update(matrix_base<NumericT> & mat1,
+ ScalarT const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha,
+ const vector_base<NumericT> & vec1,
+ const vector_base<NumericT> & vec2)
+{
+ typedef NumericT value_type;
+
+ value_type * data_A = detail::extract_raw_pointer<value_type>(mat1);
+ value_type const * data_v1 = detail::extract_raw_pointer<value_type>(vec1);
+ value_type const * data_v2 = detail::extract_raw_pointer<value_type>(vec2);
+
+ vcl_size_t A_start1 = viennacl::traits::start1(mat1);
+ vcl_size_t A_start2 = viennacl::traits::start2(mat1);
+ vcl_size_t A_inc1 = viennacl::traits::stride1(mat1);
+ vcl_size_t A_inc2 = viennacl::traits::stride2(mat1);
+ vcl_size_t A_size1 = viennacl::traits::size1(mat1);
+ vcl_size_t A_size2 = viennacl::traits::size2(mat1);
+ vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat1);
+ vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat1);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+
+ vcl_size_t start2 = viennacl::traits::start(vec2);
+ vcl_size_t inc2 = viennacl::traits::stride(vec2);
+
+ value_type data_alpha = alpha;
+ if (flip_sign_alpha)
+ data_alpha = -data_alpha;
+
+ if (mat1.row_major())
+ {
+ if(reciprocal_alpha)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A_size1); ++row)
+ {
+ value_type value_v1 = data_v1[static_cast<vcl_size_t>(row) * inc1 + start1] / data_alpha;
+ for (vcl_size_t col = 0; col < A_size2; ++col)
+ data_A[viennacl::row_major::mem_index(static_cast<vcl_size_t>(row) * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] += value_v1 * data_v2[col * inc2 + start2];
+ }
+ }
+ else
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long row = 0; row < static_cast<long>(A_size1); ++row)
+ {
+ value_type value_v1 = data_v1[static_cast<vcl_size_t>(row) * inc1 + start1] * data_alpha;
+ for (vcl_size_t col = 0; col < A_size2; ++col)
+ data_A[viennacl::row_major::mem_index(static_cast<vcl_size_t>(row) * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] += value_v1 * data_v2[col * inc2 + start2];
+ }
+ }
+ }
+ else
+ {
+ if(reciprocal_alpha)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long col = 0; col < static_cast<long>(A_size2); ++col) //run through matrix sequentially
+ {
+ value_type value_v2 = data_v2[static_cast<vcl_size_t>(col) * inc2 + start2] / data_alpha;
+ for (vcl_size_t row = 0; row < A_size1; ++row)
+ data_A[viennacl::column_major::mem_index(row * A_inc1 + A_start1, static_cast<vcl_size_t>(col) * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] += data_v1[row * inc1 + start1] * value_v2;
+ }
+ }
+ else
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
+#endif
+ for (long col = 0; col < static_cast<long>(A_size2); ++col) //run through matrix sequentially
+ {
+ value_type value_v2 = data_v2[static_cast<vcl_size_t>(col) * inc2 + start2] * data_alpha;
+ for (vcl_size_t row = 0; row < A_size1; ++row)
+ data_A[viennacl::column_major::mem_index(row * A_inc1 + A_start1, static_cast<vcl_size_t>(col) * A_inc2 + A_start2, A_intern
<TRUNCATED>