You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ap...@apache.org on 2016/06/08 21:40:25 UTC
[28/51] [partial] mahout git commit: (nojira) add native-viennaCL
module to codebase. closes apache/mahout#241
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/vector_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/vector_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/vector_operations.hpp
new file mode 100644
index 0000000..b7eaeb4
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/vector_operations.hpp
@@ -0,0 +1,3252 @@
+#ifndef VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/vector_operations.hpp
+    @brief Implementations of vector operations using CUDA
+*/
+
+#include <cmath>
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/stride.hpp"
+
+#include "viennacl/linalg/cuda/common.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
+//
+// Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here!
+//
+template<typename DestNumericT, typename SrcNumericT>
+__global__ void convert_kernel(DestNumericT * dest, unsigned int start_dest, unsigned int inc_dest, unsigned int size_dest,
+                               SrcNumericT const * src, unsigned int start_src, unsigned int inc_src)
+{
+  // Grid-stride copy with implicit type conversion from SrcNumericT to
+  // DestNumericT, honoring start offset and stride of both vectors.
+  unsigned int stride = gridDim.x * blockDim.x;
+  for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size_dest; idx += stride)
+    dest[idx * inc_dest + start_dest] = static_cast<DestNumericT>(src[idx * inc_src + start_src]);
+}
+
+
+template<typename DestNumericT, typename SrcNumericT>
+void convert(vector_base<DestNumericT> & dest, vector_base<SrcNumericT> const & src)
+{
+  // Gather layout information on the host; the kernel only sees raw pointers.
+  unsigned int dest_start  = static_cast<unsigned int>(viennacl::traits::start(dest));
+  unsigned int dest_stride = static_cast<unsigned int>(viennacl::traits::stride(dest));
+  unsigned int dest_size   = static_cast<unsigned int>(viennacl::traits::size(dest));
+  unsigned int src_start   = static_cast<unsigned int>(viennacl::traits::start(src));
+  unsigned int src_stride  = static_cast<unsigned int>(viennacl::traits::stride(src));
+
+  // Fixed 128x128 launch; the kernel's grid-stride loop covers any size.
+  convert_kernel<<<128, 128>>>(viennacl::cuda_arg(dest), dest_start, dest_stride, dest_size,
+                               viennacl::cuda_arg(src),  src_start,  src_stride);
+  VIENNACL_CUDA_LAST_ERROR_CHECK("convert_kernel");
+}
+
+
+//////////////////////// av /////////////////////////////
+
+// gpu scalar
+template<typename NumericT>
+__global__ void av_kernel(NumericT * vec1,
+                          unsigned int start1,
+                          unsigned int inc1,
+                          unsigned int size1,
+
+                          const NumericT * fac2,
+                          unsigned int options2,
+                          const NumericT * vec2,
+                          unsigned int start2,
+                          unsigned int inc2)
+{
+  // GPU-scalar variant: the factor is read from device memory.
+  NumericT alpha = *fac2;
+  if (options2 & (1 << 0))                          // bit 0: flip sign
+    alpha = -alpha;
+  const bool divide = (options2 & (1 << 1)) != 0;   // bit 1: divide instead of multiply
+
+  // Grid-stride loop: vec1 = vec2 (*|/) alpha, elementwise.
+  unsigned int stride = gridDim.x * blockDim.x;
+  for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += stride)
+  {
+    NumericT value = vec2[idx * inc2 + start2];
+    vec1[idx * inc1 + start1] = divide ? value / alpha : value * alpha;
+  }
+}
+
+// cpu scalar
+template<typename NumericT>
+__global__ void av_kernel(NumericT * vec1,
+                          unsigned int start1,
+                          unsigned int inc1,
+                          unsigned int size1,
+
+                          NumericT fac2,
+                          unsigned int options2,
+                          const NumericT * vec2,
+                          unsigned int start2,
+                          unsigned int inc2)
+{
+  // CPU-scalar variant: the factor arrives by value as a kernel argument.
+  NumericT alpha = fac2;
+  if (options2 & (1 << 0))                          // bit 0: flip sign
+    alpha = -alpha;
+  const bool divide = (options2 & (1 << 1)) != 0;   // bit 1: divide instead of multiply
+
+  // Grid-stride loop: vec1 = vec2 (*|/) alpha, elementwise.
+  unsigned int stride = gridDim.x * blockDim.x;
+  for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += stride)
+  {
+    NumericT value = vec2[idx * inc2 + start2];
+    vec1[idx * inc1 + start1] = divide ? value / alpha : value * alpha;
+  }
+}
+
+
+
+/** @brief Entry point for vec1 = alpha * vec2 (or / alpha), dispatching to the CPU- or GPU-scalar av_kernel.
+*
+* The sign-flip and reciprocal flags are encoded into option bits consumed
+* by the kernel (bit 0: negate, bit 1: divide instead of multiply), so the
+* scalar itself is forwarded unmodified.
+*/
+template<typename NumericT, typename ScalarType1>
+void av(vector_base<NumericT> & vec1,
+        vector_base<NumericT> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
+{
+  typedef NumericT value_type;
+
+  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+  // CPU scalars need an addressable host-side copy for detail::arg_reference();
+  // GPU scalars are forwarded as device pointers instead.
+  // (The former 'data_alpha' computation was removed: its result was never
+  // used, and converting a GPU scalar to a host value forces a device->host
+  // read on every call.)
+  value_type temporary_alpha = 0;
+  if (viennacl::is_cpu_scalar<ScalarType1>::value)
+    temporary_alpha = alpha;
+
+  av_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                          static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                          static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                          static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+                          viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                          options_alpha,
+                          viennacl::cuda_arg(vec2),
+                          static_cast<unsigned int>(viennacl::traits::start(vec2)),
+                          static_cast<unsigned int>(viennacl::traits::stride(vec2)) );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
+}
+
+
+///////////////////// avbv //////////////////////////////////
+
+// alpha and beta on GPU
+template<typename NumericT>
+__global__ void avbv_kernel(NumericT * vec1,
+                            unsigned int start1,
+                            unsigned int inc1,
+                            unsigned int size1,
+
+                            const NumericT * fac2,
+                            unsigned int options2,
+                            const NumericT * vec2,
+                            unsigned int start2,
+                            unsigned int inc2,
+
+                            const NumericT * fac3,
+                            unsigned int options3,
+                            const NumericT * vec3,
+                            unsigned int start3,
+                            unsigned int inc3)
+{
+  // Both factors are read from device memory (GPU-scalar variant).
+  // Option bits: bit 0 requests sign flip, bit 1 requests division.
+  NumericT alpha = *fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+  const bool div_alpha = (options2 & (1 << 1)) != 0;
+
+  NumericT beta = *fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+  const bool div_beta = (options3 & (1 << 1)) != 0;
+
+  // Grid-stride loop: vec1 = vec2 (*|/) alpha + vec3 (*|/) beta.
+  unsigned int stride = gridDim.x * blockDim.x;
+  for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += stride)
+  {
+    NumericT lhs = vec2[idx * inc2 + start2];
+    NumericT rhs = vec3[idx * inc3 + start3];
+    vec1[idx * inc1 + start1] = (div_alpha ? lhs / alpha : lhs * alpha)
+                              + (div_beta  ? rhs / beta  : rhs * beta);
+  }
+}
+
+// alpha on CPU, beta on GPU
+template<typename NumericT>
+__global__ void avbv_kernel(NumericT * vec1,
+                            unsigned int start1,
+                            unsigned int inc1,
+                            unsigned int size1,
+
+                            NumericT fac2,
+                            unsigned int options2,
+                            const NumericT * vec2,
+                            unsigned int start2,
+                            unsigned int inc2,
+
+                            const NumericT * fac3,
+                            unsigned int options3,
+                            const NumericT * vec3,
+                            unsigned int start3,
+                            unsigned int inc3)
+{
+  // alpha arrives by value (CPU scalar), beta is read from device memory.
+  // Option bits: bit 0 requests sign flip, bit 1 requests division.
+  NumericT alpha = fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+  const bool div_alpha = (options2 & (1 << 1)) != 0;
+
+  NumericT beta = *fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+  const bool div_beta = (options3 & (1 << 1)) != 0;
+
+  // Grid-stride loop: vec1 = vec2 (*|/) alpha + vec3 (*|/) beta.
+  unsigned int stride = gridDim.x * blockDim.x;
+  for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += stride)
+  {
+    NumericT lhs = vec2[idx * inc2 + start2];
+    NumericT rhs = vec3[idx * inc3 + start3];
+    vec1[idx * inc1 + start1] = (div_alpha ? lhs / alpha : lhs * alpha)
+                              + (div_beta  ? rhs / beta  : rhs * beta);
+  }
+}
+
+// alpha on GPU, beta on CPU
+template<typename NumericT>
+__global__ void avbv_kernel(NumericT * vec1,
+                            unsigned int start1,
+                            unsigned int inc1,
+                            unsigned int size1,
+
+                            const NumericT * fac2,
+                            unsigned int options2,
+                            const NumericT * vec2,
+                            unsigned int start2,
+                            unsigned int inc2,
+
+                            NumericT fac3,
+                            unsigned int options3,
+                            const NumericT * vec3,
+                            unsigned int start3,
+                            unsigned int inc3)
+{
+  // alpha is read from device memory, beta arrives by value (CPU scalar).
+  // Option bits: bit 0 requests sign flip, bit 1 requests division.
+  NumericT alpha = *fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+  const bool div_alpha = (options2 & (1 << 1)) != 0;
+
+  NumericT beta = fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+  const bool div_beta = (options3 & (1 << 1)) != 0;
+
+  // Grid-stride loop: vec1 = vec2 (*|/) alpha + vec3 (*|/) beta.
+  unsigned int stride = gridDim.x * blockDim.x;
+  for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += stride)
+  {
+    NumericT lhs = vec2[idx * inc2 + start2];
+    NumericT rhs = vec3[idx * inc3 + start3];
+    vec1[idx * inc1 + start1] = (div_alpha ? lhs / alpha : lhs * alpha)
+                              + (div_beta  ? rhs / beta  : rhs * beta);
+  }
+}
+
+// alpha and beta on CPU
+template<typename NumericT>
+__global__ void avbv_kernel(NumericT * vec1,
+                            unsigned int start1,
+                            unsigned int inc1,
+                            unsigned int size1,
+
+                            NumericT fac2,
+                            unsigned int options2,
+                            const NumericT * vec2,
+                            unsigned int start2,
+                            unsigned int inc2,
+
+                            NumericT fac3,
+                            unsigned int options3,
+                            const NumericT * vec3,
+                            unsigned int start3,
+                            unsigned int inc3)
+{
+  // Both factors arrive by value (CPU-scalar variant).
+  // Option bits: bit 0 requests sign flip, bit 1 requests division.
+  NumericT alpha = fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+  const bool div_alpha = (options2 & (1 << 1)) != 0;
+
+  NumericT beta = fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+  const bool div_beta = (options3 & (1 << 1)) != 0;
+
+  // Grid-stride loop: vec1 = vec2 (*|/) alpha + vec3 (*|/) beta.
+  unsigned int stride = gridDim.x * blockDim.x;
+  for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += stride)
+  {
+    NumericT lhs = vec2[idx * inc2 + start2];
+    NumericT rhs = vec3[idx * inc3 + start3];
+    vec1[idx * inc1 + start1] = (div_alpha ? lhs / alpha : lhs * alpha)
+                              + (div_beta  ? rhs / beta  : rhs * beta);
+  }
+}
+
+
+
+
+/** @brief Entry point for vec1 = alpha * vec2 + beta * vec3 (with optional sign flip / reciprocal per factor).
+*
+* Sign-flip and reciprocal flags are encoded into option bits consumed by
+* avbv_kernel (bit 0: negate, bit 1: divide instead of multiply).
+*/
+template<typename NumericT, typename ScalarT1, typename ScalarT2>
+void avbv(vector_base<NumericT> & vec1,
+          vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+          vector_base<NumericT> const & vec3, ScalarT2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
+{
+  typedef NumericT value_type;
+
+  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+  unsigned int options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
+
+  // CPU scalars need an addressable host-side copy for detail::arg_reference();
+  // GPU scalars are forwarded as device pointers instead.
+  // (The former 'data_alpha' computation was removed: it was never used —
+  // note there was no matching 'data_beta' — and converting a GPU scalar to
+  // a host value forces a device->host read on every call.)
+  value_type temporary_alpha = 0;
+  if (viennacl::is_cpu_scalar<ScalarT1>::value)
+    temporary_alpha = alpha;
+
+  value_type temporary_beta = 0;
+  if (viennacl::is_cpu_scalar<ScalarT2>::value)
+    temporary_beta = beta;
+
+  avbv_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                            static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                            static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                            static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+                            viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                            options_alpha,
+                            viennacl::cuda_arg(vec2),
+                            static_cast<unsigned int>(viennacl::traits::start(vec2)),
+                            static_cast<unsigned int>(viennacl::traits::stride(vec2)),
+
+                            viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+                            options_beta,
+                            viennacl::cuda_arg(vec3),
+                            static_cast<unsigned int>(viennacl::traits::start(vec3)),
+                            static_cast<unsigned int>(viennacl::traits::stride(vec3)) );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("avbv_kernel");
+}
+
+
+////////////////////////// avbv_v //////////////////////////////////////
+
+
+// alpha and beta on GPU
+template<typename NumericT>
+__global__ void avbv_v_kernel(NumericT * vec1,
+                              unsigned int start1,
+                              unsigned int inc1,
+                              unsigned int size1,
+
+                              const NumericT * fac2,
+                              unsigned int options2,
+                              const NumericT * vec2,
+                              unsigned int start2,
+                              unsigned int inc2,
+
+                              const NumericT * fac3,
+                              unsigned int options3,
+                              const NumericT * vec3,
+                              unsigned int start3,
+                              unsigned int inc3)
+{
+  // Accumulating variant (vec1 += ...); both factors read from device memory.
+  // Option bits: bit 0 requests sign flip, bit 1 requests division.
+  NumericT alpha = *fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+  const bool div_alpha = (options2 & (1 << 1)) != 0;
+
+  NumericT beta = *fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+  const bool div_beta = (options3 & (1 << 1)) != 0;
+
+  // Grid-stride loop: vec1 += vec2 (*|/) alpha + vec3 (*|/) beta.
+  unsigned int stride = gridDim.x * blockDim.x;
+  for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += stride)
+  {
+    NumericT lhs = vec2[idx * inc2 + start2];
+    NumericT rhs = vec3[idx * inc3 + start3];
+    vec1[idx * inc1 + start1] += (div_alpha ? lhs / alpha : lhs * alpha)
+                               + (div_beta  ? rhs / beta  : rhs * beta);
+  }
+}
+
+// alpha on CPU, beta on GPU
+template<typename NumericT>
+__global__ void avbv_v_kernel(NumericT * vec1,
+                              unsigned int start1,
+                              unsigned int inc1,
+                              unsigned int size1,
+
+                              NumericT fac2,
+                              unsigned int options2,
+                              const NumericT * vec2,
+                              unsigned int start2,
+                              unsigned int inc2,
+
+                              const NumericT * fac3,
+                              unsigned int options3,
+                              const NumericT * vec3,
+                              unsigned int start3,
+                              unsigned int inc3)
+{
+  // Accumulating variant; alpha by value (CPU scalar), beta from device memory.
+  // Option bits: bit 0 requests sign flip, bit 1 requests division.
+  NumericT alpha = fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+  const bool div_alpha = (options2 & (1 << 1)) != 0;
+
+  NumericT beta = *fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+  const bool div_beta = (options3 & (1 << 1)) != 0;
+
+  // Grid-stride loop: vec1 += vec2 (*|/) alpha + vec3 (*|/) beta.
+  unsigned int stride = gridDim.x * blockDim.x;
+  for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += stride)
+  {
+    NumericT lhs = vec2[idx * inc2 + start2];
+    NumericT rhs = vec3[idx * inc3 + start3];
+    vec1[idx * inc1 + start1] += (div_alpha ? lhs / alpha : lhs * alpha)
+                               + (div_beta  ? rhs / beta  : rhs * beta);
+  }
+}
+
+// alpha on GPU, beta on CPU
+template<typename NumericT>
+__global__ void avbv_v_kernel(NumericT * vec1,
+                              unsigned int start1,
+                              unsigned int inc1,
+                              unsigned int size1,
+
+                              const NumericT * fac2,
+                              unsigned int options2,
+                              const NumericT * vec2,
+                              unsigned int start2,
+                              unsigned int inc2,
+
+                              NumericT fac3,
+                              unsigned int options3,
+                              const NumericT * vec3,
+                              unsigned int start3,
+                              unsigned int inc3)
+{
+  // Accumulating variant; alpha from device memory, beta by value (CPU scalar).
+  // Option bits: bit 0 requests sign flip, bit 1 requests division.
+  NumericT alpha = *fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+  const bool div_alpha = (options2 & (1 << 1)) != 0;
+
+  NumericT beta = fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+  const bool div_beta = (options3 & (1 << 1)) != 0;
+
+  // Grid-stride loop: vec1 += vec2 (*|/) alpha + vec3 (*|/) beta.
+  unsigned int stride = gridDim.x * blockDim.x;
+  for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += stride)
+  {
+    NumericT lhs = vec2[idx * inc2 + start2];
+    NumericT rhs = vec3[idx * inc3 + start3];
+    vec1[idx * inc1 + start1] += (div_alpha ? lhs / alpha : lhs * alpha)
+                               + (div_beta  ? rhs / beta  : rhs * beta);
+  }
+}
+
+// alpha and beta on CPU
+template<typename NumericT>
+__global__ void avbv_v_kernel(NumericT * vec1,
+                              unsigned int start1,
+                              unsigned int inc1,
+                              unsigned int size1,
+
+                              NumericT fac2,
+                              unsigned int options2,
+                              const NumericT * vec2,
+                              unsigned int start2,
+                              unsigned int inc2,
+
+                              NumericT fac3,
+                              unsigned int options3,
+                              const NumericT * vec3,
+                              unsigned int start3,
+                              unsigned int inc3)
+{
+  // Accumulating variant; both factors arrive by value (CPU scalars).
+  // Option bits: bit 0 requests sign flip, bit 1 requests division.
+  NumericT alpha = fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+  const bool div_alpha = (options2 & (1 << 1)) != 0;
+
+  NumericT beta = fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+  const bool div_beta = (options3 & (1 << 1)) != 0;
+
+  // Grid-stride loop: vec1 += vec2 (*|/) alpha + vec3 (*|/) beta.
+  unsigned int stride = gridDim.x * blockDim.x;
+  for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += stride)
+  {
+    NumericT lhs = vec2[idx * inc2 + start2];
+    NumericT rhs = vec3[idx * inc3 + start3];
+    vec1[idx * inc1 + start1] += (div_alpha ? lhs / alpha : lhs * alpha)
+                               + (div_beta  ? rhs / beta  : rhs * beta);
+  }
+}
+
+
+/** @brief Entry point for vec1 += alpha * vec2 + beta * vec3 (with optional sign flip / reciprocal per factor).
+*
+* Sign-flip and reciprocal flags are encoded into option bits consumed by
+* avbv_v_kernel (bit 0: negate, bit 1: divide instead of multiply).
+*/
+template<typename NumericT, typename ScalarT1, typename ScalarT2>
+void avbv_v(vector_base<NumericT> & vec1,
+            vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+            vector_base<NumericT> const & vec3, ScalarT2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
+{
+  typedef NumericT value_type;
+
+  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+  unsigned int options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
+
+  // CPU scalars need an addressable host-side copy for detail::arg_reference();
+  // GPU scalars are forwarded as device pointers instead.
+  // (The former 'data_alpha' computation was removed: it was never used, and
+  // converting a GPU scalar to a host value forces a device->host read.)
+  value_type temporary_alpha = 0;
+  if (viennacl::is_cpu_scalar<ScalarT1>::value)
+    temporary_alpha = alpha;
+
+  value_type temporary_beta = 0;
+  if (viennacl::is_cpu_scalar<ScalarT2>::value)
+    temporary_beta = beta;
+
+  avbv_v_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                              static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                              static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                              static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+                              viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                              options_alpha,
+                              viennacl::cuda_arg(vec2),
+                              static_cast<unsigned int>(viennacl::traits::start(vec2)),
+                              static_cast<unsigned int>(viennacl::traits::stride(vec2)),
+
+                              viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+                              options_beta,
+                              viennacl::cuda_arg(vec3),
+                              static_cast<unsigned int>(viennacl::traits::start(vec3)),
+                              static_cast<unsigned int>(viennacl::traits::stride(vec3)) );
+  // This launch-error check was missing here, unlike every sibling wrapper
+  // (av, avbv, vector_assign, ...), so launch failures went unnoticed.
+  VIENNACL_CUDA_LAST_ERROR_CHECK("avbv_v_kernel");
+}
+
+
+//////////////////////////
+
+template<typename NumericT>
+__global__ void vector_assign_kernel(NumericT * vec1,
+                                     unsigned int start1,
+                                     unsigned int inc1,
+                                     unsigned int size1,
+                                     unsigned int internal_size1,
+
+                                     NumericT alpha)
+{
+  // Iterate over the full (padded) buffer: entries inside the logical size
+  // receive alpha, entries in the padding region are cleared to zero.
+  // The previous loop bound (i < size1) made the '? alpha : 0' ternary
+  // unreachable and left internal_size1 entirely unused; the host wrapper
+  // already passes internal_size() for this parameter.
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                    i < internal_size1;
+                    i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = (i < size1) ? alpha : 0;
+}
+
+/** @brief Assign a constant value to a vector (-range/-slice)
+*
+* @param vec1 The vector to which the value should be assigned
+* @param alpha The value to be assigned
+* @param up_to_internal_size Specifies whether alpha should also be written to padded memory (mostly used for clearing the whole buffer).
+*/
+template<typename NumericT, typename ScalarT1>
+void vector_assign(vector_base<NumericT> & vec1, ScalarT1 const & alpha, bool up_to_internal_size = false)
+{
+  typedef NumericT value_type;
+
+  // CPU scalars need an addressable host-side copy for detail::arg_reference().
+  value_type temporary_alpha = 0;
+  if (viennacl::is_cpu_scalar<ScalarT1>::value)
+    temporary_alpha = alpha;
+
+  // When clearing the whole buffer, cover the padding region as well.
+  unsigned int assign_size = up_to_internal_size
+                               ? static_cast<unsigned int>(vec1.internal_size())
+                               : static_cast<unsigned int>(viennacl::traits::size(vec1));
+
+  vector_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                     static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                     static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                     assign_size,
+                                     static_cast<unsigned int>(vec1.internal_size()), //Note: Do NOT use traits::internal_size() here, because vector proxies don't require padding.
+
+                                     viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)) );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_assign_kernel");
+}
+
+//////////////////////////
+
+template<typename NumericT>
+__global__ void vector_swap_kernel(NumericT * vec1,
+                                   unsigned int start1,
+                                   unsigned int inc1,
+                                   unsigned int size1,
+
+                                   NumericT * vec2,
+                                   unsigned int start2,
+                                   unsigned int inc2)
+{
+  // Grid-stride loop exchanging the elements of both vectors in place.
+  unsigned int stride = gridDim.x * blockDim.x;
+  for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += stride)
+  {
+    NumericT held = vec2[idx * inc2 + start2];
+    vec2[idx * inc2 + start2] = vec1[idx * inc1 + start1];
+    vec1[idx * inc1 + start1] = held;
+  }
+}
+
+
+/** @brief Swaps the contents of two vectors, data is copied
+*
+* @param vec1 The first vector (or -range, or -slice)
+* @param vec2 The second vector (or -range, or -slice)
+*/
+template<typename NumericT>
+void vector_swap(vector_base<NumericT> & vec1, vector_base<NumericT> & vec2)
+{
+  // Collect layout data on the host; the kernel swaps through raw pointers.
+  unsigned int start_a  = static_cast<unsigned int>(viennacl::traits::start(vec1));
+  unsigned int stride_a = static_cast<unsigned int>(viennacl::traits::stride(vec1));
+  unsigned int size_a   = static_cast<unsigned int>(viennacl::traits::size(vec1));
+  unsigned int start_b  = static_cast<unsigned int>(viennacl::traits::start(vec2));
+  unsigned int stride_b = static_cast<unsigned int>(viennacl::traits::stride(vec2));
+
+  vector_swap_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), start_a, stride_a, size_a,
+                                   viennacl::cuda_arg(vec2), start_b, stride_b);
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_swap_kernel");
+}
+
+///////////////////////// Binary Elementwise operations /////////////
+
+template<typename NumericT>
+__global__ void element_op_kernel(NumericT * vec1,
+                                  unsigned int start1,
+                                  unsigned int inc1,
+                                  unsigned int size1,
+
+                                  NumericT const * vec2,
+                                  unsigned int start2,
+                                  unsigned int inc2,
+
+                                  NumericT const * vec3,
+                                  unsigned int start3,
+                                  unsigned int inc3,
+
+                                  unsigned int op_type
+                                 )
+{
+  // Elementwise binary operation vec1 = vec2 OP vec3.
+  // op_type selects OP: 0 -> product, 1 -> division, 2 -> power.
+  // Any other code leaves vec1 untouched.
+  unsigned int stride = gridDim.x * blockDim.x;
+  for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += stride)
+  {
+    NumericT lhs = vec2[idx * inc2 + start2];
+    NumericT rhs = vec3[idx * inc3 + start3];
+    if (op_type == 2)
+      vec1[idx * inc1 + start1] = pow(lhs, rhs);
+    else if (op_type == 1)
+      vec1[idx * inc1 + start1] = lhs / rhs;
+    else if (op_type == 0)
+      vec1[idx * inc1 + start1] = lhs * rhs;
+  }
+}
+
+template<typename NumericT>
+__global__ void element_op_int_kernel(NumericT * vec1,
+                                      unsigned int start1,
+                                      unsigned int inc1,
+                                      unsigned int size1,
+
+                                      NumericT const * vec2,
+                                      unsigned int start2,
+                                      unsigned int inc2,
+
+                                      NumericT const * vec3,
+                                      unsigned int start3,
+                                      unsigned int inc3,
+
+                                      unsigned int op_type
+                                     )
+{
+  // Elementwise binary operation vec1 = vec2 OP vec3 for types without pow().
+  // op_type selects OP: 0 -> product, 1 -> division; any other code
+  // (including 2 / power) leaves vec1 untouched.
+  unsigned int stride = gridDim.x * blockDim.x;
+  for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += stride)
+  {
+    NumericT lhs = vec2[idx * inc2 + start2];
+    NumericT rhs = vec3[idx * inc3 + start3];
+    if (op_type == 1)
+      vec1[idx * inc1 + start1] = lhs / rhs;
+    else if (op_type == 0)
+      vec1[idx * inc1 + start1] = lhs * rhs;
+  }
+}
+
+/** @brief Implementation of the element-wise operation v1 = v2 .* v3 and v1 = v2 ./ v3 (using MATLAB syntax)
+*
+* @param vec1 The result vector (or -range, or -slice)
+* @param proxy The proxy object holding v2, v3 and the operation
+*/
+// Generic fallback (used for NumericT other than float/double, which have
+// dedicated overloads below that dispatch to element_op_kernel).
+template<typename NumericT, typename OpT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_binary<OpT> > const & proxy)
+{
+  // Encode the operation tag as the integer code understood by the kernel.
+  unsigned int op_type = 2; //0: product, 1: division, 2: power
+  if (viennacl::is_division<OpT>::value)
+    op_type = 1;
+  else if (viennacl::is_product<OpT>::value)
+    op_type = 0;
+
+  // NOTE(review): element_op_int_kernel only implements op_type 0 and 1, so a
+  // power request (op_type == 2) writes nothing here — confirm this silent
+  // no-op for non-float element_pow is intended.
+  element_op_int_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                  static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                  static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+                                  viennacl::cuda_arg(proxy.lhs()),
+                                  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
+
+                                  viennacl::cuda_arg(proxy.rhs()),
+                                  static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
+                                  static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
+
+                                  op_type
+                                 );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
+}
+
+template<typename OpT>
+void element_op(vector_base<float> & vec1,
+                vector_expression<const vector_base<float>, const vector_base<float>, op_element_binary<OpT> > const & proxy)
+{
+  // float overload: maps the operation tag to the kernel's numeric code
+  // (0 -> product, 1 -> division, 2 -> power) and uses element_op_kernel,
+  // which supports pow().
+  unsigned int op_type;
+  if (viennacl::is_product<OpT>::value)
+    op_type = 0;
+  else if (viennacl::is_division<OpT>::value)
+    op_type = 1;
+  else
+    op_type = 2;
+
+  element_op_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                  static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                  static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                  viennacl::cuda_arg(proxy.lhs()),
+                                  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
+                                  viennacl::cuda_arg(proxy.rhs()),
+                                  static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
+                                  static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
+                                  op_type);
+  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
+}
+
+template<typename OpT>
+void element_op(vector_base<double> & vec1,
+                vector_expression<const vector_base<double>, const vector_base<double>, op_element_binary<OpT> > const & proxy)
+{
+  // double overload: maps the operation tag to the kernel's numeric code
+  // (0 -> product, 1 -> division, 2 -> power) and uses element_op_kernel,
+  // which supports pow().
+  unsigned int op_type;
+  if (viennacl::is_product<OpT>::value)
+    op_type = 0;
+  else if (viennacl::is_division<OpT>::value)
+    op_type = 1;
+  else
+    op_type = 2;
+
+  element_op_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                  static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                  static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                  viennacl::cuda_arg(proxy.lhs()),
+                                  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
+                                  viennacl::cuda_arg(proxy.rhs()),
+                                  static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
+                                  static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
+                                  op_type);
+  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
+}
+
+///////////////////////// Unary Elementwise operations /////////////
+
+// Note: Trying to automate things with macros or template metaprogramming failed (preprocessor with nvcc did not work as expected), so this is terribly hand-rolled code
+// Question (Karl Rupp): Why is CUDA code always such a hassle when trying to use it in a library context?
+
+// acos
+template<typename NumericT>
+__global__ void vec_element_acos_kernel(
+    NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  // Elementwise arc cosine: vec1 = acos(vec2), honoring start/stride of both.
+  unsigned int stride = gridDim.x * blockDim.x;
+  for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += stride)
+    vec1[idx * inc1 + start1] = acos(vec2[idx * inc2 + start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_acos> > const & proxy)
+{
+  // Unary dispatch for element_acos; layout data gathered on the host.
+  unsigned int result_start  = static_cast<unsigned int>(viennacl::traits::start(vec1));
+  unsigned int result_stride = static_cast<unsigned int>(viennacl::traits::stride(vec1));
+  unsigned int result_size   = static_cast<unsigned int>(viennacl::traits::size(vec1));
+  unsigned int arg_start     = static_cast<unsigned int>(viennacl::traits::start(proxy.lhs()));
+  unsigned int arg_stride    = static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()));
+
+  vec_element_acos_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), result_start, result_stride, result_size,
+                                        viennacl::cuda_arg(proxy.lhs()), arg_start, arg_stride);
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_acos_kernel");
+}
+
+// asin
+template<typename NumericT>
+__global__ void vec_element_asin_kernel(
+    NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  // Elementwise arc sine: vec1 = asin(vec2), honoring start/stride of both.
+  unsigned int stride = gridDim.x * blockDim.x;
+  for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += stride)
+    vec1[idx * inc1 + start1] = asin(vec2[idx * inc2 + start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_asin> > const & proxy)
+{
+  // Unary dispatch for element_asin; layout data gathered on the host.
+  unsigned int result_start  = static_cast<unsigned int>(viennacl::traits::start(vec1));
+  unsigned int result_stride = static_cast<unsigned int>(viennacl::traits::stride(vec1));
+  unsigned int result_size   = static_cast<unsigned int>(viennacl::traits::size(vec1));
+  unsigned int arg_start     = static_cast<unsigned int>(viennacl::traits::start(proxy.lhs()));
+  unsigned int arg_stride    = static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()));
+
+  vec_element_asin_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), result_start, result_stride, result_size,
+                                        viennacl::cuda_arg(proxy.lhs()), arg_start, arg_stride);
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_asin_kernel");
+}
+
+
+// atan
+template<typename NumericT>
+__global__ void vec_element_atan_kernel(
+    NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  // Elementwise arc tangent: vec1 = atan(vec2), honoring start/stride of both.
+  unsigned int stride = gridDim.x * blockDim.x;
+  for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += stride)
+    vec1[idx * inc1 + start1] = atan(vec2[idx * inc2 + start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_atan> > const & proxy)
+{
+ vec_element_atan_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_atan_kernel");
+}
+
+
+// ceil
+// Elementwise ceiling: vec1[i] = ceil(vec2[i]) over strided views; grid-stride loop.
+template<typename NumericT>
+__global__ void vec_element_ceil_kernel(
+ NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+ NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = ceil(vec2[i*inc2+start2]);
+}
+
+/** @brief Host wrapper: launches vec_element_ceil_kernel for vec1 = ceil(proxy.lhs()), elementwise. */
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_ceil> > const & proxy)
+{
+ // Fixed 128x128 launch; the kernel's grid-stride loop covers any vector size.
+ vec_element_ceil_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_ceil_kernel");
+}
+
+
+// cos
+// Elementwise cosine: vec1[i] = cos(vec2[i]) over strided views; grid-stride loop.
+template<typename NumericT>
+__global__ void vec_element_cos_kernel(
+ NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+ NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = cos(vec2[i*inc2+start2]);
+}
+
+/** @brief Host wrapper: launches vec_element_cos_kernel for vec1 = cos(proxy.lhs()), elementwise. */
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_cos> > const & proxy)
+{
+ // Fixed 128x128 launch; the kernel's grid-stride loop covers any vector size.
+ vec_element_cos_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_cos_kernel");
+}
+
+
+// cosh
+// Elementwise hyperbolic cosine: vec1[i] = cosh(vec2[i]) over strided views; grid-stride loop.
+template<typename NumericT>
+__global__ void vec_element_cosh_kernel(
+ NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+ NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = cosh(vec2[i*inc2+start2]);
+}
+
+/** @brief Host wrapper: launches vec_element_cosh_kernel for vec1 = cosh(proxy.lhs()), elementwise. */
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_cosh> > const & proxy)
+{
+ // Fixed 128x128 launch; the kernel's grid-stride loop covers any vector size.
+ vec_element_cosh_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_cosh_kernel");
+}
+
+
+// exp
+// Elementwise exponential: vec1[i] = exp(vec2[i]) over strided views; grid-stride loop.
+template<typename NumericT>
+__global__ void vec_element_exp_kernel(
+ NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+ NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = exp(vec2[i*inc2+start2]);
+}
+
+/** @brief Host wrapper: launches vec_element_exp_kernel for vec1 = exp(proxy.lhs()), elementwise. */
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_exp> > const & proxy)
+{
+ // Fixed 128x128 launch; the kernel's grid-stride loop covers any vector size.
+ vec_element_exp_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_exp_kernel");
+}
+
+
+// fabs
+// Elementwise floating-point absolute value: vec1[i] = fabs(vec2[i]); grid-stride loop.
+// (The integer overload below uses abs instead.)
+template<typename NumericT>
+__global__ void vec_element_fabs_kernel(
+ NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+ NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = fabs(vec2[i*inc2+start2]);
+}
+
+/** @brief Host wrapper: launches vec_element_fabs_kernel for vec1 = fabs(proxy.lhs()), elementwise. */
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_fabs> > const & proxy)
+{
+ // Fixed 128x128 launch; the kernel's grid-stride loop covers any vector size.
+ vec_element_fabs_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_fabs_kernel");
+}
+
+// abs
+// Elementwise absolute value via abs(): vec1[i] = abs(vec2[i]); grid-stride loop.
+// Presumably intended for integral NumericT (floating-point types use the fabs
+// variant above) — TODO confirm against the dispatch in op_element_unary.
+template<typename NumericT>
+__global__ void vec_element_abs_kernel(
+ NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+ NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = abs(vec2[i*inc2+start2]);
+}
+
+/** @brief Host wrapper: launches vec_element_abs_kernel for vec1 = abs(proxy.lhs()), elementwise. */
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_abs> > const & proxy)
+{
+ // Fixed 128x128 launch; the kernel's grid-stride loop covers any vector size.
+ vec_element_abs_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_abs_kernel");
+}
+
+
+
+// floor
+// Elementwise floor: vec1[i] = floor(vec2[i]) over strided views; grid-stride loop.
+template<typename NumericT>
+__global__ void vec_element_floor_kernel(
+ NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+ NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = floor(vec2[i*inc2+start2]);
+}
+
+/** @brief Host wrapper: launches vec_element_floor_kernel for vec1 = floor(proxy.lhs()), elementwise. */
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_floor> > const & proxy)
+{
+ // Fixed 128x128 launch; the kernel's grid-stride loop covers any vector size.
+ vec_element_floor_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_floor_kernel");
+}
+
+
+// log
+// Elementwise natural logarithm: vec1[i] = log(vec2[i]) over strided views; grid-stride loop.
+template<typename NumericT>
+__global__ void vec_element_log_kernel(
+ NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+ NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = log(vec2[i*inc2+start2]);
+}
+
+/** @brief Host wrapper: launches vec_element_log_kernel for vec1 = log(proxy.lhs()), elementwise. */
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_log> > const & proxy)
+{
+ // Fixed 128x128 launch; the kernel's grid-stride loop covers any vector size.
+ vec_element_log_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_log_kernel");
+}
+
+
+// log10
+// Elementwise base-10 logarithm: vec1[i] = log10(vec2[i]) over strided views; grid-stride loop.
+template<typename NumericT>
+__global__ void vec_element_log10_kernel(
+ NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+ NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = log10(vec2[i*inc2+start2]);
+}
+
+/** @brief Host wrapper: launches vec_element_log10_kernel for vec1 = log10(proxy.lhs()), elementwise. */
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_log10> > const & proxy)
+{
+ // Fixed 128x128 launch; the kernel's grid-stride loop covers any vector size.
+ vec_element_log10_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_log10_kernel");
+}
+
+
+// sin
+// Elementwise sine: vec1[i] = sin(vec2[i]) over strided views; grid-stride loop.
+template<typename NumericT>
+__global__ void vec_element_sin_kernel(
+ NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+ NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = sin(vec2[i*inc2+start2]);
+}
+
+/** @brief Host wrapper: launches vec_element_sin_kernel for vec1 = sin(proxy.lhs()), elementwise. */
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_sin> > const & proxy)
+{
+ // Fixed 128x128 launch; the kernel's grid-stride loop covers any vector size.
+ vec_element_sin_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sin_kernel");
+}
+
+
+// sinh
+// Elementwise hyperbolic sine: vec1[i] = sinh(vec2[i]) over strided views; grid-stride loop.
+template<typename NumericT>
+__global__ void vec_element_sinh_kernel(
+ NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+ NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = sinh(vec2[i*inc2+start2]);
+}
+
+/** @brief Host wrapper: launches vec_element_sinh_kernel for vec1 = sinh(proxy.lhs()), elementwise. */
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_sinh> > const & proxy)
+{
+ // Fixed 128x128 launch; the kernel's grid-stride loop covers any vector size.
+ vec_element_sinh_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sinh_kernel");
+}
+
+
+// sqrt
+// Elementwise square root: vec1[i] = sqrt(vec2[i]) over strided views; grid-stride loop.
+template<typename NumericT>
+__global__ void vec_element_sqrt_kernel(
+ NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+ NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = sqrt(vec2[i*inc2+start2]);
+}
+
+/** @brief Host wrapper: launches vec_element_sqrt_kernel for vec1 = sqrt(proxy.lhs()), elementwise. */
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_sqrt> > const & proxy)
+{
+ // Fixed 128x128 launch; the kernel's grid-stride loop covers any vector size.
+ vec_element_sqrt_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sqrt_kernel");
+}
+
+
+// tan
+// Elementwise tangent: vec1[i] = tan(vec2[i]) over strided views; grid-stride loop.
+template<typename NumericT>
+__global__ void vec_element_tan_kernel(
+ NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+ NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = tan(vec2[i*inc2+start2]);
+}
+
+/** @brief Host wrapper: launches vec_element_tan_kernel for vec1 = tan(proxy.lhs()), elementwise. */
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_tan> > const & proxy)
+{
+ // Fixed 128x128 launch; the kernel's grid-stride loop covers any vector size.
+ vec_element_tan_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_tan_kernel");
+}
+
+
+// tanh
+// Elementwise hyperbolic tangent: vec1[i] = tanh(vec2[i]) over strided views; grid-stride loop.
+template<typename NumericT>
+__global__ void vec_element_tanh_kernel(
+ NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+ NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+ for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+ vec1[i*inc1+start1] = tanh(vec2[i*inc2+start2]);
+}
+
+/** @brief Host wrapper: launches vec_element_tanh_kernel for vec1 = tanh(proxy.lhs()), elementwise. */
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_tanh> > const & proxy)
+{
+ // Fixed 128x128 launch; the kernel's grid-stride loop covers any vector size.
+ vec_element_tanh_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(proxy.lhs()),
+ static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+ static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_tanh_kernel");
+}
+
+
+
+///////////////////////// Norms and inner product ///////////////////
+
+
+// Stage 1 of the two-stage inner product: each block reduces its contiguous
+// slice [blockIdx*size/gridDim, (blockIdx+1)*size/gridDim) of the element
+// range and writes one partial sum to group_buffer[blockIdx.x].
+// Assumes blockDim.x <= 128 and blockDim.x a power of two (shared buffer
+// size and halving reduction). A follow-up kernel sums group_buffer.
+template<typename NumericT>
+__global__ void inner_prod_kernel(const NumericT * vec1,
+ unsigned int start1,
+ unsigned int inc1,
+ unsigned int size1,
+ const NumericT * vec2,
+ unsigned int start2,
+ unsigned int inc2,
+ unsigned int size2,
+ NumericT * group_buffer)
+{
+ __shared__ NumericT tmp_buffer[128];
+ // Offset (in elements, including stride) of this block's slice in each vector.
+ unsigned int group_start1 = (blockIdx.x * size1) / (gridDim.x) * inc1 + start1;
+ unsigned int group_start2 = (blockIdx.x * size2) / (gridDim.x) * inc2 + start2;
+
+ unsigned int group_size1 = ((blockIdx.x + 1) * size1) / (gridDim.x)
+ - ( blockIdx.x * size1) / (gridDim.x);
+
+
+ // Per-thread partial dot product over the block's slice.
+ NumericT tmp = 0;
+ for (unsigned int i = threadIdx.x; i < group_size1; i += blockDim.x)
+ tmp += vec1[i*inc1+group_start1] * vec2[i*inc2+group_start2];
+ tmp_buffer[threadIdx.x] = tmp;
+
+ // parallel reduction
+ for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
+ }
+
+ if (threadIdx.x == 0)
+ group_buffer[blockIdx.x] = tmp_buffer[0];
+
+}
+
+
+
+// sums the array 'vec1' and writes to result. Makes use of a single work-group only.
+template<typename NumericT>
+__global__ void vector_sum_kernel_floats(
+ const NumericT * vec1,
+ unsigned int start1,
+ unsigned int inc1,
+ unsigned int size1,
+ unsigned int option, //0: use fmax, 1: just sum, 2: sum and return sqrt of sum
+ NumericT * result)
+{
+ __shared__ NumericT tmp_buffer[128];
+ NumericT thread_sum = 0;
+ for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
+ {
+ if (option > 0)
+ thread_sum += vec1[i*inc1+start1];
+ else
+ thread_sum = fmax(thread_sum, fabs(vec1[i*inc1+start1]));
+ }
+
+ tmp_buffer[threadIdx.x] = thread_sum;
+
+ for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ {
+ if (option > 0)
+ tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
+ else
+ tmp_buffer[threadIdx.x] = fmax(tmp_buffer[threadIdx.x], tmp_buffer[threadIdx.x + stride]);
+ }
+ }
+
+ if (threadIdx.x == 0)
+ {
+ if (option == 2)
+ *result = sqrt(tmp_buffer[0]);
+ else
+ *result = tmp_buffer[0];
+ }
+}
+
+// Final (single-block) reduction for signed integer types.
+// option 0: max of |vec1[i]| (via abs), option > 0: plain sum.
+// Assumes blockDim.x <= 128 and a power of two; launched as <<<1, 128>>>.
+template<typename NumericT>
+__global__ void vector_sum_kernel_integers(
+ const NumericT * vec1,
+ unsigned int start1,
+ unsigned int inc1,
+ unsigned int size1,
+ unsigned int option, //0: use max, 1: just sum
+ NumericT * result)
+{
+ __shared__ NumericT tmp_buffer[128];
+ NumericT thread_sum = 0;
+ for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
+ {
+ if (option > 0)
+ thread_sum += vec1[i*inc1+start1];
+ else
+ thread_sum = thread_sum > abs(vec1[i*inc1+start1]) ? thread_sum : abs(vec1[i*inc1+start1]);
+ }
+
+ tmp_buffer[threadIdx.x] = thread_sum;
+
+ // Shared-memory tree reduction combining per-thread results.
+ for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ {
+ if (option > 0)
+ tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
+ else
+ tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride];
+ }
+ }
+
+ if (threadIdx.x == 0)
+ *result = tmp_buffer[0];
+}
+
+// Final (single-block) reduction for unsigned integer types.
+// option 0: plain max (no abs needed for unsigned), option > 0: plain sum.
+// Assumes blockDim.x <= 128 and a power of two; launched as <<<1, 128>>>.
+template<typename NumericT>
+__global__ void vector_sum_kernel_unsigned_integers(
+ const NumericT * vec1,
+ unsigned int start1,
+ unsigned int inc1,
+ unsigned int size1,
+ unsigned int option, //0: use max, 1: just sum
+ NumericT * result)
+{
+ __shared__ NumericT tmp_buffer[128];
+ NumericT thread_sum = 0;
+ for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
+ {
+ if (option > 0)
+ thread_sum += vec1[i*inc1+start1];
+ else
+ thread_sum = (thread_sum > vec1[i*inc1+start1]) ? thread_sum : vec1[i*inc1+start1];
+ }
+
+ tmp_buffer[threadIdx.x] = thread_sum;
+
+ // Shared-memory tree reduction combining per-thread results.
+ for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+ {
+ __syncthreads();
+ if (threadIdx.x < stride)
+ {
+ if (option > 0)
+ tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
+ else
+ tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride];
+ }
+ }
+
+ if (threadIdx.x == 0)
+ *result = tmp_buffer[0];
+}
+
+namespace detail
+{
+ /** \cond */
+ // Launcher helpers: pick the appropriate single-block final-reduction kernel
+ // for the numeric type. 'option' is forwarded unchanged (0: max-style, 1: sum,
+ // 2: sum+sqrt for the float variant); 'result' receives the scalar on the device.
+ struct vector_sum_kernel_launcher_integers
+ {
+ template<typename NumericT, typename ScalarT>
+ static void apply(vector_base<NumericT> const & temp,
+ unsigned int option,
+ ScalarT & result)
+ {
+ typedef NumericT value_type; // unused; retained for symmetry
+ vector_sum_kernel_integers<<<1, 128>>>(viennacl::cuda_arg(temp),
+ static_cast<unsigned int>(viennacl::traits::start(temp)),
+ static_cast<unsigned int>(viennacl::traits::stride(temp)),
+ static_cast<unsigned int>(viennacl::traits::size(temp)),
+ static_cast<unsigned int>(option),
+ viennacl::cuda_arg(result) );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
+ }
+ };
+
+ struct vector_sum_kernel_launcher_unsigned_integers
+ {
+ template<typename NumericT, typename ScalarT>
+ static void apply(vector_base<NumericT> const & temp,
+ unsigned int option,
+ ScalarT & result)
+ {
+ typedef NumericT value_type; // unused; retained for symmetry
+ vector_sum_kernel_unsigned_integers<<<1, 128>>>(viennacl::cuda_arg(temp),
+ static_cast<unsigned int>(viennacl::traits::start(temp)),
+ static_cast<unsigned int>(viennacl::traits::stride(temp)),
+ static_cast<unsigned int>(viennacl::traits::size(temp)),
+ static_cast<unsigned int>(option),
+ viennacl::cuda_arg(result) );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
+ }
+ };
+
+ struct vector_sum_kernel_launcher_floats
+ {
+ template<typename NumericT, typename ScalarT>
+ static void apply(vector_base<NumericT> const & temp,
+ unsigned int option,
+ ScalarT & result)
+ {
+ typedef NumericT value_type; // unused; retained for symmetry
+ vector_sum_kernel_floats<<<1, 128>>>(viennacl::cuda_arg(temp),
+ static_cast<unsigned int>(viennacl::traits::start(temp)),
+ static_cast<unsigned int>(viennacl::traits::stride(temp)),
+ static_cast<unsigned int>(viennacl::traits::size(temp)),
+ static_cast<unsigned int>(option),
+ viennacl::cuda_arg(result) );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
+ }
+ };
+
+ // Compile-time dispatch: signed-integer launcher by default, with explicit
+ // specializations for unsigned and floating-point element types.
+ template<typename NumericT>
+ struct vector_sum_kernel_launcher : public vector_sum_kernel_launcher_integers {};
+
+ template<>
+ struct vector_sum_kernel_launcher<unsigned char> : public vector_sum_kernel_launcher_unsigned_integers {};
+
+ template<>
+ struct vector_sum_kernel_launcher<unsigned short> : public vector_sum_kernel_launcher_unsigned_integers {};
+
+ template<>
+ struct vector_sum_kernel_launcher<unsigned int> : public vector_sum_kernel_launcher_unsigned_integers {};
+
+ template<>
+ struct vector_sum_kernel_launcher<unsigned long> : public vector_sum_kernel_launcher_unsigned_integers {};
+
+ template<>
+ struct vector_sum_kernel_launcher<float> : public vector_sum_kernel_launcher_floats {};
+
+ template<>
+ struct vector_sum_kernel_launcher<double> : public vector_sum_kernel_launcher_floats {};
+
+ /** \endcond */
+}
+
+
+//implementation of inner product:
+//namespace {
+/** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
+*
+* @param vec1 The first vector
+* @param vec2 The second vector
+* @param result The result scalar (on the gpu)
+*/
+/** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
+*
+* Two stages: inner_prod_kernel produces one partial sum per block (128 blocks),
+* then the type-dispatched vector_sum_kernel launcher (option 1 = sum) reduces
+* the partials into 'result' on the device.
+*
+* @param vec1 The first vector
+* @param vec2 The second vector
+* @param result The result scalar (on the gpu)
+*/
+template<typename NumericT, typename ScalarT>
+void inner_prod_impl(vector_base<NumericT> const & vec1,
+ vector_base<NumericT> const & vec2,
+ ScalarT & result)
+{
+ typedef NumericT value_type;
+
+ static const unsigned int work_groups = 128;
+ // NOTE(review): 'temp' is a function-local static, so this scratch buffer is
+ // shared across all calls (and threads) for a given value_type — presumably
+ // not thread-safe and tied to one CUDA context; verify against callers.
+ static viennacl::vector<value_type> temp(work_groups);
+
+ inner_prod_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(vec2),
+ static_cast<unsigned int>(viennacl::traits::start(vec2)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec2)),
+ static_cast<unsigned int>(viennacl::traits::size(vec2)),
+ viennacl::cuda_arg(temp)
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");
+
+ // Stage 2: sum the 128 per-block partials into 'result' (option 1 = plain sum).
+ detail::vector_sum_kernel_launcher<NumericT>::apply(temp, 1, result);
+}
+
+
+/** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
+*
+* @param vec1 The first vector
+* @param vec2 The second vector
+* @param result The result scalar (on the host)
+*/
+/** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
+*
+* Same stage-1 kernel as inner_prod_impl, but the 128 per-block partial sums
+* are copied to the host and summed there, writing the scalar into 'result'.
+*
+* @param vec1 The first vector
+* @param vec2 The second vector
+* @param result The result scalar (on the host)
+*/
+template<typename NumericT>
+void inner_prod_cpu(vector_base<NumericT> const & vec1,
+ vector_base<NumericT> const & vec2,
+ NumericT & result)
+{
+ typedef NumericT value_type;
+
+ const unsigned int work_groups = 128;
+ viennacl::vector<value_type> temp(work_groups); // one partial sum per block
+
+ inner_prod_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+ static_cast<unsigned int>(viennacl::traits::start(vec1)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+ static_cast<unsigned int>(viennacl::traits::size(vec1)),
+ viennacl::cuda_arg(vec2),
+ static_cast<unsigned int>(viennacl::traits::start(vec2)),
+ static_cast<unsigned int>(viennacl::traits::stride(vec2)),
+ static_cast<unsigned int>(viennacl::traits::size(vec2)),
+ viennacl::cuda_arg(temp)
+ );
+ VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");
+
+ // Now copy partial results from GPU back to CPU and run reduction there:
+ std::vector<value_type> temp_cpu(work_groups);
+ viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
+
+ result = 0;
+ for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
+ result += *it;
+}
+
+///////////////////////////////////
+
+#define VIENNACL_MDOT_WORKGROUP_SIZE 128
+#define VIENNACL_MDOT_WORKGROUP_NUM 128
+// M = 2:
+// M = 2:
+// Simultaneous inner products of x with two vectors y0, y1: each x element is
+// loaded once and multiplied into both running sums. Shared buffer holds
+// 2 x blockDim.x partials; results land in group_results as two segments of
+// gridDim.x partial sums each. Assumes blockDim.x <= VIENNACL_MDOT_WORKGROUP_SIZE
+// and a power of two.
+template<typename NumericT>
+__global__ void inner_prod_2_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
+ const NumericT *y0, unsigned int start0, unsigned int stride0,
+ const NumericT *y1, unsigned int start1, unsigned int stride1,
+ NumericT *group_results)
+{
+ __shared__ NumericT tmp_buffer[2*VIENNACL_MDOT_WORKGROUP_SIZE];
+ unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
+ unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
+ unsigned int vec_stop_index = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond size of x
+
+ NumericT entry_x = 0;
+ NumericT group_sum0 = 0;
+ NumericT group_sum1 = 0;
+ for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
+ entry_x = x[i * stridex + startx]; // load only once from global memory!
+ group_sum0 += entry_x * y0[i * stride0 + start0];
+ group_sum1 += entry_x * y1[i * stride1 + start1];
+ }
+ tmp_buffer[threadIdx.x] = group_sum0;
+ tmp_buffer[threadIdx.x + blockDim.x] = group_sum1;
+
+ // parallel reduction
+ for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (threadIdx.x < stride) {
+ tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
+ tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x];
+ }
+ }
+
+ // write result of group to group_results
+ if (threadIdx.x == 0) {
+ group_results[blockIdx.x] = tmp_buffer[0];
+ group_results[blockIdx.x + gridDim.x] = tmp_buffer[blockDim.x];
+ }
+}
+
+// M = 3:
+template<typename NumericT>
+__global__ void inner_prod_3_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
+ const NumericT *y0, unsigned int start0, unsigned int stride0,
+ const NumericT *y1, unsigned int start1, unsigned int stride1,
+ const NumericT *y2, unsigned int start2, unsigned int stride2,
+ NumericT *group_results)
+{
+ __shared__ NumericT tmp_buffer[3*VIENNACL_MDOT_WORKGROUP_SIZE];
+ unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
+ unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
+ unsigned int vec_stop_index = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size
+
+ NumericT entry_x = 0;
+ NumericT group_sum0 = 0;
+ NumericT group_sum1 = 0;
+ NumericT group_sum2 = 0;
+ for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
+ entry_x = x[i * stridex + startx]; // load only once from global memory!
+ group_sum0 += entry_x * y0[i * stride0 + start0];
+ group_sum1 += entry_x * y1[i * stride1 + start1];
+ group_sum2 += entry_x * y2[i * stride2 + start2];
+ }
+ tmp_buffer[threadIdx.x] = group_sum0;
+ tmp_buffer[threadIdx.x + blockDim.x] = group_sum1;
+ tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
+
+ // parallel reduction
+ for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (threadIdx.x < stride) {
+ tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
+ tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x];
+ tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
+ }
+ }
+
+ // write result of group to group_results
+ if (threadIdx.x == 0) {
+ group_results[blockIdx.x ] = tmp_buffer[0];
+ group_results[blockIdx.x + gridDim.x] = tmp_buffer[ blockDim.x];
+ group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
+ }
+}
+
+// M = 4:
+template<typename NumericT>
+__global__ void inner_prod_4_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
+ const NumericT *y0, unsigned int start0, unsigned int stride0,
+ const NumericT *y1, unsigned int start1, unsigned int stride1,
+ const NumericT *y2, unsigned int start2, unsigned int stride2,
+ const NumericT *y3, unsigned int start3, unsigned int stride3,
+ NumericT *group_results)
+{
+ __shared__ NumericT tmp_buffer[4*VIENNACL_MDOT_WORKGROUP_SIZE];
+ unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
+ unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
+ unsigned int vec_stop_index = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size
+
+ NumericT entry_x = 0;
+ NumericT group_sum0 = 0;
+ NumericT group_sum1 = 0;
+ NumericT group_sum2 = 0;
+ NumericT group_sum3 = 0;
+ for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
+ entry_x = x[i * stridex + startx]; // load only once from global memory!
+ group_sum0 += entry_x * y0[i * stride0 + start0];
+ group_sum1 += entry_x * y1[i * stride1 + start1];
+ group_sum2 += entry_x * y2[i * stride2 + start2];
+ group_sum3 += entry_x * y3[i * stride3 + start3];
+ }
+ tmp_buffer[threadIdx.x] = group_sum0;
+ tmp_buffer[threadIdx.x + blockDim.x] = group_sum1;
+ tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
+ tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3;
+
+ // parallel reduction
+ for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
+ __syncthreads();
+ if (threadIdx.x < stride) {
+ tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
+ tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x];
+ tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
+ tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x];
+ }
+ }
+
+ // write result of group to group_results
+ if (threadIdx.x == 0) {
+ group_results[blockIdx.x ] = tmp_buffer[0];
+ group_results[blockIdx.x + gridDim.x] = tmp_buffer[ blockDim.x];
+ group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
+ group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x];
+ }
+}
+
+// M = 8:
+/** @brief Computes per-block partial inner products of x with eight vectors y0..y7 simultaneously.
+*
+* Block b accumulates over its contiguous chunk of [0, sizex) and writes its eight partial
+* sums to group_results[b + k * gridDim.x] for k = 0..7; a follow-up reduction combines them.
+* Expects blockDim.x <= VIENNACL_MDOT_WORKGROUP_SIZE (shared-buffer size) and a power-of-two
+* blockDim.x (required by the tree reduction below).
+*/
+template<typename NumericT>
+__global__ void inner_prod_8_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
+                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
+                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
+                                    const NumericT *y2, unsigned int start2, unsigned int stride2,
+                                    const NumericT *y3, unsigned int start3, unsigned int stride3,
+                                    const NumericT *y4, unsigned int start4, unsigned int stride4,
+                                    const NumericT *y5, unsigned int start5, unsigned int stride5,
+                                    const NumericT *y6, unsigned int start6, unsigned int stride6,
+                                    const NumericT *y7, unsigned int start7, unsigned int stride7,
+                                    NumericT *group_results)
+{
+  __shared__ NumericT tmp_buffer[8 * VIENNACL_MDOT_WORKGROUP_SIZE];
+
+  // Gather the eight right-hand-side vectors so the accumulation can be an unrolled loop:
+  const NumericT * ys[8]      = { y0, y1, y2, y3, y4, y5, y6, y7 };
+  unsigned int     starts[8]  = { start0, start1, start2, start3, start4, start5, start6, start7 };
+  unsigned int     strides[8] = { stride0, stride1, stride2, stride3, stride4, stride5, stride6, stride7 };
+
+  // Partition [0, sizex) into one contiguous chunk per block:
+  unsigned int work_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
+  unsigned int chunk_begin     = blockIdx.x * blockDim.x * work_per_thread;
+  unsigned int chunk_end       = min((blockIdx.x + 1) * blockDim.x * work_per_thread, sizex); // clamp to vector size
+
+  NumericT thread_sums[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+  for (unsigned int i = chunk_begin + threadIdx.x; i < chunk_end; i += blockDim.x)
+  {
+    NumericT x_value = x[i * stridex + startx]; // load x only once per index
+#pragma unroll
+    for (unsigned int k = 0; k < 8; ++k)
+      thread_sums[k] += x_value * ys[k][i * strides[k] + starts[k]];
+  }
+
+  // One shared-memory segment of blockDim.x entries per inner product:
+#pragma unroll
+  for (unsigned int k = 0; k < 8; ++k)
+    tmp_buffer[threadIdx.x + k * blockDim.x] = thread_sums[k];
+
+  // Tree reduction of each segment (blockDim.x assumed to be a power of two):
+  for (unsigned int half = blockDim.x / 2; half > 0; half /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < half)
+    {
+#pragma unroll
+      for (unsigned int k = 0; k < 8; ++k)
+        tmp_buffer[threadIdx.x + k * blockDim.x] += tmp_buffer[threadIdx.x + half + k * blockDim.x];
+    }
+  }
+
+  // Thread 0 publishes this block's eight partial results:
+  if (threadIdx.x == 0)
+  {
+#pragma unroll
+    for (unsigned int k = 0; k < 8; ++k)
+      group_results[blockIdx.x + k * gridDim.x] = tmp_buffer[k * blockDim.x];
+  }
+}
+
+/** @brief Sums consecutive segments of VIENNACL_MDOT_WORKGROUP_SIZE entries of 'vec1'.
+*
+* Block b reduces entries [b * VIENNACL_MDOT_WORKGROUP_SIZE, (b+1) * VIENNACL_MDOT_WORKGROUP_SIZE)
+* and writes the segment sum to result[start_result + b * inc_result].
+* Presumably launched with blockDim.x == VIENNACL_MDOT_WORKGROUP_SIZE (one load per thread);
+* the tree reduction additionally requires a power-of-two blockDim.x.
+*/
+template<typename NumericT>
+__global__ void vector_multi_sum_kernel(
+          NumericT const * vec1,
+          NumericT * result,
+          unsigned int start_result,
+          unsigned int inc_result)
+{
+  __shared__ NumericT tmp_buffer[VIENNACL_MDOT_WORKGROUP_SIZE];
+
+  // Each thread stages one entry of this block's segment into shared memory:
+  unsigned int segment_offset = blockIdx.x * VIENNACL_MDOT_WORKGROUP_SIZE;
+  tmp_buffer[threadIdx.x] = vec1[segment_offset + threadIdx.x];
+
+  // Tree reduction in shared memory:
+  unsigned int active = blockDim.x / 2;
+  while (active > 0)
+  {
+    __syncthreads();
+    if (threadIdx.x < active)
+      tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + active];
+    active /= 2;
+  }
+
+  // First thread writes the segment's total:
+  if (threadIdx.x == 0)
+    result[start_result + blockIdx.x * inc_result] = tmp_buffer[0];
+}
+
+template<typename NumericT>
+void inner_prod_impl(vector_base<NumericT> const & x,
+ vector_tuple<NumericT> const & vec_tuple,
+ vector_base<NumericT> & result)
+{
+ typedef NumericT value_type;
+
+ static viennacl::vector<value_type> temp(8 * VIENNACL_MDOT_WORKGROUP_NUM);
+
+ vcl_size_t current_index = 0;
+ while (vec_tuple.const_size() > current_index)
+ {
+ switch (vec_tuple.const_size() - current_index)
+ {
+ case 7:
+ case 6:
+ case 5:
+ case 4:
+ {
+ vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index);
+ vector_base<NumericT> const & y1 = vec_tuple.const_at(current_index + 1);
+ vector_base<NumericT> const & y2 = vec_tuple.const_at(current_index + 2);
+ vector_base<NumericT> const & y3 = vec_tuple.const_at(current_index + 3);
+
+ inner_prod_4_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM,
+ VIENNACL_MDOT_WORKGROUP_SIZE>>>( viennacl::cuda_arg(x),
+ static_cast<unsigned int>(viennacl::traits::start(x)),
+ static_cast<unsigned int>(viennacl::traits::stride(x)),
+ static_cast<unsigned int>(viennacl::traits::size(x)),
+ viennacl::cuda_arg(y0),
+ static_cast<unsigned int>(viennacl::traits::start(y0)),
+ static_cast<unsigned int>(viennacl::traits::stride(y0)),
+ viennacl::cuda_arg(y1),
+ static_cast<unsigned int>(viennacl::traits::start(y1)),
+ static_cast<unsigned int>(viennacl::traits::stride(y1)),
+ viennacl::cuda_arg(y2),
+ static_cast<unsigned int>(vienna
<TRUNCATED>