Posted to commits@mahout.apache.org by ap...@apache.org on 2016/06/08 21:40:25 UTC

[28/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/vector_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/vector_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/vector_operations.hpp
new file mode 100644
index 0000000..b7eaeb4
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/vector_operations.hpp
@@ -0,0 +1,3252 @@
+#ifndef VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/cuda/vector_operations.hpp
+    @brief Implementations of vector operations using CUDA
+*/
+
+#include <cmath>
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/stride.hpp"
+
+#include "viennacl/linalg/cuda/common.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
+//
+// Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here!
+//
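+// Convention used throughout this file: the 'options' bitmask produced by
+// detail::make_options(len, reciprocal, flip_sign) is interpreted by the kernels as
+//   bit 0 set  ->  negate the scalar (flip_sign)
+//   bit 1 set  ->  divide by the scalar instead of multiplying with it (reciprocal)
+//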
+template<typename DestNumericT, typename SrcNumericT>
+__global__ void convert_kernel(DestNumericT      * dest, unsigned int start_dest, unsigned int inc_dest, unsigned int size_dest,
+                               SrcNumericT const * src,  unsigned int start_src,  unsigned int inc_src)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                    i < size_dest;
+                    i += gridDim.x * blockDim.x)
+    dest[i*inc_dest+start_dest] = src[i*inc_src+start_src];
+}
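+// Note: this grid-stride loop, combined with the fixed <<<128, 128>>> launches in the
+// host wrappers below, is the pattern shared by all kernels in this file: each of the
+// 128*128 = 16384 threads handles the elements i, i + 16384, i + 2*16384, ...,
+// so arbitrary vector sizes are covered.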
+
+
+template<typename DestNumericT, typename SrcNumericT>
+void convert(vector_base<DestNumericT> & dest, vector_base<SrcNumericT> const & src)
+{
+  convert_kernel<<<128, 128>>>(viennacl::cuda_arg(dest),
+                              static_cast<unsigned int>(viennacl::traits::start(dest)),
+                              static_cast<unsigned int>(viennacl::traits::stride(dest)),
+                              static_cast<unsigned int>(viennacl::traits::size(dest)),
+
+                              viennacl::cuda_arg(src),
+                              static_cast<unsigned int>(viennacl::traits::start(src)),
+                              static_cast<unsigned int>(viennacl::traits::stride(src)) );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("convert_kernel");
+}
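+
+// Usage sketch (direct backend call; in practice this is reached through the dispatcher
+// frontend mentioned above):
+//   viennacl::vector<float>  vf(n);
+//   viennacl::vector<double> vd(n);
+//   viennacl::linalg::cuda::convert(vd, vf);   // element-wise copy with float -> double conversion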
+
+
+//////////////////////// av /////////////////////////////
+
+// gpu scalar
+template<typename NumericT>
+__global__ void av_kernel(NumericT * vec1,
+                          unsigned int start1,
+                          unsigned int inc1,
+                          unsigned int size1,
+
+                          const NumericT * fac2,
+                          unsigned int options2,
+                          const NumericT * vec2,
+                          unsigned int start2,
+                          unsigned int inc2)
+{
+  NumericT alpha = *fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  if (options2 & (1 << 1))
+  {
+    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                      i < size1;
+                      i += gridDim.x * blockDim.x)
+      vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha;
+  }
+  else
+  {
+    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                      i < size1;
+                      i += gridDim.x * blockDim.x)
+      vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha;
+  }
+}
+
+// cpu scalar
+template<typename NumericT>
+__global__ void av_kernel(NumericT * vec1,
+                          unsigned int start1,
+                          unsigned int inc1,
+                          unsigned int size1,
+
+                          NumericT fac2,
+                          unsigned int options2,
+                          const NumericT * vec2,
+                          unsigned int start2,
+                          unsigned int inc2)
+{
+  NumericT alpha = fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  if (options2 & (1 << 1))
+  {
+    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                      i < size1;
+                      i += gridDim.x * blockDim.x)
+      vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha;
+  }
+  else
+  {
+    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                      i < size1;
+                      i += gridDim.x * blockDim.x)
+      vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha;
+  }
+}
+
+
+
+template<typename NumericT, typename ScalarType1>
+void av(vector_base<NumericT> & vec1,
+        vector_base<NumericT> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
+{
+  typedef NumericT        value_type;
+
+  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+  value_type data_alpha = alpha;
+  if (flip_sign_alpha)
+    data_alpha = -data_alpha;
+  if (reciprocal_alpha)
+    data_alpha = static_cast<value_type>(1) / data_alpha;
+
+  value_type temporary_alpha = 0;
+  if (viennacl::is_cpu_scalar<ScalarType1>::value)
+    temporary_alpha = alpha;
+
+  av_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                          static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                          static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                          static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+                          viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                          options_alpha,
+                          viennacl::cuda_arg(vec2),
+                          static_cast<unsigned int>(viennacl::traits::start(vec2)),
+                          static_cast<unsigned int>(viennacl::traits::stride(vec2)) );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
+}
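+
+// Usage sketch (direct backend call), computing y = x / alpha:
+//   viennacl::vector<float> x(n), y(n);
+//   float alpha = 2.0f;
+//   viennacl::linalg::cuda::av(y, x, alpha, 1, /*reciprocal_alpha=*/true, /*flip_sign_alpha=*/false);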
+
+
+///////////////////// avbv //////////////////////////////////
+
+// alpha and beta on GPU
+template<typename NumericT>
+__global__ void avbv_kernel(NumericT * vec1,
+                            unsigned int start1,
+                            unsigned int inc1,
+                            unsigned int size1,
+
+                            const NumericT * fac2,
+                            unsigned int options2,
+                            const NumericT * vec2,
+                            unsigned int start2,
+                            unsigned int inc2,
+
+                            const NumericT * fac3,
+                            unsigned int options3,
+                            const NumericT * vec3,
+                            unsigned int start3,
+                            unsigned int inc3)
+{
+  NumericT alpha = *fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = *fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+}
+
+// alpha on CPU, beta on GPU
+template<typename NumericT>
+__global__ void avbv_kernel(NumericT * vec1,
+                            unsigned int start1,
+                            unsigned int inc1,
+                            unsigned int size1,
+
+                            NumericT fac2,
+                            unsigned int options2,
+                            const NumericT * vec2,
+                            unsigned int start2,
+                            unsigned int inc2,
+
+                            const NumericT * fac3,
+                            unsigned int options3,
+                            const NumericT * vec3,
+                            unsigned int start3,
+                            unsigned int inc3)
+{
+  NumericT alpha = fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = *fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+}
+
+// alpha on GPU, beta on CPU
+template<typename NumericT>
+__global__ void avbv_kernel(NumericT * vec1,
+                            unsigned int start1,
+                            unsigned int inc1,
+                            unsigned int size1,
+
+                            const NumericT * fac2,
+                            unsigned int options2,
+                            const NumericT * vec2,
+                            unsigned int start2,
+                            unsigned int inc2,
+
+                            NumericT fac3,
+                            unsigned int options3,
+                            const NumericT * vec3,
+                            unsigned int start3,
+                            unsigned int inc3)
+{
+  NumericT alpha = *fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+}
+
+// alpha and beta on CPU
+template<typename NumericT>
+__global__ void avbv_kernel(NumericT * vec1,
+                            unsigned int start1,
+                            unsigned int inc1,
+                            unsigned int size1,
+
+                            NumericT fac2,
+                            unsigned int options2,
+                            const NumericT * vec2,
+                            unsigned int start2,
+                            unsigned int inc2,
+
+                            NumericT fac3,
+                            unsigned int options3,
+                            const NumericT * vec3,
+                            unsigned int start3,
+                            unsigned int inc3)
+{
+  NumericT alpha = fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+}
+
+
+
+
+template<typename NumericT, typename ScalarT1, typename ScalarT2>
+void avbv(vector_base<NumericT> & vec1,
+          vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+          vector_base<NumericT> const & vec3, ScalarT2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+{
+  typedef NumericT        value_type;
+
+  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+  value_type data_alpha = alpha;
+  if (flip_sign_alpha)
+    data_alpha = -data_alpha;
+  if (reciprocal_alpha)
+    data_alpha = static_cast<value_type>(1) / data_alpha;
+
+  value_type temporary_alpha = 0;
+  if (viennacl::is_cpu_scalar<ScalarT1>::value)
+    temporary_alpha = alpha;
+
+  unsigned int options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
+
+  value_type temporary_beta = 0;
+  if (viennacl::is_cpu_scalar<ScalarT2>::value)
+    temporary_beta = beta;
+
+
+  avbv_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                            static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                            static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                            static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+                            viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                            options_alpha,
+                            viennacl::cuda_arg(vec2),
+                            static_cast<unsigned int>(viennacl::traits::start(vec2)),
+                            static_cast<unsigned int>(viennacl::traits::stride(vec2)),
+
+                            viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+                            options_beta,
+                            viennacl::cuda_arg(vec3),
+                            static_cast<unsigned int>(viennacl::traits::start(vec3)),
+                            static_cast<unsigned int>(viennacl::traits::stride(vec3)) );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("avbv_kernel");
+}
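+
+// Usage sketch (direct backend call) with viennacl::vector<float> y, x2, x3 and float alpha, beta,
+// computing y = alpha * x2 + beta * x3:
+//   viennacl::linalg::cuda::avbv(y, x2, alpha, 1, false, false,
+//                                   x3, beta,  1, false, false);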
+
+
+////////////////////////// avbv_v //////////////////////////////////////
+
+
+// alpha and beta on GPU
+template<typename NumericT>
+__global__ void avbv_v_kernel(NumericT * vec1,
+                              unsigned int start1,
+                              unsigned int inc1,
+                              unsigned int size1,
+
+                              const NumericT * fac2,
+                              unsigned int options2,
+                              const NumericT * vec2,
+                              unsigned int start2,
+                              unsigned int inc2,
+
+                              const NumericT * fac3,
+                              unsigned int options3,
+                              const NumericT * vec3,
+                              unsigned int start3,
+                              unsigned int inc3)
+{
+  NumericT alpha = *fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = *fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+}
+
+// alpha on CPU, beta on GPU
+template<typename NumericT>
+__global__ void avbv_v_kernel(NumericT * vec1,
+                              unsigned int start1,
+                              unsigned int inc1,
+                              unsigned int size1,
+
+                              NumericT fac2,
+                              unsigned int options2,
+                              const NumericT * vec2,
+                              unsigned int start2,
+                              unsigned int inc2,
+
+                              const NumericT * fac3,
+                              unsigned int options3,
+                              const NumericT * vec3,
+                              unsigned int start3,
+                              unsigned int inc3)
+{
+  NumericT alpha = fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = *fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+}
+
+// alpha on GPU, beta on CPU
+template<typename NumericT>
+__global__ void avbv_v_kernel(NumericT * vec1,
+                              unsigned int start1,
+                              unsigned int inc1,
+                              unsigned int size1,
+
+                              const NumericT * fac2,
+                              unsigned int options2,
+                              const NumericT * vec2,
+                              unsigned int start2,
+                              unsigned int inc2,
+
+                              NumericT fac3,
+                              unsigned int options3,
+                              const NumericT * vec3,
+                              unsigned int start3,
+                              unsigned int inc3)
+{
+  NumericT alpha = *fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+}
+
+// alpha and beta on CPU
+template<typename NumericT>
+__global__ void avbv_v_kernel(NumericT * vec1,
+                              unsigned int start1,
+                              unsigned int inc1,
+                              unsigned int size1,
+
+                              NumericT fac2,
+                              unsigned int options2,
+                              const NumericT * vec2,
+                              unsigned int start2,
+                              unsigned int inc2,
+
+                              NumericT fac3,
+                              unsigned int options3,
+                              const NumericT * vec3,
+                              unsigned int start3,
+                              unsigned int inc3)
+{
+  NumericT alpha = fac2;
+  if (options2 & (1 << 0))
+    alpha = -alpha;
+
+  NumericT beta = fac3;
+  if (options3 & (1 << 0))
+    beta = -beta;
+
+  if (options2 & (1 << 1))
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+  else
+  {
+    if (options3 & (1 << 1))
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
+    }
+    else
+    {
+      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                        i < size1;
+                        i += gridDim.x * blockDim.x)
+        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
+    }
+  }
+}
+
+
+template<typename NumericT, typename ScalarT1, typename ScalarT2>
+void avbv_v(vector_base<NumericT> & vec1,
+            vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+            vector_base<NumericT> const & vec3, ScalarT2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+{
+  typedef NumericT        value_type;
+
+  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
+
+  value_type data_alpha = alpha;
+  if (flip_sign_alpha)
+    data_alpha = -data_alpha;
+  if (reciprocal_alpha)
+    data_alpha = static_cast<value_type>(1) / data_alpha;
+
+  value_type temporary_alpha = 0;
+  if (viennacl::is_cpu_scalar<ScalarT1>::value)
+    temporary_alpha = alpha;
+
+  unsigned int options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
+
+  value_type temporary_beta = 0;
+  if (viennacl::is_cpu_scalar<ScalarT2>::value)
+    temporary_beta = beta;
+
+
+  avbv_v_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                              static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                              static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                              static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+                              viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
+                              options_alpha,
+                              viennacl::cuda_arg(vec2),
+                              static_cast<unsigned int>(viennacl::traits::start(vec2)),
+                              static_cast<unsigned int>(viennacl::traits::stride(vec2)),
+
+                              viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
+                              options_beta,
+                              viennacl::cuda_arg(vec3),
+                              static_cast<unsigned int>(viennacl::traits::start(vec3)),
+                              static_cast<unsigned int>(viennacl::traits::stride(vec3)) );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("avbv_v_kernel");
+}
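+
+// Same calling convention as avbv() above, but accumulating into the first argument,
+// i.e. y += alpha * x2 + beta * x3:
+//   viennacl::linalg::cuda::avbv_v(y, x2, alpha, 1, false, false,
+//                                     x3, beta,  1, false, false);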
+
+
+//////////////////////////
+
+template<typename NumericT>
+__global__ void vector_assign_kernel(NumericT * vec1,
+                                     unsigned int start1,
+                                     unsigned int inc1,
+                                     unsigned int size1,
+                                     unsigned int internal_size1,
+
+                                     NumericT alpha)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                    i < internal_size1;
+                    i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = (i < size1) ? alpha : 0;
+}
+
+/** @brief Assign a constant value to a vector (-range/-slice)
+*
+* @param vec1   The vector to which the value should be assigned
+* @param alpha  The value to be assigned
+* @param up_to_internal_size  Specifies whether alpha should also be written to padded memory (mostly used for clearing the whole buffer).
+*/
+template<typename NumericT, typename ScalarT1>
+void vector_assign(vector_base<NumericT> & vec1, ScalarT1 const & alpha, bool up_to_internal_size = false)
+{
+  typedef NumericT        value_type;
+
+  value_type temporary_alpha = 0;
+  if (viennacl::is_cpu_scalar<ScalarT1>::value)
+    temporary_alpha = alpha;
+
+  unsigned int size = up_to_internal_size ? static_cast<unsigned int>(vec1.internal_size()) : static_cast<unsigned int>(viennacl::traits::size(vec1));
+
+  vector_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                     static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                     static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                     size,
+                                     static_cast<unsigned int>(vec1.internal_size()),  //Note: Do NOT use traits::internal_size() here, because vector proxies don't require padding.
+
+                                     viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)) );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_assign_kernel");
+}
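+
+// Usage sketch: clear a vector including its padded memory region
+//   viennacl::vector<float> v(n);
+//   viennacl::linalg::cuda::vector_assign(v, 0.0f, /*up_to_internal_size=*/true);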
+
+//////////////////////////
+
+template<typename NumericT>
+__global__ void vector_swap_kernel(NumericT * vec1,
+                                   unsigned int start1,
+                                   unsigned int inc1,
+                                   unsigned int size1,
+
+                                   NumericT * vec2,
+                                   unsigned int start2,
+                                   unsigned int inc2)
+{
+  NumericT tmp;
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                    i < size1;
+                    i += gridDim.x * blockDim.x)
+  {
+    tmp = vec2[i*inc2+start2];
+    vec2[i*inc2+start2] = vec1[i*inc1+start1];
+    vec1[i*inc1+start1] = tmp;
+  }
+}
+
+
+/** @brief Swaps the contents of two vectors; the data is copied
+*
+* @param vec1   The first vector (or -range, or -slice)
+* @param vec2   The second vector (or -range, or -slice)
+*/
+template<typename NumericT>
+void vector_swap(vector_base<NumericT> & vec1, vector_base<NumericT> & vec2)
+{
+  vector_swap_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                   static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                   static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                   static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+                                   viennacl::cuda_arg(vec2),
+                                   static_cast<unsigned int>(viennacl::traits::start(vec2)),
+                                   static_cast<unsigned int>(viennacl::traits::stride(vec2)) );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_swap_kernel");
+}
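+
+// Usage sketch: exchange the contents of two vectors of equal size
+//   viennacl::linalg::cuda::vector_swap(v1, v2);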
+
+///////////////////////// Binary Elementwise operations /////////////
+
+template<typename NumericT>
+__global__ void element_op_kernel(NumericT * vec1,
+                                   unsigned int start1,
+                                   unsigned int inc1,
+                                   unsigned int size1,
+
+                                   NumericT const * vec2,
+                                   unsigned int start2,
+                                   unsigned int inc2,
+
+                                   NumericT const * vec3,
+                                   unsigned int start3,
+                                   unsigned int inc3,
+
+                                   unsigned int op_type
+                                 )
+{
+  if (op_type == 2)
+  {
+    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                      i < size1;
+                      i += gridDim.x * blockDim.x)
+    {
+      vec1[i*inc1+start1] = pow(vec2[i*inc2+start2], vec3[i*inc3+start3]);
+    }
+  }
+  else if (op_type == 1)
+  {
+    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                      i < size1;
+                      i += gridDim.x * blockDim.x)
+    {
+      vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3];
+    }
+  }
+  else if (op_type == 0)
+  {
+    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                      i < size1;
+                      i += gridDim.x * blockDim.x)
+    {
+      vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3];
+    }
+  }
+}
+
+template<typename NumericT>
+__global__ void element_op_int_kernel(NumericT * vec1,
+                                   unsigned int start1,
+                                   unsigned int inc1,
+                                   unsigned int size1,
+
+                                   NumericT const * vec2,
+                                   unsigned int start2,
+                                   unsigned int inc2,
+
+                                   NumericT const * vec3,
+                                   unsigned int start3,
+                                   unsigned int inc3,
+
+                                   unsigned int op_type
+                                 )
+{
+  if (op_type == 1)
+  {
+    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                      i < size1;
+                      i += gridDim.x * blockDim.x)
+    {
+      vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3];
+    }
+  }
+  else if (op_type == 0)
+  {
+    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+                      i < size1;
+                      i += gridDim.x * blockDim.x)
+    {
+      vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3];
+    }
+  }
+}
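+
+// Note: in contrast to element_op_kernel above, the integer variant has no power branch
+// (op_type == 2), presumably because CUDA's pow() provides no suitable integer overload;
+// only element-wise product and division are handled here.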
+
+/** @brief Implementation of the element-wise operation v1 = v2 .* v3 and v1 = v2 ./ v3    (using MATLAB syntax)
+*
+* @param vec1   The result vector (or -range, or -slice)
+* @param proxy  The proxy object holding v2, v3 and the operation
+*/
+template<typename NumericT, typename OpT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_binary<OpT> > const & proxy)
+{
+  unsigned int op_type = 2; //0: product, 1: division, 2: power
+  if (viennacl::is_division<OpT>::value)
+    op_type = 1;
+  else if (viennacl::is_product<OpT>::value)
+    op_type = 0;
+
+  element_op_int_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                  static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                  static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+                                  viennacl::cuda_arg(proxy.lhs()),
+                                  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
+
+                                  viennacl::cuda_arg(proxy.rhs()),
+                                  static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
+                                  static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
+
+                                  op_type
+                                 );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_int_kernel");
+}
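+
+// These kernels are typically reached from user-level element-wise expressions
+// (assuming the usual ViennaCL free functions), e.g.
+//   v1 = viennacl::linalg::element_prod(v2, v3);   // v1 = v2 .* v3
+//   v1 = viennacl::linalg::element_div(v2, v3);    // v1 = v2 ./ v3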
+
+template<typename OpT>
+void element_op(vector_base<float> & vec1,
+                vector_expression<const vector_base<float>, const vector_base<float>, op_element_binary<OpT> > const & proxy)
+{
+  unsigned int op_type = 2; //0: product, 1: division, 2: power
+  if (viennacl::is_division<OpT>::value)
+    op_type = 1;
+  else if (viennacl::is_product<OpT>::value)
+    op_type = 0;
+
+  element_op_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                  static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                  static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+                                  viennacl::cuda_arg(proxy.lhs()),
+                                  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
+
+                                  viennacl::cuda_arg(proxy.rhs()),
+                                  static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
+                                  static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
+
+                                  op_type
+                                 );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
+}
+
+template<typename OpT>
+void element_op(vector_base<double> & vec1,
+                vector_expression<const vector_base<double>, const vector_base<double>, op_element_binary<OpT> > const & proxy)
+{
+  unsigned int op_type = 2; //0: product, 1: division, 2: power
+  if (viennacl::is_division<OpT>::value)
+    op_type = 1;
+  else if (viennacl::is_product<OpT>::value)
+    op_type = 0;
+
+  element_op_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                  static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                  static_cast<unsigned int>(viennacl::traits::size(vec1)),
+
+                                  viennacl::cuda_arg(proxy.lhs()),
+                                  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
+
+                                  viennacl::cuda_arg(proxy.rhs()),
+                                  static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
+                                  static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
+
+                                  op_type
+                                 );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
+}
+
+///////////////////////// Unary Elementwise operations /////////////
+
+// Note: Trying to automate things with macros or template metaprogramming failed (preprocessor with nvcc did not work as expected), so this is terribly hand-rolled code
+// Question (Karl Rupp): Why is CUDA code always such a hassle when trying to use it in a library context?
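+//
+// Each unary operation below follows the same hand-rolled pattern: a vec_element_<op>_kernel
+// that applies the corresponding CUDA math function element-wise, plus an element_op()
+// overload keyed on op_element_unary<op_xxx> that launches it. A user-level expression such as
+//   v1 = viennacl::linalg::element_exp(v2);   // assuming the usual element_* helpers
+// ends up in the matching overload.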
+
+// acos
+template<typename NumericT>
+__global__ void vec_element_acos_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = acos(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_acos> > const & proxy)
+{
+  typedef NumericT        value_type;
+
+  vec_element_acos_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_acos_kernel");
+}
+
+// asin
+template<typename NumericT>
+__global__ void vec_element_asin_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = asin(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_asin> > const & proxy)
+{
+  vec_element_asin_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_asin_kernel");
+}
+
+
+// atan
+template<typename NumericT>
+__global__ void vec_element_atan_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = atan(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_atan> > const & proxy)
+{
+  vec_element_atan_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_atan_kernel");
+}
+
+
+// ceil
+template<typename NumericT>
+__global__ void vec_element_ceil_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = ceil(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_ceil> > const & proxy)
+{
+  vec_element_ceil_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_ceil_kernel");
+}
+
+
+// cos
+template<typename NumericT>
+__global__ void vec_element_cos_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = cos(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_cos> > const & proxy)
+{
+  vec_element_cos_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_cos_kernel");
+}
+
+
+// cosh
+template<typename NumericT>
+__global__ void vec_element_cosh_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = cosh(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_cosh> > const & proxy)
+{
+  vec_element_cosh_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_cosh_kernel");
+}
+
+
+// exp
+template<typename NumericT>
+__global__ void vec_element_exp_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = exp(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_exp> > const & proxy)
+{
+  vec_element_exp_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_exp_kernel");
+}
+
+
+// fabs
+template<typename NumericT>
+__global__ void vec_element_fabs_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = fabs(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_fabs> > const & proxy)
+{
+  vec_element_fabs_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_fabs_kernel");
+}
+
+// abs
+template<typename NumericT>
+__global__ void vec_element_abs_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = abs(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_abs> > const & proxy)
+{
+  vec_element_abs_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                       static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                       static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                       static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                       viennacl::cuda_arg(proxy.lhs()),
+                                       static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                       static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                      );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_abs_kernel");
+}
+
+
+
+// floor
+template<typename NumericT>
+__global__ void vec_element_floor_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = floor(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_floor> > const & proxy)
+{
+  vec_element_floor_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_floor_kernel");
+}
+
+
+// log
+template<typename NumericT>
+__global__ void vec_element_log_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = log(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_log> > const & proxy)
+{
+  vec_element_log_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_log_kernel");
+}
+
+
+// log10
+template<typename NumericT>
+__global__ void vec_element_log10_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = log10(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_log10> > const & proxy)
+{
+  vec_element_log10_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_log10_kernel");
+}
+
+
+// sin
+template<typename NumericT>
+__global__ void vec_element_sin_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = sin(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_sin> > const & proxy)
+{
+  vec_element_sin_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sin_kernel");
+}
+
+
+// sinh
+template<typename NumericT>
+__global__ void vec_element_sinh_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = sinh(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_sinh> > const & proxy)
+{
+  vec_element_sinh_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sinh_kernel");
+}
+
+
+// sqrt
+template<typename NumericT>
+__global__ void vec_element_sqrt_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = sqrt(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_sqrt> > const & proxy)
+{
+  vec_element_sqrt_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sqrt_kernel");
+}
+
+
+// tan
+template<typename NumericT>
+__global__ void vec_element_tan_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = tan(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_tan> > const & proxy)
+{
+  vec_element_tan_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_tan_kernel");
+}
+
+
+// tanh
+template<typename NumericT>
+__global__ void vec_element_tanh_kernel(
+    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
+    NumericT const * vec2, unsigned int start2, unsigned int inc2)
+{
+  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
+    vec1[i*inc1+start1] = tanh(vec2[i*inc2+start2]);
+}
+
+template<typename NumericT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_tanh> > const & proxy)
+{
+  vec_element_tanh_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                        viennacl::cuda_arg(proxy.lhs()),
+                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
+                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
+                                       );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_tanh_kernel");
+}
+
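+// Usage sketch for the unary element-wise kernels above (host code, not part of
+// this header). The frontend names element_log(), element_sqrt(), ... follow the
+// ViennaCL manual and are assumptions here; they build the vector_expression
+// proxies that dispatch into the element_op() overloads defined above:
+//
+//   #include "viennacl/vector.hpp"
+//
+//   viennacl::vector<float> v(1000), w(1000);
+//   // ... fill v ...
+//   w = viennacl::linalg::element_sqrt(v);   // runs vec_element_sqrt_kernel
+//   w = viennacl::linalg::element_log(v);    // runs vec_element_log_kernel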
+
+
+///////////////////////// Norms and inner product ///////////////////
+
+
+template<typename NumericT>
+__global__ void inner_prod_kernel(const NumericT * vec1,
+                                  unsigned int start1,
+                                  unsigned int inc1,
+                                  unsigned int size1,
+                                  const NumericT * vec2,
+                                  unsigned int start2,
+                                  unsigned int inc2,
+                                  unsigned int size2,
+                                  NumericT * group_buffer)
+{
+  __shared__ NumericT tmp_buffer[128];
+  unsigned int group_start1 = (blockIdx.x * size1) / (gridDim.x) * inc1 + start1;
+  unsigned int group_start2 = (blockIdx.x * size2) / (gridDim.x) * inc2 + start2;
+
+  unsigned int group_size1 = ((blockIdx.x + 1) * size1) / (gridDim.x)
+                               - (  blockIdx.x * size1) / (gridDim.x);
+
+
+  NumericT tmp = 0;
+  for (unsigned int i = threadIdx.x; i < group_size1; i += blockDim.x)
+    tmp += vec1[i*inc1+group_start1] * vec2[i*inc2+group_start2];
+  tmp_buffer[threadIdx.x] = tmp;
+
+  // parallel reduction
+  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride)
+      tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
+  }
+
+  if (threadIdx.x == 0)
+    group_buffer[blockIdx.x] = tmp_buffer[0];
+
+}
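+
+// Note on inner_prod_kernel: each CUDA block reduces its slice of the two vectors
+// to a single partial result in group_buffer[blockIdx.x]; a second, single-block
+// kernel (one of the vector_sum_kernel_* variants below) then folds those partial
+// results into the final scalar. See inner_prod_impl() further down for the pair
+// of launches.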
+
+
+
+// Reduces the array 'vec1' (maximum of absolute values, plain sum, or sum followed by
+// sqrt, depending on 'option') and writes the result to 'result'. Uses a single work-group only.
+template<typename NumericT>
+__global__ void vector_sum_kernel_floats(
+          const NumericT * vec1,
+          unsigned int start1,
+          unsigned int inc1,
+          unsigned int size1,
+          unsigned int option, // 0: maximum of absolute values (via fmax/fabs), 1: plain sum, 2: sum, then return sqrt of the sum
+          NumericT * result)
+{
+  __shared__ NumericT tmp_buffer[128];
+  NumericT thread_sum = 0;
+  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
+  {
+    if (option > 0)
+      thread_sum += vec1[i*inc1+start1];
+    else
+      thread_sum = fmax(thread_sum, fabs(vec1[i*inc1+start1]));
+  }
+
+  tmp_buffer[threadIdx.x] = thread_sum;
+
+  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride)
+    {
+      if (option > 0)
+        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
+      else
+        tmp_buffer[threadIdx.x] = fmax(tmp_buffer[threadIdx.x], tmp_buffer[threadIdx.x + stride]);
+    }
+  }
+
+  if (threadIdx.x == 0)
+  {
+    if (option == 2)
+      *result = sqrt(tmp_buffer[0]);
+    else
+      *result = tmp_buffer[0];
+  }
+}
+
+template<typename NumericT>
+__global__ void vector_sum_kernel_integers(
+          const NumericT * vec1,
+          unsigned int start1,
+          unsigned int inc1,
+          unsigned int size1,
+          unsigned int option, // 0: maximum of absolute values, 1: plain sum
+          NumericT * result)
+{
+  __shared__ NumericT tmp_buffer[128];
+  NumericT thread_sum = 0;
+  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
+  {
+    if (option > 0)
+      thread_sum += vec1[i*inc1+start1];
+    else
+      thread_sum = thread_sum > abs(vec1[i*inc1+start1]) ? thread_sum : abs(vec1[i*inc1+start1]);
+  }
+
+  tmp_buffer[threadIdx.x] = thread_sum;
+
+  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride)
+    {
+      if (option > 0)
+        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
+      else
+        tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride];
+    }
+  }
+
+  if (threadIdx.x == 0)
+    *result = tmp_buffer[0];
+}
+
+template<typename NumericT>
+__global__ void vector_sum_kernel_unsigned_integers(
+          const NumericT * vec1,
+          unsigned int start1,
+          unsigned int inc1,
+          unsigned int size1,
+          unsigned int option, // 0: plain maximum, 1: plain sum
+          NumericT * result)
+{
+  __shared__ NumericT tmp_buffer[128];
+  NumericT thread_sum = 0;
+  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
+  {
+    if (option > 0)
+      thread_sum += vec1[i*inc1+start1];
+    else
+      thread_sum = (thread_sum > vec1[i*inc1+start1]) ? thread_sum : vec1[i*inc1+start1];
+  }
+
+  tmp_buffer[threadIdx.x] = thread_sum;
+
+  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride)
+    {
+      if (option > 0)
+        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
+      else
+        tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride];
+    }
+  }
+
+  if (threadIdx.x == 0)
+    *result = tmp_buffer[0];
+}
+
+namespace detail
+{
+  /** \cond */
+  struct vector_sum_kernel_launcher_integers
+  {
+    template<typename NumericT, typename ScalarT>
+    static void apply(vector_base<NumericT> const & temp,
+                      unsigned int option,
+                      ScalarT & result)
+    {
+      typedef NumericT        value_type;
+      vector_sum_kernel_integers<<<1, 128>>>(viennacl::cuda_arg(temp),
+                                            static_cast<unsigned int>(viennacl::traits::start(temp)),
+                                            static_cast<unsigned int>(viennacl::traits::stride(temp)),
+                                            static_cast<unsigned int>(viennacl::traits::size(temp)),
+                                            static_cast<unsigned int>(option),
+                                            viennacl::cuda_arg(result) );
+      VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel_integers");
+    }
+  };
+
+  struct vector_sum_kernel_launcher_unsigned_integers
+  {
+    template<typename NumericT, typename ScalarT>
+    static void apply(vector_base<NumericT> const & temp,
+                      unsigned int option,
+                      ScalarT & result)
+    {
+      typedef NumericT        value_type;
+      vector_sum_kernel_unsigned_integers<<<1, 128>>>(viennacl::cuda_arg(temp),
+                                                      static_cast<unsigned int>(viennacl::traits::start(temp)),
+                                                      static_cast<unsigned int>(viennacl::traits::stride(temp)),
+                                                      static_cast<unsigned int>(viennacl::traits::size(temp)),
+                                                      static_cast<unsigned int>(option),
+                                                      viennacl::cuda_arg(result) );
+      VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel_unsigned_integers");
+    }
+  };
+
+  struct vector_sum_kernel_launcher_floats
+  {
+    template<typename NumericT, typename ScalarT>
+    static void apply(vector_base<NumericT> const & temp,
+                      unsigned int option,
+                      ScalarT & result)
+    {
+      typedef NumericT        value_type;
+      vector_sum_kernel_floats<<<1, 128>>>(viennacl::cuda_arg(temp),
+                                            static_cast<unsigned int>(viennacl::traits::start(temp)),
+                                            static_cast<unsigned int>(viennacl::traits::stride(temp)),
+                                            static_cast<unsigned int>(viennacl::traits::size(temp)),
+                                            static_cast<unsigned int>(option),
+                                            viennacl::cuda_arg(result) );
+      VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel_floats");
+    }
+  };
+
+  template<typename NumericT>
+  struct vector_sum_kernel_launcher : public vector_sum_kernel_launcher_integers {};
+
+  template<>
+  struct vector_sum_kernel_launcher<unsigned char>  : public vector_sum_kernel_launcher_unsigned_integers {};
+
+  template<>
+  struct vector_sum_kernel_launcher<unsigned short>  : public vector_sum_kernel_launcher_unsigned_integers {};
+
+  template<>
+  struct vector_sum_kernel_launcher<unsigned int>  : public vector_sum_kernel_launcher_unsigned_integers {};
+
+  template<>
+  struct vector_sum_kernel_launcher<unsigned long>  : public vector_sum_kernel_launcher_unsigned_integers {};
+
+  template<>
+  struct vector_sum_kernel_launcher<float>  : public vector_sum_kernel_launcher_floats {};
+
+  template<>
+  struct vector_sum_kernel_launcher<double> : public vector_sum_kernel_launcher_floats {};
+
+  /** \endcond */
+}
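+
+// The launchers above are picked at compile time through the specializations of
+// vector_sum_kernel_launcher<NumericT>: signed integer types use the abs()-based
+// kernel, unsigned types the plain maximum, and float/double the fmax/fabs/sqrt
+// variant. The 'option' argument follows the kernel comments: 0 reduces with a
+// maximum (of absolute values where applicable), 1 is a plain sum, and 2 sums and
+// then takes the square root of the total (the sqrt step exists in the
+// floating-point kernel only). For example, the call in inner_prod_impl() below,
+//
+//   detail::vector_sum_kernel_launcher<NumericT>::apply(temp, 1, result);
+//
+// resolves to vector_sum_kernel_floats<<<1, 128>>> for NumericT == float.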
+
+
+// implementation of inner product:
+/** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
+*
+* @param vec1 The first vector
+* @param vec2 The second vector
+* @param result The result scalar (on the gpu)
+*/
+template<typename NumericT, typename ScalarT>
+void inner_prod_impl(vector_base<NumericT> const & vec1,
+                     vector_base<NumericT> const & vec2,
+                     ScalarT & result)
+{
+  typedef NumericT        value_type;
+
+  static const unsigned int work_groups = 128;
+  static viennacl::vector<value_type> temp(work_groups);
+
+  inner_prod_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                  static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                  static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                  viennacl::cuda_arg(vec2),
+                                  static_cast<unsigned int>(viennacl::traits::start(vec2)),
+                                  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
+                                  static_cast<unsigned int>(viennacl::traits::size(vec2)),
+                                  viennacl::cuda_arg(temp)
+                                 );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");
+
+  detail::vector_sum_kernel_launcher<NumericT>::apply(temp, 1, result);
+}
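+
+// Minimal usage sketch (host code, not part of this header): library users go
+// through the frontend viennacl::linalg::inner_prod(), which is expected to end up
+// in the implementation above when the result is a device scalar. Vector sizes
+// below are illustrative only.
+//
+//   #include "viennacl/vector.hpp"
+//   #include "viennacl/scalar.hpp"
+//   #include "viennacl/linalg/inner_prod.hpp"
+//
+//   viennacl::vector<float> v1(1000), v2(1000);
+//   viennacl::scalar<float> s = viennacl::linalg::inner_prod(v1, v2);  // result stays on the GPU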
+
+
+/** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
+*
+* @param vec1 The first vector
+* @param vec2 The second vector
+* @param result The result scalar (on the host)
+*/
+template<typename NumericT>
+void inner_prod_cpu(vector_base<NumericT> const & vec1,
+                    vector_base<NumericT> const & vec2,
+                    NumericT & result)
+{
+  typedef NumericT        value_type;
+
+  const unsigned int work_groups = 128;
+  viennacl::vector<value_type> temp(work_groups);
+
+  inner_prod_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
+                                  static_cast<unsigned int>(viennacl::traits::start(vec1)),
+                                  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
+                                  static_cast<unsigned int>(viennacl::traits::size(vec1)),
+                                  viennacl::cuda_arg(vec2),
+                                  static_cast<unsigned int>(viennacl::traits::start(vec2)),
+                                  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
+                                  static_cast<unsigned int>(viennacl::traits::size(vec2)),
+                                  viennacl::cuda_arg(temp)
+                                 );
+  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");
+
+  // Now copy partial results from GPU back to CPU and run reduction there:
+  std::vector<value_type> temp_cpu(work_groups);
+  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
+
+  result = 0;
+  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
+    result += *it;
+}
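+
+// The _cpu variant above is the path taken when the caller wants a plain host
+// value instead of a viennacl::scalar<>, e.g. (sketch, same frontend header as in
+// the previous example):
+//
+//   float s_host = viennacl::linalg::inner_prod(v1, v2);  // partial sums reduced on the host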
+
+///////////////////////////////////
+
+#define VIENNACL_MDOT_WORKGROUP_SIZE  128
+#define VIENNACL_MDOT_WORKGROUP_NUM   128
+// M = 2:
+template<typename NumericT>
+__global__ void inner_prod_2_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
+                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
+                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
+                                    NumericT *group_results)
+{
+  __shared__ NumericT tmp_buffer[2*VIENNACL_MDOT_WORKGROUP_SIZE];
+  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
+  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
+  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond size of x
+
+  NumericT entry_x    = 0;
+  NumericT group_sum0 = 0;
+  NumericT group_sum1 = 0;
+  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
+    entry_x     = x[i * stridex + startx];   // load only once from global memory!
+    group_sum0 += entry_x * y0[i * stride0 + start0];
+    group_sum1 += entry_x * y1[i * stride1 + start1];
+  }
+  tmp_buffer[threadIdx.x]              = group_sum0;
+  tmp_buffer[threadIdx.x + blockDim.x] = group_sum1;
+
+  // parallel reduction
+  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
+    __syncthreads();
+    if (threadIdx.x < stride) {
+      tmp_buffer[threadIdx.x             ] += tmp_buffer[threadIdx.x+stride             ];
+      tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x];
+    }
+  }
+
+  // write result of group to group_results
+  if (threadIdx.x == 0) {
+    group_results[blockIdx.x]             = tmp_buffer[0];
+    group_results[blockIdx.x + gridDim.x] = tmp_buffer[blockDim.x];
+  }
+}
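+
+// The inner_prod_{2,3,4,8}_kernel family computes several inner products <x, y_k>
+// in one pass over x: each entry of x is loaded from global memory once (see the
+// comment inside the loop) and multiplied with the matching entry of every y_k,
+// trading a larger shared-memory buffer for less global-memory traffic.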
+
+// M = 3:
+template<typename NumericT>
+__global__ void inner_prod_3_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
+                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
+                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
+                                    const NumericT *y2, unsigned int start2, unsigned int stride2,
+                                    NumericT *group_results)
+{
+  __shared__ NumericT tmp_buffer[3*VIENNACL_MDOT_WORKGROUP_SIZE];
+  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
+  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
+  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size
+
+  NumericT entry_x    = 0;
+  NumericT group_sum0 = 0;
+  NumericT group_sum1 = 0;
+  NumericT group_sum2 = 0;
+  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
+    entry_x     = x[i * stridex + startx];   // load only once from global memory!
+    group_sum0 += entry_x * y0[i * stride0 + start0];
+    group_sum1 += entry_x * y1[i * stride1 + start1];
+    group_sum2 += entry_x * y2[i * stride2 + start2];
+  }
+  tmp_buffer[threadIdx.x]                  = group_sum0;
+  tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
+  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
+
+  // parallel reduction
+  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
+    __syncthreads();
+    if (threadIdx.x < stride) {
+      tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
+      tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
+      tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
+    }
+  }
+
+  // write result of group to group_results
+  if (threadIdx.x == 0) {
+    group_results[blockIdx.x                ] = tmp_buffer[0];
+    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
+    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
+  }
+}
+
+// M = 4:
+template<typename NumericT>
+__global__ void inner_prod_4_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
+                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
+                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
+                                    const NumericT *y2, unsigned int start2, unsigned int stride2,
+                                    const NumericT *y3, unsigned int start3, unsigned int stride3,
+                                    NumericT *group_results)
+{
+  __shared__ NumericT tmp_buffer[4*VIENNACL_MDOT_WORKGROUP_SIZE];
+  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
+  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
+  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size
+
+  NumericT entry_x    = 0;
+  NumericT group_sum0 = 0;
+  NumericT group_sum1 = 0;
+  NumericT group_sum2 = 0;
+  NumericT group_sum3 = 0;
+  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
+    entry_x     = x[i * stridex + startx];   // load only once from global memory!
+    group_sum0 += entry_x * y0[i * stride0 + start0];
+    group_sum1 += entry_x * y1[i * stride1 + start1];
+    group_sum2 += entry_x * y2[i * stride2 + start2];
+    group_sum3 += entry_x * y3[i * stride3 + start3];
+  }
+  tmp_buffer[threadIdx.x]                  = group_sum0;
+  tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
+  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
+  tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3;
+
+  // parallel reduction
+  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
+    __syncthreads();
+    if (threadIdx.x < stride) {
+      tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
+      tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
+      tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
+      tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x];
+    }
+  }
+
+  // write result of group to group_results
+  if (threadIdx.x == 0) {
+    group_results[blockIdx.x                ] = tmp_buffer[0];
+    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
+    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
+    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x];
+  }
+}
+
+// M = 8:
+template<typename NumericT>
+__global__ void inner_prod_8_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
+                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
+                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
+                                    const NumericT *y2, unsigned int start2, unsigned int stride2,
+                                    const NumericT *y3, unsigned int start3, unsigned int stride3,
+                                    const NumericT *y4, unsigned int start4, unsigned int stride4,
+                                    const NumericT *y5, unsigned int start5, unsigned int stride5,
+                                    const NumericT *y6, unsigned int start6, unsigned int stride6,
+                                    const NumericT *y7, unsigned int start7, unsigned int stride7,
+                                    NumericT *group_results)
+{
+  __shared__ NumericT tmp_buffer[8*VIENNACL_MDOT_WORKGROUP_SIZE];
+  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
+  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
+  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size
+
+  NumericT entry_x    = 0;
+  NumericT group_sum0 = 0;
+  NumericT group_sum1 = 0;
+  NumericT group_sum2 = 0;
+  NumericT group_sum3 = 0;
+  NumericT group_sum4 = 0;
+  NumericT group_sum5 = 0;
+  NumericT group_sum6 = 0;
+  NumericT group_sum7 = 0;
+  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
+    entry_x     = x[i * stridex + startx];   // load only once from global memory!
+    group_sum0 += entry_x * y0[i * stride0 + start0];
+    group_sum1 += entry_x * y1[i * stride1 + start1];
+    group_sum2 += entry_x * y2[i * stride2 + start2];
+    group_sum3 += entry_x * y3[i * stride3 + start3];
+    group_sum4 += entry_x * y4[i * stride4 + start4];
+    group_sum5 += entry_x * y5[i * stride5 + start5];
+    group_sum6 += entry_x * y6[i * stride6 + start6];
+    group_sum7 += entry_x * y7[i * stride7 + start7];
+  }
+  tmp_buffer[threadIdx.x]                  = group_sum0;
+  tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
+  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
+  tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3;
+  tmp_buffer[threadIdx.x + 4 * blockDim.x] = group_sum4;
+  tmp_buffer[threadIdx.x + 5 * blockDim.x] = group_sum5;
+  tmp_buffer[threadIdx.x + 6 * blockDim.x] = group_sum6;
+  tmp_buffer[threadIdx.x + 7 * blockDim.x] = group_sum7;
+
+  // parallel reduction
+  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
+    __syncthreads();
+    if (threadIdx.x < stride) {
+      tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
+      tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
+      tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
+      tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x];
+      tmp_buffer[threadIdx.x + 4 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 4 * blockDim.x];
+      tmp_buffer[threadIdx.x + 5 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 5 * blockDim.x];
+      tmp_buffer[threadIdx.x + 6 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 6 * blockDim.x];
+      tmp_buffer[threadIdx.x + 7 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 7 * blockDim.x];
+    }
+  }
+
+  // write result of group to group_results
+  if (threadIdx.x == 0) {
+    group_results[blockIdx.x                ] = tmp_buffer[0];
+    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
+    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
+    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x];
+    group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * blockDim.x];
+    group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * blockDim.x];
+    group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * blockDim.x];
+    group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * blockDim.x];
+  }
+}
+
+// Reduces the partial results in 'vec1' block-wise: CUDA block k sums the k-th segment
+// of VIENNACL_MDOT_WORKGROUP_SIZE entries and writes its total to result[start_result + k * inc_result].
+template<typename NumericT>
+__global__ void vector_multi_sum_kernel(
+          NumericT const * vec1,
+          NumericT * result,
+          unsigned int start_result,
+          unsigned int inc_result)
+{
+  __shared__ NumericT tmp_buffer[VIENNACL_MDOT_WORKGROUP_SIZE];
+
+  tmp_buffer[threadIdx.x] = vec1[threadIdx.x + blockIdx.x * VIENNACL_MDOT_WORKGROUP_SIZE];
+
+  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
+  {
+    __syncthreads();
+    if (threadIdx.x < stride)
+      tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
+  }
+
+  if (threadIdx.x == 0)
+    result[start_result + inc_result * blockIdx.x] = tmp_buffer[0];
+}
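+
+// Layout note: inner_prod_{2,3,4,8}_kernel store the partial sums belonging to the
+// k-th right-hand-side vector at group_results[k * gridDim.x + blockIdx.x], i.e.
+// VIENNACL_MDOT_WORKGROUP_NUM contiguous entries per vector. vector_multi_sum_kernel
+// above then reduces one such segment per CUDA block (the indexing relies on
+// VIENNACL_MDOT_WORKGROUP_SIZE and VIENNACL_MDOT_WORKGROUP_NUM both being 128) and
+// writes block k's total to result[start_result + k * inc_result]; the corresponding
+// launches sit in the truncated remainder of this diff.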
+
+template<typename NumericT>
+void inner_prod_impl(vector_base<NumericT> const & x,
+                     vector_tuple<NumericT> const & vec_tuple,
+                     vector_base<NumericT> & result)
+{
+  typedef NumericT        value_type;
+
+  static viennacl::vector<value_type> temp(8 * VIENNACL_MDOT_WORKGROUP_NUM);
+
+  vcl_size_t current_index = 0;
+  while (vec_tuple.const_size() > current_index)
+  {
+    switch (vec_tuple.const_size() - current_index)
+    {
+      case 7:
+      case 6:
+      case 5:
+      case 4:
+      {
+        vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index);
+        vector_base<NumericT> const & y1 = vec_tuple.const_at(current_index + 1);
+        vector_base<NumericT> const & y2 = vec_tuple.const_at(current_index + 2);
+        vector_base<NumericT> const & y3 = vec_tuple.const_at(current_index + 3);
+
+        inner_prod_4_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM,
+                              VIENNACL_MDOT_WORKGROUP_SIZE>>>( viennacl::cuda_arg(x),
+                                                               static_cast<unsigned int>(viennacl::traits::start(x)),
+                                                               static_cast<unsigned int>(viennacl::traits::stride(x)),
+                                                               static_cast<unsigned int>(viennacl::traits::size(x)),
+                                                               viennacl::cuda_arg(y0),
+                                                               static_cast<unsigned int>(viennacl::traits::start(y0)),
+                                                               static_cast<unsigned int>(viennacl::traits::stride(y0)),
+                                                               viennacl::cuda_arg(y1),
+                                                               static_cast<unsigned int>(viennacl::traits::start(y1)),
+                                                               static_cast<unsigned int>(viennacl::traits::stride(y1)),
+                                                               viennacl::cuda_arg(y2),
+                                                               static_cast<unsigned int>(vienna

<TRUNCATED>