You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ap...@apache.org on 2016/06/10 16:52:39 UTC

[34/51] [partial] mahout git commit: Revert "(nojira) add native-viennaCL module to codebase. closes apache/mahout#241"

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_col.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_col.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_col.hpp
deleted file mode 100644
index 44684ce..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_col.hpp
+++ /dev/null
@@ -1,1847 +0,0 @@
-#ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_COL_HPP_
-#define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_COL_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file  viennacl/linalg/cuda/matrix_operations_col.hpp
-    @brief Implementations of column-major dense matrix related operations, including matrix-vector products, using CUDA.
-*/
-
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace cuda
-{
-
/** @brief Converts a column-major (sub-)matrix B elementwise from SrcNumericT to DestNumericT, writing into A.
*
* Expected launch configuration: 1D grid of 1D blocks; rows are strided over
* threadIdx.x and columns over blockIdx.x.
*/
template<typename DestNumericT, typename SrcNumericT>
__global__ void convert_col_kernel(DestNumericT * A,
                                  unsigned int A_start1, unsigned int A_start2,
                                  unsigned int A_inc1,   unsigned int A_inc2,
                                  unsigned int A_size1,  unsigned int A_size2,
                                  unsigned int A_internal_size1,  unsigned int A_internal_size2,

                                  const SrcNumericT * B,
                                  unsigned int B_start1, unsigned int B_start2,
                                  unsigned int B_inc1,   unsigned int B_inc2,
                                  unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // Derive the starting row from threadIdx.x and the starting column from
  // blockIdx.x, matching every other *_col_kernel in this file. Rows are the
  // contiguous dimension of column-major storage, so this keeps warp accesses
  // coalesced, and it covers the whole matrix for any launch configuration.
  // The previous swapped computation (row from the division, column from the
  // remainder) left rows unprocessed whenever gridDim.x < blockDim.x.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;

  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1];
}
-
-//
-// am
-//
-
-// alpha on CPU
/** @brief Computes A = B @ alpha for column-major matrices, where @ is either * or /; alpha is passed by value from the host.
*
* options2 bit layout: bit 0 set -> negate alpha; bit 1 set -> divide by alpha
* instead of multiplying.
*/
template<typename NumericT>
__global__ void am_col_kernel(NumericT * A,
                              unsigned int A_start1, unsigned int A_start2,
                              unsigned int A_inc1,   unsigned int A_inc2,
                              unsigned int A_size1,  unsigned int A_size2,
                              unsigned int A_internal_size1,  unsigned int A_internal_size2,

                              NumericT fac2,
                              unsigned int options2,
                              const NumericT * B,
                              unsigned int B_start1, unsigned int B_start2,
                              unsigned int B_inc1,   unsigned int B_inc2,
                              unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  NumericT alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  // Row from threadIdx.x, column from blockIdx.x — the same convention as the
  // GPU-scalar overload of this kernel and the ambm kernels below. The
  // previous swapped '/' vs '%' computation produced uncoalesced accesses and
  // missed rows whenever gridDim.x < blockDim.x.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;

  if (options2 & (1 << 1))
  {
    for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
      for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha;
  }
  else
  {
    for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
      for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha;
  }
}
-
-// alpha on GPU
/** @brief Computes A = B @ alpha for column-major matrices, where @ is either * or /; alpha resides in GPU memory.
*
* options2 bit layout: bit 0 set -> negate alpha; bit 1 set -> divide by alpha
* instead of multiplying.
*/
template<typename NumericT>
__global__ void am_col_kernel(NumericT * A,
                              unsigned int A_start1, unsigned int A_start2,
                              unsigned int A_inc1,   unsigned int A_inc2,
                              unsigned int A_size1,  unsigned int A_size2,
                              unsigned int A_internal_size1,  unsigned int A_internal_size2,

                              const NumericT * fac2,
                              unsigned int options2,
                              const NumericT * B,
                              unsigned int B_start1, unsigned int B_start2,
                              unsigned int B_inc1,   unsigned int B_inc2,
                              unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // Scalar is fetched from device memory, then sign-flipped on request.
  NumericT sa = *fac2;
  if (options2 & (1 << 0))
    sa = -sa;

  // Starting row follows threadIdx.x (contiguous dimension of column-major
  // storage), starting column follows blockIdx.x.
  unsigned int const lin_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int const row0 = lin_id % blockDim.x;
  unsigned int const col0 = lin_id / blockDim.x;

  if (options2 & (1 << 1))
  {
    // Division variant.
    for (unsigned int c = col0; c < A_size2; c += gridDim.x)
      for (unsigned int r = row0; r < A_size1; r += blockDim.x)
      {
        unsigned int dst = (r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1;
        unsigned int src = (r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1;
        A[dst] = B[src] / sa;
      }
  }
  else
  {
    // Multiplication variant.
    for (unsigned int c = col0; c < A_size2; c += gridDim.x)
      for (unsigned int r = row0; r < A_size1; r += blockDim.x)
      {
        unsigned int dst = (r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1;
        unsigned int src = (r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1;
        A[dst] = B[src] * sa;
      }
  }
}
-
-
-//
-// ambm
-//
-
-// alpha and beta on CPU
/** @brief Computes A = B @ alpha + C @ beta for column-major matrices, where each @ is * or /; both scalars are passed by value from the host.
*
* options2 (for alpha) and options3 (for beta): bit 0 set -> negate the
* scalar; bit 1 set -> divide by the scalar instead of multiplying.
*/
template<typename NumericT>
__global__ void ambm_col_kernel(NumericT * A,
                                unsigned int A_start1, unsigned int A_start2,
                                unsigned int A_inc1,   unsigned int A_inc2,
                                unsigned int A_size1,  unsigned int A_size2,
                                unsigned int A_internal_size1,  unsigned int A_internal_size2,

                                NumericT fac2,
                                unsigned int options2,
                                const NumericT * B,
                                unsigned int B_start1, unsigned int B_start2,
                                unsigned int B_inc1,   unsigned int B_inc2,
                                unsigned int B_internal_size1,  unsigned int B_internal_size2,

                                NumericT fac3,
                                unsigned int options3,
                                const NumericT * C,
                                unsigned int C_start1, unsigned int C_start2,
                                unsigned int C_inc1,   unsigned int C_inc2,
                                unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  // Host-provided scalars, sign-flipped on request.
  NumericT sa = fac2;
  if (options2 & (1 << 0))
    sa = -sa;

  NumericT sb = fac3;
  if (options3 & (1 << 0))
    sb = -sb;

  // Row follows threadIdx.x (contiguous dimension), column follows blockIdx.x.
  unsigned int const lin_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int const row0 = lin_id % blockDim.x;
  unsigned int const col0 = lin_id / blockDim.x;

  bool const div_a = (options2 & (1 << 1)) != 0;  // divide by alpha?
  bool const div_b = (options3 & (1 << 1)) != 0;  // divide by beta?

  if (div_a)
  {
    if (div_b)
    {
      for (unsigned int c = col0; c < A_size2; c += gridDim.x)
        for (unsigned int r = row0; r < A_size1; r += blockDim.x)
        {
          unsigned int dst  = (r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1;
          unsigned int srcB = (r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1;
          unsigned int srcC = (r * C_inc1 + C_start1) + (c * C_inc2 + C_start2) * C_internal_size1;
          A[dst] = B[srcB] / sa + C[srcC] / sb;
        }
    }
    else
    {
      for (unsigned int c = col0; c < A_size2; c += gridDim.x)
        for (unsigned int r = row0; r < A_size1; r += blockDim.x)
        {
          unsigned int dst  = (r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1;
          unsigned int srcB = (r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1;
          unsigned int srcC = (r * C_inc1 + C_start1) + (c * C_inc2 + C_start2) * C_internal_size1;
          A[dst] = B[srcB] / sa + C[srcC] * sb;
        }
    }
  }
  else
  {
    if (div_b)
    {
      for (unsigned int c = col0; c < A_size2; c += gridDim.x)
        for (unsigned int r = row0; r < A_size1; r += blockDim.x)
        {
          unsigned int dst  = (r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1;
          unsigned int srcB = (r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1;
          unsigned int srcC = (r * C_inc1 + C_start1) + (c * C_inc2 + C_start2) * C_internal_size1;
          A[dst] = B[srcB] * sa + C[srcC] / sb;
        }
    }
    else
    {
      for (unsigned int c = col0; c < A_size2; c += gridDim.x)
        for (unsigned int r = row0; r < A_size1; r += blockDim.x)
        {
          unsigned int dst  = (r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1;
          unsigned int srcB = (r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1;
          unsigned int srcC = (r * C_inc1 + C_start1) + (c * C_inc2 + C_start2) * C_internal_size1;
          A[dst] = B[srcB] * sa + C[srcC] * sb;
        }
    }
  }
}
-
-
-// alpha on CPU, beta on GPU
/** @brief Computes A = B @ alpha + C @ beta for column-major matrices, where each @ is * or /; alpha is passed by value from the host, beta resides in GPU memory.
*
* options2 (for alpha) and options3 (for beta): bit 0 set -> negate the
* scalar; bit 1 set -> divide by the scalar instead of multiplying.
*/
template<typename NumericT>
__global__ void ambm_col_kernel(NumericT * A,
                                unsigned int A_start1, unsigned int A_start2,
                                unsigned int A_inc1,   unsigned int A_inc2,
                                unsigned int A_size1,  unsigned int A_size2,
                                unsigned int A_internal_size1,  unsigned int A_internal_size2,

                                NumericT fac2,
                                unsigned int options2,
                                const NumericT * B,
                                unsigned int B_start1, unsigned int B_start2,
                                unsigned int B_inc1,   unsigned int B_inc2,
                                unsigned int B_internal_size1,  unsigned int B_internal_size2,

                                const NumericT * fac3,
                                unsigned int options3,
                                const NumericT * C,
                                unsigned int C_start1, unsigned int C_start2,
                                unsigned int C_inc1,   unsigned int C_inc2,
                                unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  // alpha arrives by value, beta is fetched from device memory.
  NumericT sa = fac2;
  if (options2 & (1 << 0))
    sa = -sa;

  NumericT sb = *fac3;
  if (options3 & (1 << 0))
    sb = -sb;

  // Row follows threadIdx.x (contiguous dimension), column follows blockIdx.x.
  unsigned int const lin_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int const row0 = lin_id % blockDim.x;
  unsigned int const col0 = lin_id / blockDim.x;

  bool const div_a = (options2 & (1 << 1)) != 0;  // divide by alpha?
  bool const div_b = (options3 & (1 << 1)) != 0;  // divide by beta?

  if (div_a)
  {
    if (div_b)
    {
      for (unsigned int c = col0; c < A_size2; c += gridDim.x)
        for (unsigned int r = row0; r < A_size1; r += blockDim.x)
        {
          unsigned int dst  = (r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1;
          unsigned int srcB = (r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1;
          unsigned int srcC = (r * C_inc1 + C_start1) + (c * C_inc2 + C_start2) * C_internal_size1;
          A[dst] = B[srcB] / sa + C[srcC] / sb;
        }
    }
    else
    {
      for (unsigned int c = col0; c < A_size2; c += gridDim.x)
        for (unsigned int r = row0; r < A_size1; r += blockDim.x)
        {
          unsigned int dst  = (r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1;
          unsigned int srcB = (r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1;
          unsigned int srcC = (r * C_inc1 + C_start1) + (c * C_inc2 + C_start2) * C_internal_size1;
          A[dst] = B[srcB] / sa + C[srcC] * sb;
        }
    }
  }
  else
  {
    if (div_b)
    {
      for (unsigned int c = col0; c < A_size2; c += gridDim.x)
        for (unsigned int r = row0; r < A_size1; r += blockDim.x)
        {
          unsigned int dst  = (r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1;
          unsigned int srcB = (r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1;
          unsigned int srcC = (r * C_inc1 + C_start1) + (c * C_inc2 + C_start2) * C_internal_size1;
          A[dst] = B[srcB] * sa + C[srcC] / sb;
        }
    }
    else
    {
      for (unsigned int c = col0; c < A_size2; c += gridDim.x)
        for (unsigned int r = row0; r < A_size1; r += blockDim.x)
        {
          unsigned int dst  = (r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1;
          unsigned int srcB = (r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1;
          unsigned int srcC = (r * C_inc1 + C_start1) + (c * C_inc2 + C_start2) * C_internal_size1;
          A[dst] = B[srcB] * sa + C[srcC] * sb;
        }
    }
  }
}
-
-// alpha on GPU, beta on CPU
/** @brief Computes A = B @ alpha + C @ beta for column-major matrices, where each @ is * or /; alpha resides in GPU memory, beta is passed by value from the host.
*
* options2 (for alpha) and options3 (for beta): bit 0 set -> negate the
* scalar; bit 1 set -> divide by the scalar instead of multiplying.
*/
template<typename NumericT>
__global__ void ambm_col_kernel(NumericT * A,
                                unsigned int A_start1, unsigned int A_start2,
                                unsigned int A_inc1,   unsigned int A_inc2,
                                unsigned int A_size1,  unsigned int A_size2,
                                unsigned int A_internal_size1,  unsigned int A_internal_size2,

                                const NumericT * fac2,
                                unsigned int options2,
                                const NumericT * B,
                                unsigned int B_start1, unsigned int B_start2,
                                unsigned int B_inc1,   unsigned int B_inc2,
                                unsigned int B_internal_size1,  unsigned int B_internal_size2,

                                NumericT fac3,
                                unsigned int options3,
                                const NumericT * C,
                                unsigned int C_start1, unsigned int C_start2,
                                unsigned int C_inc1,   unsigned int C_inc2,
                                unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  // alpha is fetched from device memory, beta arrives by value.
  NumericT sa = *fac2;
  if (options2 & (1 << 0))
    sa = -sa;

  NumericT sb = fac3;
  if (options3 & (1 << 0))
    sb = -sb;

  // Row follows threadIdx.x (contiguous dimension), column follows blockIdx.x.
  unsigned int const lin_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int const row0 = lin_id % blockDim.x;
  unsigned int const col0 = lin_id / blockDim.x;

  bool const div_a = (options2 & (1 << 1)) != 0;  // divide by alpha?
  bool const div_b = (options3 & (1 << 1)) != 0;  // divide by beta?

  if (div_a)
  {
    if (div_b)
    {
      for (unsigned int c = col0; c < A_size2; c += gridDim.x)
        for (unsigned int r = row0; r < A_size1; r += blockDim.x)
        {
          unsigned int dst  = (r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1;
          unsigned int srcB = (r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1;
          unsigned int srcC = (r * C_inc1 + C_start1) + (c * C_inc2 + C_start2) * C_internal_size1;
          A[dst] = B[srcB] / sa + C[srcC] / sb;
        }
    }
    else
    {
      for (unsigned int c = col0; c < A_size2; c += gridDim.x)
        for (unsigned int r = row0; r < A_size1; r += blockDim.x)
        {
          unsigned int dst  = (r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1;
          unsigned int srcB = (r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1;
          unsigned int srcC = (r * C_inc1 + C_start1) + (c * C_inc2 + C_start2) * C_internal_size1;
          A[dst] = B[srcB] / sa + C[srcC] * sb;
        }
    }
  }
  else
  {
    if (div_b)
    {
      for (unsigned int c = col0; c < A_size2; c += gridDim.x)
        for (unsigned int r = row0; r < A_size1; r += blockDim.x)
        {
          unsigned int dst  = (r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1;
          unsigned int srcB = (r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1;
          unsigned int srcC = (r * C_inc1 + C_start1) + (c * C_inc2 + C_start2) * C_internal_size1;
          A[dst] = B[srcB] * sa + C[srcC] / sb;
        }
    }
    else
    {
      for (unsigned int c = col0; c < A_size2; c += gridDim.x)
        for (unsigned int r = row0; r < A_size1; r += blockDim.x)
        {
          unsigned int dst  = (r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1;
          unsigned int srcB = (r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1;
          unsigned int srcC = (r * C_inc1 + C_start1) + (c * C_inc2 + C_start2) * C_internal_size1;
          A[dst] = B[srcB] * sa + C[srcC] * sb;
        }
    }
  }
}
-
-
-// alpha and beta on GPU
/** @brief Computes A = B @ alpha + C @ beta for column-major matrices, where each @ is * or /; both scalars reside in GPU memory.
*
* options2 (for alpha) and options3 (for beta): bit 0 set -> negate the
* scalar; bit 1 set -> divide by the scalar instead of multiplying.
*/
template<typename NumericT>
__global__ void ambm_col_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * fac2,
          unsigned int options2,
          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          const NumericT * fac3,
          unsigned int options3,
          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  // Both scalars are fetched from device memory, then sign-flipped on request.
  NumericT sa = *fac2;
  if (options2 & (1 << 0))
    sa = -sa;

  NumericT sb = *fac3;
  if (options3 & (1 << 0))
    sb = -sb;

  // Row follows threadIdx.x (contiguous dimension), column follows blockIdx.x.
  unsigned int const lin_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int const row0 = lin_id % blockDim.x;
  unsigned int const col0 = lin_id / blockDim.x;

  bool const div_a = (options2 & (1 << 1)) != 0;  // divide by alpha?
  bool const div_b = (options3 & (1 << 1)) != 0;  // divide by beta?

  if (div_a)
  {
    if (div_b)
    {
      for (unsigned int c = col0; c < A_size2; c += gridDim.x)
        for (unsigned int r = row0; r < A_size1; r += blockDim.x)
        {
          unsigned int dst  = (r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1;
          unsigned int srcB = (r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1;
          unsigned int srcC = (r * C_inc1 + C_start1) + (c * C_inc2 + C_start2) * C_internal_size1;
          A[dst] = B[srcB] / sa + C[srcC] / sb;
        }
    }
    else
    {
      for (unsigned int c = col0; c < A_size2; c += gridDim.x)
        for (unsigned int r = row0; r < A_size1; r += blockDim.x)
        {
          unsigned int dst  = (r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1;
          unsigned int srcB = (r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1;
          unsigned int srcC = (r * C_inc1 + C_start1) + (c * C_inc2 + C_start2) * C_internal_size1;
          A[dst] = B[srcB] / sa + C[srcC] * sb;
        }
    }
  }
  else
  {
    if (div_b)
    {
      for (unsigned int c = col0; c < A_size2; c += gridDim.x)
        for (unsigned int r = row0; r < A_size1; r += blockDim.x)
        {
          unsigned int dst  = (r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1;
          unsigned int srcB = (r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1;
          unsigned int srcC = (r * C_inc1 + C_start1) + (c * C_inc2 + C_start2) * C_internal_size1;
          A[dst] = B[srcB] * sa + C[srcC] / sb;
        }
    }
    else
    {
      for (unsigned int c = col0; c < A_size2; c += gridDim.x)
        for (unsigned int r = row0; r < A_size1; r += blockDim.x)
        {
          unsigned int dst  = (r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1;
          unsigned int srcB = (r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1;
          unsigned int srcC = (r * C_inc1 + C_start1) + (c * C_inc2 + C_start2) * C_internal_size1;
          A[dst] = B[srcB] * sa + C[srcC] * sb;
        }
    }
  }
}
-
-
-//
-// ambm_m
-//
-
-// alpha and beta on CPU
/** @brief Computes A += B @ alpha + C @ beta for column-major matrices, where each @ is * or /; both scalars are passed by value from the host.
*
* options2 (for alpha) and options3 (for beta): bit 0 set -> negate the
* scalar; bit 1 set -> divide by the scalar instead of multiplying.
*/
template<typename NumericT>
__global__ void ambm_m_col_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          NumericT fac2,
          unsigned int options2,
          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          NumericT fac3,
          unsigned int options3,
          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  // Host-provided scalars, sign-flipped on request.
  NumericT sa = fac2;
  if (options2 & (1 << 0))
    sa = -sa;

  NumericT sb = fac3;
  if (options3 & (1 << 0))
    sb = -sb;

  // Row follows threadIdx.x (contiguous dimension), column follows blockIdx.x.
  unsigned int const lin_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int const row0 = lin_id % blockDim.x;
  unsigned int const col0 = lin_id / blockDim.x;

  bool const div_a = (options2 & (1 << 1)) != 0;  // divide by alpha?
  bool const div_b = (options3 & (1 << 1)) != 0;  // divide by beta?

  if (div_a)
  {
    if (div_b)
    {
      for (unsigned int c = col0; c < A_size2; c += gridDim.x)
        for (unsigned int r = row0; r < A_size1; r += blockDim.x)
        {
          unsigned int dst  = (r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1;
          unsigned int srcB = (r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1;
          unsigned int srcC = (r * C_inc1 + C_start1) + (c * C_inc2 + C_start2) * C_internal_size1;
          A[dst] += B[srcB] / sa + C[srcC] / sb;
        }
    }
    else
    {
      for (unsigned int c = col0; c < A_size2; c += gridDim.x)
        for (unsigned int r = row0; r < A_size1; r += blockDim.x)
        {
          unsigned int dst  = (r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1;
          unsigned int srcB = (r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1;
          unsigned int srcC = (r * C_inc1 + C_start1) + (c * C_inc2 + C_start2) * C_internal_size1;
          A[dst] += B[srcB] / sa + C[srcC] * sb;
        }
    }
  }
  else
  {
    if (div_b)
    {
      for (unsigned int c = col0; c < A_size2; c += gridDim.x)
        for (unsigned int r = row0; r < A_size1; r += blockDim.x)
        {
          unsigned int dst  = (r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1;
          unsigned int srcB = (r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1;
          unsigned int srcC = (r * C_inc1 + C_start1) + (c * C_inc2 + C_start2) * C_internal_size1;
          A[dst] += B[srcB] * sa + C[srcC] / sb;
        }
    }
    else
    {
      for (unsigned int c = col0; c < A_size2; c += gridDim.x)
        for (unsigned int r = row0; r < A_size1; r += blockDim.x)
        {
          unsigned int dst  = (r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1;
          unsigned int srcB = (r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1;
          unsigned int srcC = (r * C_inc1 + C_start1) + (c * C_inc2 + C_start2) * C_internal_size1;
          A[dst] += B[srcB] * sa + C[srcC] * sb;
        }
    }
  }
}
-
-
-// alpha on CPU, beta on GPU
/** @brief Computes A += B @ alpha + C @ beta for column-major matrices, where each @ is * or /; alpha is passed by value from the host, beta resides in GPU memory.
*
* options2 (for alpha) and options3 (for beta): bit 0 set -> negate the
* scalar; bit 1 set -> divide by the scalar instead of multiplying.
*/
template<typename NumericT>
__global__ void ambm_m_col_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          NumericT fac2,
          unsigned int options2,
          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          const NumericT * fac3,
          unsigned int options3,
          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  NumericT alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  // beta is fetched from device memory.
  NumericT beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;

  // ambm_m denotes in-place accumulation. This overload previously assigned
  // with '=' while its all-host-scalar sibling accumulates with '+='; changed
  // to '+=' so all overloads implement the same operation.
  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
       += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
    }
    else
    {
      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
       += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
       += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
    }
    else
    {
      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
       += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
    }
  }
}
-
-// alpha on GPU, beta on CPU
/** @brief Computes A += B @ alpha + C @ beta for column-major matrices, where each @ is * or /; alpha resides in GPU memory, beta is passed by value from the host.
*
* options2 (for alpha) and options3 (for beta): bit 0 set -> negate the
* scalar; bit 1 set -> divide by the scalar instead of multiplying.
*/
template<typename NumericT>
__global__ void ambm_m_col_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * fac2,
          unsigned int options2,
          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          NumericT fac3,
          unsigned int options3,
          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  // alpha is fetched from device memory.
  NumericT alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  NumericT beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;

  // ambm_m denotes in-place accumulation. This overload previously assigned
  // with '=' while its all-host-scalar sibling accumulates with '+='; changed
  // to '+=' so all overloads implement the same operation.
  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
       += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
    }
    else
    {
      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
       += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
       += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
    }
    else
    {
      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
       += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
    }
  }
}
-
-
-// alpha and beta on GPU
/** @brief Computes A += B @ alpha + C @ beta for column-major matrices, where each @ is * or /; both scalars reside in GPU memory.
*
* options2 (for alpha) and options3 (for beta): bit 0 set -> negate the
* scalar; bit 1 set -> divide by the scalar instead of multiplying.
*/
template<typename NumericT>
__global__ void ambm_m_col_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * fac2,
          unsigned int options2,
          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          const NumericT * fac3,
          unsigned int options3,
          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  // Both scalars are fetched from device memory.
  NumericT alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  NumericT beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;

  // ambm_m denotes in-place accumulation. This overload previously assigned
  // with '=' while its all-host-scalar sibling accumulates with '+='; changed
  // to '+=' so all overloads implement the same operation.
  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
       += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
    }
    else
    {
      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
       += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
       += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
    }
    else
    {
      for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
        for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
          A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
       += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
        + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
    }
  }
}
-
-
-
-//
-// assignments
-//
-
-template<typename NumericT>
-__global__ void matrix_col_assign_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-          NumericT alpha)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-
-  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
-    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
-      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = alpha;
-}
-
-
-template<typename NumericT>
-__global__ void matrix_col_diagonal_assign_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-          NumericT alpha)
-{
-  unsigned int gid = (blockIdx.x * blockDim.x + threadIdx.x);
-
-  for (unsigned int row = gid; row < A_size1; row += blockDim.x * gridDim.x)
-    A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1] = alpha;
-}
-
-//
-// binary element-wise operations
-//
-
-template<typename NumericT>
-__global__ void element_op_col_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2,
-
-          const NumericT * C,
-          unsigned int C_start1, unsigned int C_start2,
-          unsigned int C_inc1,   unsigned int C_inc2,
-          unsigned int C_internal_size1,  unsigned int C_internal_size2,
-
-          unsigned int op_type) //0: product, 1: division, 2: pow
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-
-  if (op_type == 2)
-  {
-    for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
-      for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
-        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
-      = pow(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1],
-            C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1]);
-  }
-  else if (op_type == 1)
-  {
-    for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
-      for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
-        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
-      = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
-      / C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
-  }
-  else if (op_type == 0)
-  {
-    for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
-      for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
-        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
-      = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
-      * C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
-  }
-}
-
-template<typename NumericT>
-__global__ void element_op_int_col_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2,
-
-          const NumericT * C,
-          unsigned int C_start1, unsigned int C_start2,
-          unsigned int C_inc1,   unsigned int C_inc2,
-          unsigned int C_internal_size1,  unsigned int C_internal_size2,
-
-          unsigned int op_type) //0: product, 1: division, 2: pow
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-
-  if (op_type == 1)
-  {
-    for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
-      for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
-        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
-      = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
-      / C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
-  }
-  else if (op_type == 0)
-  {
-    for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
-      for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
-        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
-      = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
-      * C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
-  }
-}
-
-
-//
-// unary element-wise operations
-//
-
-// abs
-template<typename NumericT>
-__global__ void matrix_col_element_abs_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
-    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
-      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = abs(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
-}
-
-
-// acos
-template<typename NumericT>
-__global__ void matrix_col_element_acos_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
-    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
-      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = acos(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
-}
-
-
-// asin
-template<typename NumericT>
-__global__ void matrix_col_element_asin_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
-    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
-      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = asin(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
-}
-
-
-// atan
-template<typename NumericT>
-__global__ void matrix_col_element_atan_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
-    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
-      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = atan(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
-}
-
-
-// ceil
-template<typename NumericT>
-__global__ void matrix_col_element_ceil_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
-    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
-      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = ceil(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
-}
-
-
-// cos
-template<typename NumericT>
-__global__ void matrix_col_element_cos_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
-    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
-      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = cos(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
-}
-
-
-// cosh
-template<typename NumericT>
-__global__ void matrix_col_element_cosh_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
-    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
-      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = cosh(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
-}
-
-
-// exp
-template<typename NumericT>
-__global__ void matrix_col_element_exp_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
-    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
-      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = exp(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
-}
-
-
-// fabs
-template<typename NumericT>
-__global__ void matrix_col_element_fabs_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
-    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
-      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = fabs(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
-}
-
-
-// floor
-template<typename NumericT>
-__global__ void matrix_col_element_floor_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
-    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
-      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = floor(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
-}
-
-
-// log
-template<typename NumericT>
-__global__ void matrix_col_element_log_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
-    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
-      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = log(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
-}
-
-
-// log10
-template<typename NumericT>
-__global__ void matrix_col_element_log10_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
-    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
-      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = log10(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
-}
-
-
-// sin
-template<typename NumericT>
-__global__ void matrix_col_element_sin_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
-    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
-      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sin(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
-}
-
-
-// sinh
-template<typename NumericT>
-__global__ void matrix_col_element_sinh_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
-    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
-      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sinh(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
-}
-
-
-// sqrt
-template<typename NumericT>
-__global__ void matrix_col_element_sqrt_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
-    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
-      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sqrt(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
-}
-
-
-// tan
-template<typename NumericT>
-__global__ void matrix_col_element_tan_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
-    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
-      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = tan(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
-}
-
-
-// tanh
-template<typename NumericT>
-__global__ void matrix_col_element_tanh_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
-    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
-      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = tanh(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
-}
-
-
-
-//
-// matrix-vector product
-//
-
-template<typename NumericT>
-__global__ void vec_mul_col_kernel(
-          const NumericT * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          const NumericT * v,
-          unsigned int v_start,
-          unsigned int v_inc,
-          unsigned int v_size,
-          NumericT * result,
-          unsigned int result_start,
-          unsigned int result_inc,
-          unsigned int result_size)
-{
-
-  for (unsigned int row = blockIdx.x * blockDim.x + threadIdx.x; row < A_row_size; row += gridDim.x * blockDim.x)
-  {
-    NumericT dot_prod = 0;
-    for (unsigned int col = 0; col < A_col_size; ++col)
-      dot_prod += A[(row * A_row_inc + A_row_start) + (col * A_col_inc + A_col_start) * A_internal_rows] * v[v_start + v_inc * col];
-    result[row * result_inc + result_start] = dot_prod;
-  }
-}
-
-
-template<typename NumericT>
-__global__ void trans_vec_mul_col_kernel(
-          const NumericT * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          const NumericT * v,
-          unsigned int v_start,
-          unsigned int v_inc,
-          unsigned int v_size,
-          NumericT * result,
-          unsigned int result_start,
-          unsigned int result_inc,
-          unsigned int result_size)
-{
-  __shared__ NumericT work[128];
-
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-  unsigned int lid = threadIdx.x;
-
-  for (unsigned int row = row_gid; row < A_col_size; row += gridDim.x)
-  {
-    NumericT dot_prod = 0;
-    for (unsigned int col = col_gid; col < A_row_size; col += blockDim.x)
-      dot_prod += A[(row * A_col_inc + A_col_start) * A_internal_rows + col * A_row_inc + A_row_start] * v[v_start + v_inc * col];
-    work[lid] = dot_prod;
-
-    for (unsigned int stride = blockDim.x/2; stride>0; stride>>=1){
-      __syncthreads();
-      if (lid < stride)
-        work[lid] += work[lid+stride];
-    }
-
-    if (lid == 0)
-      result[row * result_inc + result_start] = work[0];
-  }
-}
-
-
-//
-// matrix-matrix products
-//
-
-
-
-
-//
-// scaled rank-1-update
-//
-
-// alpha on CPU
-template<typename NumericT>
-__global__ void scaled_rank1_update_col_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          NumericT val,
-          unsigned int options2,
-
-          const NumericT * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-
-          const NumericT * vec2,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2)
-{
-  NumericT alpha = val;
-  if (options2 & (1 << 0))
-    alpha = -alpha;
-  if (options2 & (1 << 1))
-    alpha = NumericT(1) / alpha;
-
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
-  {
-    NumericT tmp = alpha * vec1[row * inc1 + start1];
-    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
-      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2];
-  }
-}
-
-
-// alpha on GPU
-template<typename NumericT>
-__global__ void scaled_rank1_update_col_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * val,
-          unsigned int options2,
-
-          const NumericT * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-
-          const NumericT * vec2,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2)
-{
-  NumericT alpha = *val;
-  if (options2 & (1 << 0))
-    alpha = -alpha;
-  if (options2 & (1 << 1))
-    alpha = NumericT(1) / alpha;
-
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
-  {
-    NumericT tmp = alpha * vec1[row * inc1 + start1];
-    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
-      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2];
-  }
-}
-
-
-template <typename T>
-__global__ void bidiag_pack_row_major_kernel(
-            T * A,
-            T * D,
-            T * S,
-            unsigned int size1,
-            unsigned int size2,
-            unsigned int stride)
-{
-  unsigned int size = min(size1, size2);
-  if(blockIdx.x * blockDim.x + threadIdx.x == 0)
-    S[0] = 0;
-
-  for(unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-           i < size;
-           i += gridDim.x * blockDim.x)
-    {
-      D[i] = A[i*stride + i];
-      S[i+1] = (i + 1 < size2) ? A[i*stride + (i + 1)] : 0;
-    }
-}
-
-template <typename T>
-__global__ void bidiag_pack_column_major_kernel(
-            T * A,
-            T * D,
-            T * S,
-            unsigned int size1,
-            unsigned int size2,
-            unsigned int stride)
-{
-  unsigned int size = min(size1, size2);
-  if(blockIdx.x * blockDim.x + threadIdx.x == 0)
-    S[0] = 0;
-
-  for(unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-           i < size;
-           i += gridDim.x * blockDim.x)
-    {
-      D[i] = A[i*stride + i];
-      S[i+1] = (i + 1 < size2) ? A[i + (i + 1) * stride] : 0;
-    }
-}
-
-
-
-template<typename T>
-__global__ void copy_col_row_major_kernel(
-        T * A,
-        T * V,
-        unsigned int row_start,
-        unsigned int col_start,
-        unsigned int size,
-        unsigned int stride)
-{
-    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned int sz = gridDim.x * blockDim.x;
-
-    for(unsigned int i = row_start + x; i < size; i += sz)
-    {
-        V[i - row_start] = A[i * stride + col_start];
-    }
-}
-
-template<typename T>
-__global__ void copy_col_column_major_kernel(
-        T * A,
-        T * V,
-        unsigned int row_start,
-        unsigned int col_start,
-        unsigned int size,
-        unsigned int stride)
-{
-    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned int sz = gridDim.x * blockDim.x;
-
-    for(unsigned int i = row_start + x; i < size; i += sz)
-    {
-        V[i - row_start] = A[i + col_start * stride];
-    }
-}
-
-template<typename T>
-__global__ void copy_row_row_major_kernel(
-        T * A,
-        T * V,
-        unsigned int row_start,
-        unsigned int col_start,
-        unsigned int size,
-        unsigned int stride)
-{
-    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned int sz = gridDim.x * blockDim.x;
-
-    for(unsigned int i = col_start + x; i < size; i += sz)
-    {
-        V[i - col_start] = A[row_start * stride + i];
-    }
-
-}
-
-template<typename T>
-__global__ void copy_row_column_major_kernel(
-        T * A,
-        T * V,
-        unsigned int row_start,
-        unsigned int col_start,
-        unsigned int size,
-        unsigned int stride)
-{
-    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
-    unsigned int sz = gridDim.x * blockDim.x;
-
-    for(unsigned int i = col_start + x; i < size; i += sz)
-    {
-        V[i - col_start] = A[row_start + i * stride];
-    }
-
-}
-
-
-
-template<typename T>
-__global__ void house_update_A_left_row_major_kernel(
-        T * A,
-        T * V,        //householder vector
-        unsigned int row_start,
-        unsigned int col_start,
-        unsigned int size1,
-        unsigned int size2,
-        unsigned int stride)
-{
-    T ss = 0;
-
-    for(unsigned int i = blockIdx.x * blockDim.x + threadIdx.x + col_start;
-        i < size2;
-        i += gridDim.x * blockDim.x)
-    {
-        ss = 0;
-        for(unsigned int j = row_start; j < size1; j++)
-            ss = ss +(V[j] * A[j * stride + i]);
-
-        for(unsigned int j = row_start; j < size1; j++)
-            A[j * stride + i] = A[j * stride + i] - (2 * V[j] * ss);
-    }
-}
-
-template<typename T>
-__global__ void house_update_A_left_column_major_kernel(
-        T * A,
-        T * V,        //householder vector
-        unsigned int row_start,
-        unsigned int col_start,
-        unsigned int size1,
-        unsigned int size2,
-        unsigned int stride)
-{
-    T ss = 0;
-
-    for(unsigned int i = blockIdx.x * blockDim.x + threadIdx.x + col_start;
-        i < size2;
-        i += gridDim.x * blockDim.x)
-    {
-        ss = 0;
-        for(unsigned int j = row_start; j < size1; j++)
-            ss = ss +(V[j] * A[j + i * stride]);
-
-        for(unsigned int j = row_start; j < size1; j++)
-            A[j + i * stride] = A[j + i * stride] - (2 * V[j] * ss);
-    }
-}
-
-
-
-template<typename T>
-__global__ void house_update_A_right_row_major_kernel(
-        T * A,
-        T * V,  //householder vector
-        unsigned int row_start,
-        unsigned int col_start,
-        unsigned int size1,
-        unsigned int size2,
-        unsigned int stride)
-{
-    __shared__ T sums[128];
-    T ss = 0;
-
-    for(unsigned int i = blockIdx.x + row_start; i < size1; i+= gridDim.x)
-    {
-        ss = 0;
-        for(unsigned int j = threadIdx.x; j < size2; j+= blockDim.x)
-            ss = ss + (V[j] * A[i * stride + j]);
-        sums[threadIdx.x] = ss;
-
-        __syncthreads();
-        col_reduce_lcl_array(sums, threadIdx.x, blockDim.x);
-        __syncthreads();
-
-        T sum_Av = sums[0];
-
-        for(unsigned int j = threadIdx.x; j < size2; j+= blockDim.x)
-            A[i * stride + j] = A[i * stride + j] - (2 * V[j] * sum_Av);
-    }
-}
-
-template<typename T>
-__global__ void house_update_A_right_column_major_kernel(
-        T * A,
-        T * V,  //householder vector
-        unsigned int row_start,
-        unsigned int col_start,
-        unsigned int size1,
-        unsigned int size2,
-        unsigned int stride)
-{
-    __shared__ T sums[128];
-    T ss = 0;
-
-    for(unsigned int i = blockIdx.x + row_start; i < size1; i+= gridDim.x)
-    {
-        ss = 0;
-        for(unsigned int j = threadIdx.x; j < size2; j+= blockDim.x)
-            ss = ss + (V[j] * A[i + j * stride]);
-        sums[threadIdx.x] = ss;
-
-        __syncthreads();
-        col_reduce_lcl_array(sums, threadIdx.x, blockDim.x);
-        __syncthreads();
-
-        T sum_Av = sums[0];
-
-        for(unsigned int j = threadIdx.x; j < size2; j+= blockDim.x)
-            A[i + j * stride] = A[i + j * stride] - (2 * V[j] * sum_Av);
-    }
-}
-
-
-
-template<typename T>
-__device__ void col_reduce_lcl_array(
-        T * sums,
-        unsigned int th_Idx,
-        unsigned int bl_Dim)
-{
-    unsigned int step = bl_Dim >> 1;
-
-    while(step > 0)
-    {
-        if(th_Idx < step)
-            sums[th_Idx] += sums[th_Idx + step];
-        step >>= 1;
-        __syncthreads();
-    }
-}
-
-
-template <typename T>
-__global__ void house_update_QL_row_major_kernel(
-        T * QL,
-        T * V,
-        unsigned int size1,
-        unsigned int strideQ)
-{
-  __shared__ T sums[128];
-  T ss = 0;
-  for(unsigned int i = blockIdx.x; i < size1; i += gridDim.x)
-  {
-    ss = 0;
-    for(unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
-      ss = ss + (V[j] * QL[i * strideQ + j]);
-    sums[threadIdx.x] = ss;
-
-    __syncthreads();
-    col_reduce_lcl_array(sums, threadIdx.x, blockDim.x);
-    __syncthreads();
-
-    T sum_Qv = sums[0];
-
-    for(unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
-      QL[i * strideQ + j] = QL[i * strideQ + j] - (2 * V[j] * sum_Qv);
-  }
-}
-
-template <typename T>
-__global__ void house_update_QL_column_major_kernel(
-        T * QL,
-        T * V,
-        unsigned int size1,
-        unsigned int strideQ)
-{
-  __shared__ T sums[128];
-  T ss = 0;
-  for(unsigned int i = blockIdx.x; i < size1; i += gridDim.x)
-  {
-    ss = 0;
-    for(unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
-      ss = ss + (V[j] * QL[i + j * strideQ]);
-    sums[threadIdx.x] = ss;
-
-    __syncthreads();
-    col_reduce_lcl_array(sums, threadIdx.x, blockDim.x);
-    __syncthreads();
-
-    T sum_Qv = sums[0];
-
-    for(unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
-      QL[i + j * strideQ] = QL[i + j * strideQ] - (2 * V[j] * sum_Qv);
-  }
-}
-
-
-template <typename T>
-__global__ void givens_next_row_major_kernel(
-        T * matr,
-        T * cs,
-        T * ss,
-        unsigned int size,
-        unsigned int stride,
-        unsigned int start_i,
-        unsigned int end_i)
-{
-    unsigned int j = blockIdx.x * blockDim.x + threadIdx.x;
-    __shared__ T cs_lcl[256];
-    __shared__ T ss_lcl[256];
-
-    T x = (j < size) ? matr[(end_i + 1) + j * stride] : 0;
-
-    unsigned int elems_num = end_i - start_i + 1;
-    unsigned int block_num = (elems_num + blockDim.x - 1) / blockDim.x;
-
-    for(unsigned int block_id = 0; block_id < block_num; block_id++)
-    {
-        unsigned int to = min(elems_num - block_id * blockDim.x, blockDim.x);
-
-        if(threadIdx.x < to)
-        {
-            cs_lcl[threadIdx.x] = cs[end_i - (threadIdx.x + block_id * blockDim.x)];
-            ss_lcl[threadIdx.x] = ss[end_i - (threadIdx.x + block_id * blockDim.x)];
-        }
-        __syncthreads();
-        if(j < size)
-        {
-            for(unsigned int ind = 0; ind < to; ind++)
-            {
-                unsigned int i = end_i - (ind + block_id * blockDim.x);
-                T z = matr[i + j * stride];
-                T cs_val = cs_lcl[ind];
-                T ss_val = ss_lcl[ind];
-                matr[(i + 1) + j * stride] = x * cs_val + z * ss_val;
-                x = -x * ss_val + z * cs_val;
-            }
-        }
-        __syncthreads();
-     }
-     if(j < size)
-       matr[(start_i) + j * stride] = x;
-}
-
-template <typename T>
-__global__ void givens_next_column_major_kernel(
-        T * matr,
-        T * cs,
-        T * ss,
-        unsigned int size,
-        unsigned int stride,
-        unsigned int start_i,
-        unsigned int end_i)
-{
-    unsigned int j = blockIdx.x * blockDim.x + threadIdx.x;
-    __shared__ T cs_lcl[256];
-    __shared__ T ss_lcl[256];
-
-    T x = (j < size) ? matr[(end_i + 1) *stride + j] : 0;
-
-    unsigned int elems_num = end_i - start_i + 1;
-    unsigned int block_num = (elems_num + blockDim.x - 1) / blockDim.x;
-
-    for(unsigned int block_id = 0; block_id < block_num; block_id++)
-    {
-        unsigned int to = min(elems_num - block_id * blockDim.x, blockDim.x);
-
-        if(threadIdx.x < to)
-        {
-            cs_lcl[threadIdx.x] = cs[end_i - (threadIdx.x + block_id * blockDim.x)];
-            ss_lcl[threadIdx.x] = ss[end_i - (threadIdx.x + block_id * blockDim.x)];
-        }
-        __syncthreads();
-        if(j < size)
-        {
-            for(unsigned int ind = 0; ind < to; ind++)
-            {
-                unsigned int i = end_i - (ind + block_id * blockDim.x);
-                T z = matr[i *stride + j];
-                T cs_val = cs_lcl[ind];
-                T ss_val = ss_lcl[ind];
-                matr[(i + 1) * stride + j] = x * cs_val + z * ss_val;
-                x = -x * ss_val + z * cs_val;
-            }
-        }
-        __syncthreads();
-     }
-     if(j < size)
-       matr[(start_i) * stride + j] = x;
-}
-
-
-
-
-
-} // namespace cuda
-} //namespace linalg
-} //namespace viennacl
-
-
-#endif