You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ap...@apache.org on 2016/06/10 16:52:36 UTC
[31/51] [partial] mahout git commit: Revert "(nojira) add
native-viennaCL module to codebase. closes apache/mahout#241"
http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/sparse_matrix_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/sparse_matrix_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/sparse_matrix_operations.hpp
deleted file mode 100644
index 51d99e1..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/sparse_matrix_operations.hpp
+++ /dev/null
@@ -1,2809 +0,0 @@
-#ifndef VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_
-#define VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_
-
-/* =========================================================================
- Copyright (c) 2010-2016, Institute for Microelectronics,
- Institute for Analysis and Scientific Computing,
- TU Wien.
- Portions of this software are copyright by UChicago Argonne, LLC.
-
- -----------------
- ViennaCL - The Vienna Computing Library
- -----------------
-
- Project Head: Karl Rupp rupp@iue.tuwien.ac.at
-
- (A list of authors and contributors can be found in the manual)
-
- License: MIT (X11), see file LICENSE in the base directory
-============================================================================= */
-
-/** @file viennacl/linalg/cuda/sparse_matrix_operations.hpp
- @brief Implementations of operations using sparse matrices using CUDA
-*/
-
-#include "viennacl/forwards.h"
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/tools/tools.hpp"
-#include "viennacl/linalg/cuda/common.hpp"
-#include "viennacl/linalg/cuda/vector_operations.hpp"
-
-#include "viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp"
-
-//#ifdef VIENNACL_WITH_SPGEMM_RMERGE
- #include "viennacl/linalg/cuda/spgemm_rmerge.hpp"
-//#else
-// #include "viennacl/linalg/cuda/spgemm.hpp"
-//#endif
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace cuda
-{
-//
-// Compressed matrix
-//
-
-namespace detail
-{
-
- template<typename NumericT>
- __global__ void csr_row_info_extractor_kernel(
- const unsigned int * row_indices,
- const unsigned int * column_indices,
- const NumericT * elements,
- NumericT * result,
- unsigned int size,
- unsigned int option)
- {
- for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
- row < size;
- row += gridDim.x * blockDim.x)
- {
- NumericT value = 0;
- unsigned int row_end = row_indices[row+1];
-
- switch (option)
- {
- case 0: //inf-norm
- for (unsigned int i = row_indices[row]; i < row_end; ++i)
- value = max(value, fabs(elements[i]));
- break;
-
- case 1: //1-norm
- for (unsigned int i = row_indices[row]; i < row_end; ++i)
- value += fabs(elements[i]);
- break;
-
- case 2: //2-norm
- for (unsigned int i = row_indices[row]; i < row_end; ++i)
- value += elements[i] * elements[i];
- value = sqrt(value);
- break;
-
- case 3: //diagonal entry
- for (unsigned int i = row_indices[row]; i < row_end; ++i)
- {
- if (column_indices[i] == row)
- {
- value = elements[i];
- break;
- }
- }
- break;
-
- default:
- break;
- }
- result[row] = value;
- }
- }
-
-
- template<typename NumericT, unsigned int AligmentV>
- void row_info(compressed_matrix<NumericT, AligmentV> const & mat,
- vector_base<NumericT> & vec,
- viennacl::linalg::detail::row_info_types info_selector)
- {
- csr_row_info_extractor_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
- viennacl::cuda_arg<unsigned int>(mat.handle2()),
- viennacl::cuda_arg<NumericT>(mat.handle()),
- viennacl::cuda_arg(vec),
- static_cast<unsigned int>(mat.size1()),
- static_cast<unsigned int>(info_selector)
- );
- VIENNACL_CUDA_LAST_ERROR_CHECK("csr_row_info_extractor_kernel");
- }
-
- struct spmv_pure
- {
- template<typename NumericT>
- __device__ static void apply(NumericT & result, NumericT alpha, NumericT Ax, NumericT beta) { result = Ax; }
- };
-
- struct spmv_alpha_beta
- {
- template<typename NumericT>
- __device__ static void apply(NumericT & result, NumericT alpha, NumericT Ax, NumericT beta) { result = alpha * Ax + ((beta != 0) ? beta * result : 0); }
- };
-
-} //namespace detail
-
-
-
-template<unsigned int SubWarpSizeV, typename AlphaBetaHandlerT, typename NumericT>
-__global__ void compressed_matrix_vec_mul_kernel(
- const unsigned int * row_indices,
- const unsigned int * column_indices,
- const NumericT * elements,
- const NumericT * x,
- unsigned int start_x,
- unsigned int inc_x,
- NumericT alpha,
- NumericT * result,
- unsigned int start_result,
- unsigned int inc_result,
- unsigned int size_result,
- NumericT beta)
-{
- __shared__ NumericT shared_elements[512];
-
- const unsigned int id_in_row = threadIdx.x % SubWarpSizeV;
- const unsigned int block_increment = blockDim.x * ((size_result - 1) / (gridDim.x * blockDim.x) + 1);
- const unsigned int block_start = blockIdx.x * block_increment;
- const unsigned int block_stop = min(block_start + block_increment, size_result);
-
- for (unsigned int row = block_start + threadIdx.x / SubWarpSizeV;
- row < block_stop;
- row += blockDim.x / SubWarpSizeV)
- {
- NumericT dot_prod = NumericT(0);
- unsigned int row_end = row_indices[row+1];
- for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += SubWarpSizeV)
- dot_prod += elements[i] * x[column_indices[i] * inc_x + start_x];
-
- shared_elements[threadIdx.x] = dot_prod;
- if (1 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 1];
- if (2 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 2];
- if (4 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 4];
- if (8 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 8];
- if (16 < SubWarpSizeV) shared_elements[threadIdx.x] += shared_elements[threadIdx.x ^ 16];
-
- if (id_in_row == 0)
- AlphaBetaHandlerT::apply(result[row * inc_result + start_result], alpha, shared_elements[threadIdx.x], beta);
- }
-}
-
-
-template<typename AlphaBetaHandlerT, typename NumericT>
-__global__ void compressed_matrix_vec_mul_adaptive_kernel(
- const unsigned int * row_indices,
- const unsigned int * column_indices,
- const unsigned int * row_blocks,
- const NumericT * elements,
- unsigned int num_blocks,
- const NumericT * x,
- unsigned int start_x,
- unsigned int inc_x,
- NumericT alpha,
- NumericT * result,
- unsigned int start_result,
- unsigned int inc_result,
- unsigned int size_result,
- NumericT beta)
-{
- __shared__ NumericT shared_elements[1024];
-
- for (unsigned int block_id = blockIdx.x; block_id < num_blocks; block_id += gridDim.x)
- {
- unsigned int row_start = row_blocks[block_id];
- unsigned int row_stop = row_blocks[block_id + 1];
- unsigned int element_start = row_indices[row_start];
- unsigned int element_stop = row_indices[row_stop];
- unsigned int rows_to_process = row_stop - row_start;
-
- if (rows_to_process > 1) // CSR stream with one thread per row
- {
- // load to shared buffer:
- for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
- shared_elements[i - element_start] = elements[i] * x[column_indices[i] * inc_x + start_x];
-
- __syncthreads();
-
- // use one thread per row to sum:
- for (unsigned int row = row_start + threadIdx.x; row < row_stop; row += blockDim.x)
- {
- NumericT dot_prod = 0;
- unsigned int thread_row_start = row_indices[row] - element_start;
- unsigned int thread_row_stop = row_indices[row + 1] - element_start;
- for (unsigned int i = thread_row_start; i < thread_row_stop; ++i)
- dot_prod += shared_elements[i];
- AlphaBetaHandlerT::apply(result[row * inc_result + start_result], alpha, dot_prod, beta);
- }
- }
- // TODO here: Consider CSR vector for two to four rows (cf. OpenCL implementation. Experience on Fermi suggests that this may not be necessary)
- else // CSR vector for a single row
- {
- // load and sum to shared buffer:
- shared_elements[threadIdx.x] = 0;
- for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
- shared_elements[threadIdx.x] += elements[i] * x[column_indices[i] * inc_x + start_x];
-
- // reduction to obtain final result
- for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
- {
- __syncthreads();
- if (threadIdx.x < stride)
- shared_elements[threadIdx.x] += shared_elements[threadIdx.x+stride];
- }
-
- if (threadIdx.x == 0)
- AlphaBetaHandlerT::apply(result[row_start * inc_result + start_result], alpha, shared_elements[0], beta);
- }
-
- __syncthreads(); // avoid race conditions
- }
-}
-
-
-
-
-/** @brief Carries out matrix-vector multiplication with a compressed_matrix
-*
-* Implementation of the convenience expression result = prod(mat, vec);
-*
-* @param mat The matrix
-* @param vec The vector
-* @param result The result vector
-*/
-template<class NumericT, unsigned int AlignmentV>
-void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & mat,
- const viennacl::vector_base<NumericT> & vec,
- NumericT alpha,
- viennacl::vector_base<NumericT> & result,
- NumericT beta)
-{
- static bool first = true;
- static bool is_maxwell = false;
-
- // check whether the CUDA device is from the Maxwell family.
- // Only run once, because the query to the backend takes about the same time as a kernel launch (~15us), thus being too expensive to query each time.
- //
- // Note: This might result in non-optimal kernels being selected if multiple Maxwell- and non-Maxwell GPUs are available in the system and devices are switched at runtime.
- //          However, this situation is certainly rare, hence the benefits of this singleton outweigh the disadvantages encountered in such a corner case.
- if (first)
- {
- cudaDeviceProp prop;
- int device_index = 0;
-
- cudaError_t err_flag = cudaGetDevice(&device_index);
- if (err_flag == cudaSuccess)
- {
- err_flag = cudaGetDeviceProperties(&prop, device_index);
- if (err_flag == cudaSuccess && prop.major >= 5)
- is_maxwell = true;
- }
- first = false;
- }
-
- if (is_maxwell && double(mat.nnz()) / double(mat.size1()) > 6.4) // less than 10% of threads expected to idle
- {
- if (alpha < NumericT(1) || alpha > NumericT(1) || beta < 0 || beta > 0)
- compressed_matrix_vec_mul_kernel<8, detail::spmv_alpha_beta, NumericT><<<512, 256>>>( // experience on a GTX 750 Ti suggests that 8 is a substantially better choice here
- viennacl::cuda_arg<unsigned int>(mat.handle1()),
- viennacl::cuda_arg<unsigned int>(mat.handle2()),
- viennacl::cuda_arg<NumericT>(mat.handle()),
- viennacl::cuda_arg(vec),
- static_cast<unsigned int>(vec.start()),
- static_cast<unsigned int>(vec.stride()),
- alpha,
- viennacl::cuda_arg(result),
- static_cast<unsigned int>(result.start()),
- static_cast<unsigned int>(result.stride()),
- static_cast<unsigned int>(result.size()),
- beta
- );
- else
- compressed_matrix_vec_mul_kernel<8, detail::spmv_pure, NumericT><<<512, 256>>>( // experience on a GTX 750 Ti suggests that 8 is a substantially better choice here
- viennacl::cuda_arg<unsigned int>(mat.handle1()),
- viennacl::cuda_arg<unsigned int>(mat.handle2()),
- viennacl::cuda_arg<NumericT>(mat.handle()),
- viennacl::cuda_arg(vec),
- static_cast<unsigned int>(vec.start()),
- static_cast<unsigned int>(vec.stride()),
- alpha,
- viennacl::cuda_arg(result),
- static_cast<unsigned int>(result.start()),
- static_cast<unsigned int>(result.stride()),
- static_cast<unsigned int>(result.size()),
- beta
- );
- VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_vec_mul_kernel");
- }
- else if (!is_maxwell && double(mat.nnz()) / double(mat.size1()) > 12.0) // less than 25% of threads expected to idle
- {
- if (alpha < NumericT(1) || alpha > NumericT(1) || beta < 0 || beta > 0)
- compressed_matrix_vec_mul_kernel<16, detail::spmv_alpha_beta, NumericT><<<512, 256>>>( // Fermi and Kepler prefer 16 threads per row (half-warp)
- viennacl::cuda_arg<unsigned int>(mat.handle1()),
- viennacl::cuda_arg<unsigned int>(mat.handle2()),
- viennacl::cuda_arg<NumericT>(mat.handle()),
- viennacl::cuda_arg(vec),
- static_cast<unsigned int>(vec.start()),
- static_cast<unsigned int>(vec.stride()),
- alpha,
- viennacl::cuda_arg(result),
- static_cast<unsigned int>(result.start()),
- static_cast<unsigned int>(result.stride()),
- static_cast<unsigned int>(result.size()),
- beta
- );
- else
- compressed_matrix_vec_mul_kernel<16, detail::spmv_pure, NumericT><<<512, 256>>>( // Fermi and Kepler prefer 16 threads per row (half-warp)
- viennacl::cuda_arg<unsigned int>(mat.handle1()),
- viennacl::cuda_arg<unsigned int>(mat.handle2()),
- viennacl::cuda_arg<NumericT>(mat.handle()),
- viennacl::cuda_arg(vec),
- static_cast<unsigned int>(vec.start()),
- static_cast<unsigned int>(vec.stride()),
- alpha,
- viennacl::cuda_arg(result),
- static_cast<unsigned int>(result.start()),
- static_cast<unsigned int>(result.stride()),
- static_cast<unsigned int>(result.size()),
- beta
- );
- VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_vec_mul_kernel");
- }
- else
- {
- if (alpha < NumericT(1) || alpha > NumericT(1) || beta < 0 || beta > 0)
- compressed_matrix_vec_mul_adaptive_kernel<detail::spmv_alpha_beta><<<512, 256>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
- viennacl::cuda_arg<unsigned int>(mat.handle2()),
- viennacl::cuda_arg<unsigned int>(mat.handle3()),
- viennacl::cuda_arg<NumericT>(mat.handle()),
- static_cast<unsigned int>(mat.blocks1()),
- viennacl::cuda_arg(vec),
- static_cast<unsigned int>(vec.start()),
- static_cast<unsigned int>(vec.stride()),
- alpha,
- viennacl::cuda_arg(result),
- static_cast<unsigned int>(result.start()),
- static_cast<unsigned int>(result.stride()),
- static_cast<unsigned int>(result.size()),
- beta
- );
- else
- compressed_matrix_vec_mul_adaptive_kernel<detail::spmv_pure><<<512, 256>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
- viennacl::cuda_arg<unsigned int>(mat.handle2()),
- viennacl::cuda_arg<unsigned int>(mat.handle3()),
- viennacl::cuda_arg<NumericT>(mat.handle()),
- static_cast<unsigned int>(mat.blocks1()),
- viennacl::cuda_arg(vec),
- static_cast<unsigned int>(vec.start()),
- static_cast<unsigned int>(vec.stride()),
- alpha,
- viennacl::cuda_arg(result),
- static_cast<unsigned int>(result.start()),
- static_cast<unsigned int>(result.stride()),
- static_cast<unsigned int>(result.size()),
- beta
- );
- VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_vec_mul_adaptive_kernel");
- }
-}
-
-/** @brief Helper struct for accessing an element of a row- or column-major matrix.
- *
- * @param LayoutT The layout tag: Either row_major or column_major
- */
-template<typename LayoutT>
-struct mat_mult_matrix_index
-{
- static __device__ unsigned int apply(unsigned int i, unsigned int j,
- unsigned int row_start, unsigned int row_inc,
- unsigned int col_start, unsigned int col_inc,
- unsigned int internal_rows, unsigned int internal_cols)
- {
- return (row_start + i * row_inc) * internal_cols + col_start + j * col_inc;
- }
-};
-
-/** \cond */
-template<>
-struct mat_mult_matrix_index<viennacl::column_major>
-{
- static __device__ unsigned int apply(unsigned int i, unsigned int j,
- unsigned int row_start, unsigned int row_inc,
- unsigned int col_start, unsigned int col_inc,
- unsigned int internal_rows, unsigned int internal_cols)
- {
- return (row_start + i * row_inc) + (col_start + j * col_inc) * internal_rows;
- }
-};
-/** \endcond */
-
-
-template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
-__global__ void compressed_matrix_d_mat_mul_kernel(
- const unsigned int * sp_mat_row_indices,
- const unsigned int * sp_mat_col_indices,
- const NumericT * sp_mat_elements,
- const NumericT * d_mat,
- unsigned int d_mat_row_start,
- unsigned int d_mat_col_start,
- unsigned int d_mat_row_inc,
- unsigned int d_mat_col_inc,
- unsigned int d_mat_row_size,
- unsigned int d_mat_col_size,
- unsigned int d_mat_internal_rows,
- unsigned int d_mat_internal_cols,
- NumericT * result,
- unsigned int result_row_start,
- unsigned int result_col_start,
- unsigned int result_row_inc,
- unsigned int result_col_inc,
- unsigned int result_row_size,
- unsigned int result_col_size,
- unsigned int result_internal_rows,
- unsigned int result_internal_cols)
-{
- for (unsigned int row = blockIdx.x; row < result_row_size; row += gridDim.x)
- {
- unsigned int row_start = sp_mat_row_indices[row];
- unsigned int row_end = sp_mat_row_indices[row+1];
-
- for ( unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
- {
- NumericT r = 0;
-
- for (unsigned int k = row_start; k < row_end; k++)
- {
- unsigned int j = sp_mat_col_indices[k];
- NumericT x = sp_mat_elements[k];
- NumericT y = d_mat[ DMatIndexT::apply(j, col,
- d_mat_row_start, d_mat_row_inc,
- d_mat_col_start, d_mat_col_inc,
- d_mat_internal_rows, d_mat_internal_cols) ];
-
- r += x * y;
- }
-
- result[ResultIndexT::apply(row, col,
- result_row_start, result_row_inc,
- result_col_start, result_col_inc,
- result_internal_rows, result_internal_cols)] = r;
- }
- }
-}
-
-
-/** @brief Carries out sparse_matrix-dense_matrix multiplication first matrix being compressed
-*
-* Implementation of the convenience expression result = prod(mat, vec);
-*
-* @param sp_mat The sparse matrix
-* @param d_mat The dense matrix
-* @param result The result matrix
-*/
-template<typename NumericT, unsigned int AlignmentV>
-void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_mat,
- const viennacl::matrix_base<NumericT> & d_mat,
- viennacl::matrix_base<NumericT> & result)
-{
- if (d_mat.row_major() && result.row_major())
- {
- compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
- (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
- viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
- viennacl::cuda_arg<NumericT>(sp_mat.handle()),
-
- viennacl::cuda_arg(d_mat),
- static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
- static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
- static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
- static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
-
- viennacl::cuda_arg(result),
- static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
- static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
- static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
- static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
- );
- VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
- }
- else if (d_mat.row_major() && !result.row_major())
- {
- compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
- (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
- viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
- viennacl::cuda_arg<NumericT>(sp_mat.handle()),
-
- viennacl::cuda_arg(d_mat),
- static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
- static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
- static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
- static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
-
- viennacl::cuda_arg(result),
- static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
- static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
- static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
- static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
- );
- VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
- }
- else if (!d_mat.row_major() && result.row_major())
- {
- compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
- (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
- viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
- viennacl::cuda_arg<NumericT>(sp_mat.handle()),
-
- viennacl::cuda_arg(d_mat),
- static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
- static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
- static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
- static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
-
- viennacl::cuda_arg(result),
- static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
- static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
- static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
- static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
- );
- VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
- }
- else
- {
- compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
- (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
- viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
- viennacl::cuda_arg<NumericT>(sp_mat.handle()),
-
- viennacl::cuda_arg(d_mat),
- static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
- static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
- static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
- static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
-
- viennacl::cuda_arg(result),
- static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
- static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
- static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
- static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
- );
- VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
- }
-}
-
-
-template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
-__global__ void compressed_matrix_d_tr_mat_mul_kernel(
- const unsigned int * sp_mat_row_indices,
- const unsigned int * sp_mat_col_indices,
- const NumericT * sp_mat_elements,
- const NumericT * d_mat,
- unsigned int d_mat_row_start,
- unsigned int d_mat_col_start,
- unsigned int d_mat_row_inc,
- unsigned int d_mat_col_inc,
- unsigned int d_mat_row_size,
- unsigned int d_mat_col_size,
- unsigned int d_mat_internal_rows,
- unsigned int d_mat_internal_cols,
- NumericT * result,
- unsigned int result_row_start,
- unsigned int result_col_start,
- unsigned int result_row_inc,
- unsigned int result_col_inc,
- unsigned int result_row_size,
- unsigned int result_col_size,
- unsigned int result_internal_rows,
- unsigned int result_internal_cols)
-{
- for (unsigned int row = blockIdx.x; row < result_row_size; row += gridDim.x)
- {
- unsigned int row_start = sp_mat_row_indices[row];
- unsigned int row_end = sp_mat_row_indices[row+1];
-
- for ( unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
- {
- NumericT r = 0;
-
- for (unsigned int k = row_start; k < row_end; k++)
- {
- unsigned int j = sp_mat_col_indices[k];
- NumericT x = sp_mat_elements[k];
- NumericT y = d_mat[ DMatIndexT::apply(col, j,
- d_mat_row_start, d_mat_row_inc,
- d_mat_col_start, d_mat_col_inc,
- d_mat_internal_rows, d_mat_internal_cols) ];
-
- r += x * y;
- }
-
- result [ ResultIndexT::apply(row, col,
- result_row_start, result_row_inc,
- result_col_start, result_col_inc,
- result_internal_rows, result_internal_cols) ] = r;
- }
- }
-
-}
-
-/** @brief Carries out matrix-trans(matrix) multiplication first matrix being compressed
-* and the second transposed
-*
-* Implementation of the convenience expression result = prod(sp_mat, d_mat);
-*
-* @param sp_mat The sparse matrix
-* @param d_mat The transposed dense matrix proxy
-* @param result The result matrix
-*/
-template<typename NumericT, unsigned int AlignmentV>
-void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_mat,
- const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
- const viennacl::matrix_base<NumericT>,
- viennacl::op_trans > & d_mat,
- viennacl::matrix_base<NumericT> & result)
-{
-
- if (d_mat.lhs().row_major() && result.row_major())
- {
- compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
- (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
- viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
- viennacl::cuda_arg<NumericT>(sp_mat.handle()),
-
- viennacl::cuda_arg(d_mat.lhs()),
- static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
- static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
- static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
- static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
-
- viennacl::cuda_arg(result),
- static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
- static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
- static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
- static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
- );
- VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
- }
- else if (d_mat.lhs().row_major() && !result.row_major())
- {
- compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
- (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
- viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
- viennacl::cuda_arg<NumericT>(sp_mat.handle()),
-
- viennacl::cuda_arg(d_mat.lhs()),
- static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
- static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
- static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
- static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
-
- viennacl::cuda_arg(result),
- static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
- static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
- static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
- static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
- );
- VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
- }
- else if (!d_mat.lhs().row_major() && result.row_major())
- {
- compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
- (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
- viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
- viennacl::cuda_arg<NumericT>(sp_mat.handle()),
-
- viennacl::cuda_arg(d_mat.lhs()),
- static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
- static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
- static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
- static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
-
- viennacl::cuda_arg(result),
- static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
- static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
- static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
- static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
- );
- VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
- }
- else
- {
- compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
- (viennacl::cuda_arg<unsigned int>(sp_mat.handle1()),
- viennacl::cuda_arg<unsigned int>(sp_mat.handle2()),
- viennacl::cuda_arg<NumericT>(sp_mat.handle()),
-
- viennacl::cuda_arg(d_mat.lhs()),
- static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
- static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
- static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
- static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
-
- viennacl::cuda_arg(result),
- static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
- static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
- static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
- static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
- );
- VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
- }
-}
-
-
-//
-// triangular solves for compressed_matrix
-//
-
-template<typename NumericT>
-__global__ void compressed_matrix_diagonal_kernel(
- const unsigned int * row_indices,
- const unsigned int * column_indices,
- const NumericT * elements,
- NumericT * result,
- unsigned int size)
-{
- for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
- row < size;
- row += gridDim.x * blockDim.x)
- {
- NumericT diag = NumericT(0);
- unsigned int row_end = row_indices[row+1];
- for (unsigned int i = row_indices[row]; i < row_end; ++i)
- {
- unsigned int col_index = column_indices[i];
- if (col_index == row)
- {
- diag = elements[i];
- break;
- }
- }
- result[row] = diag;
- }
-}
-
-
-/** @brief Carries out triangular inplace solves
-*
-* @param mat The matrix
-* @param vec The vector holding the right hand side. Is overwritten by the solution.
-*/
-template<typename SparseMatrixT, typename NumericT>
-typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
-inplace_solve(const SparseMatrixT & mat,
- viennacl::vector_base<NumericT> & vec,
- viennacl::linalg::unit_lower_tag)
-{
- csr_unit_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
- viennacl::cuda_arg<unsigned int>(mat.handle2()),
- viennacl::cuda_arg<NumericT>(mat.handle()),
- viennacl::cuda_arg(vec),
- static_cast<unsigned int>(mat.size1())
- );
- VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_forward_kernel");
-}
-
-
-/** @brief Carries out triangular inplace solves
-*
-* @param mat The matrix
-* @param vec The vector holding the right hand side. Is overwritten by the solution.
-*/
-template<typename SparseMatrixT, typename NumericT>
-typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
-inplace_solve(const SparseMatrixT & mat,
- viennacl::vector_base<NumericT> & vec,
- viennacl::linalg::lower_tag)
-{
- csr_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
- viennacl::cuda_arg<unsigned int>(mat.handle2()),
- viennacl::cuda_arg<NumericT>(mat.handle()),
- viennacl::cuda_arg(vec),
- static_cast<unsigned int>(mat.size1())
- );
- VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_forward_kernel");
-}
-
-
-
-/** @brief Carries out triangular inplace solves
-*
-* @param mat The matrix
-* @param vec The vector holding the right hand side. Is overwritten by the solution.
-*/
-template<typename SparseMatrixT, typename NumericT>
-typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
-inplace_solve(const SparseMatrixT & mat,
- viennacl::vector_base<NumericT> & vec,
- viennacl::linalg::unit_upper_tag)
-{
- csr_unit_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
- viennacl::cuda_arg<unsigned int>(mat.handle2()),
- viennacl::cuda_arg<NumericT>(mat.handle()),
- viennacl::cuda_arg(vec),
- static_cast<unsigned int>(mat.size1())
- );
- VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_backward_kernel");
-}
-
-
-/** @brief Carries out triangular inplace solves
-*
-* @param mat The matrix
-* @param vec The vector holding the right hand side. Is overwritten by the solution.
-*/
-template<typename SparseMatrixT, typename NumericT>
-typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
-inplace_solve(const SparseMatrixT & mat,
- viennacl::vector_base<NumericT> & vec,
- viennacl::linalg::upper_tag)
-{
- csr_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
- viennacl::cuda_arg<unsigned int>(mat.handle2()),
- viennacl::cuda_arg<NumericT>(mat.handle()),
- viennacl::cuda_arg(vec),
- static_cast<unsigned int>(mat.size1())
- );
- VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_backward_kernel");
-}
-
-
-
-// transposed
-
-/** @brief Carries out triangular inplace solves
-*
-* @param mat The matrix
-* @param vec The vector holding the right hand side. Is overwritten by the solution.
-*/
-template<typename SparseMatrixT, typename NumericT>
-typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
-inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
- viennacl::vector_base<NumericT> & vec,
- viennacl::linalg::unit_lower_tag)
-{
- csr_trans_unit_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
- viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
- viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
- viennacl::cuda_arg(vec),
- static_cast<unsigned int>(mat.lhs().size1())
- );
- VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_forward_kernel");
-}
-
-
-/** @brief Carries out triangular inplace solves
-*
-* @param mat The matrix
-* @param vec The vector holding the right hand side. Is overwritten by the solution.
-*/
-template<typename SparseMatrixT, typename NumericT>
-typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
-inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
- viennacl::vector_base<NumericT> & vec,
- viennacl::linalg::lower_tag)
-{
- viennacl::vector<NumericT> diagonal(vec.size());
-
- compressed_matrix_diagonal_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
- viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
- viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
- viennacl::cuda_arg(diagonal),
- static_cast<unsigned int>(mat.size1())
- );
-
- csr_trans_lu_forward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
- viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
- viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
- viennacl::cuda_arg(diagonal),
- viennacl::cuda_arg(vec),
- static_cast<unsigned int>(mat.lhs().size1())
- );
- VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_forward_kernel");
-}
-
-
-/** @brief Carries out triangular inplace solves
-*
-* @param mat The matrix
-* @param vec The vector holding the right hand side. Is overwritten by the solution.
-*/
-template<typename SparseMatrixT, typename NumericT>
-typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
-inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
- viennacl::vector_base<NumericT> & vec,
- viennacl::linalg::unit_upper_tag)
-{
- csr_trans_unit_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
- viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
- viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
- viennacl::cuda_arg(vec),
- static_cast<unsigned int>(mat.lhs().size1())
- );
- VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_backward_kernel");
-}
-
-
-/** @brief Carries out triangular inplace solves
-*
-* @param mat The matrix
-* @param vec The vector holding the right hand side. Is overwritten by the solution.
-*/
-template<typename SparseMatrixT, typename NumericT>
-typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
-inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
- viennacl::vector_base<NumericT> & vec,
- viennacl::linalg::upper_tag)
-{
- viennacl::vector<NumericT> diagonal(vec.size());
-
- compressed_matrix_diagonal_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
- viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
- viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
- viennacl::cuda_arg(diagonal),
- static_cast<unsigned int>(mat.size1())
- );
-
- csr_trans_lu_backward_kernel<<<1, 128>>>(viennacl::cuda_arg<unsigned int>(mat.lhs().handle1()),
- viennacl::cuda_arg<unsigned int>(mat.lhs().handle2()),
- viennacl::cuda_arg<NumericT>(mat.lhs().handle()),
- viennacl::cuda_arg(diagonal),
- viennacl::cuda_arg(vec),
- static_cast<unsigned int>(mat.lhs().size1())
- );
- VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_backward_kernel");
-}
-
-namespace detail
-{
- //
- // block solves
- //
- template<typename NumericT, unsigned int AlignmentV>
- void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
- const compressed_matrix<NumericT, AlignmentV>,
- op_trans> & L,
- viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
- vector_base<NumericT> const & /* L_diagonal */, //ignored
- vector_base<NumericT> & vec,
- viennacl::linalg::unit_lower_tag)
- {
- csr_block_trans_unit_lu_forward<<<num_blocks, 128>>>(viennacl::cuda_arg<unsigned int>(L.lhs().handle1()),
- viennacl::cuda_arg<unsigned int>(L.lhs().handle2()),
- viennacl::cuda_arg<NumericT>(L.lhs().handle()),
- viennacl::cuda_arg<unsigned int>(block_indices),
- viennacl::cuda_arg(vec),
- static_cast<unsigned int>(L.lhs().size1())
- );
- }
-
-
- template<typename NumericT, unsigned int AlignmentV>
- void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
- const compressed_matrix<NumericT, AlignmentV>,
- op_trans> & U,
- viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
- vector_base<NumericT> const & U_diagonal,
- vector_base<NumericT> & vec,
- viennacl::linalg::upper_tag)
- {
- csr_block_trans_lu_backward<<<num_blocks, 128>>>(viennacl::cuda_arg<unsigned int>(U.lhs().handle1()),
- viennacl::cuda_arg<unsigned int>(U.lhs().handle2()),
- viennacl::cuda_arg<NumericT>(U.lhs().handle()),
- viennacl::cuda_arg(U_diagonal),
- viennacl::cuda_arg<unsigned int>(block_indices),
- viennacl::cuda_arg(vec),
- static_cast<unsigned int>(U.lhs().size1())
- );
- }
-
-
-}
-
-
-//
-// Compressed Compressed Matrix
-//
-
-template<typename NumericT>
-__global__ void compressed_compressed_matrix_vec_mul_kernel(
- const unsigned int * row_jumper,
- const unsigned int * row_indices,
- const unsigned int * column_indices,
- const NumericT * elements,
- unsigned int nonzero_rows,
- const NumericT * x,
- unsigned int start_x,
- unsigned int inc_x,
- NumericT alpha,
- NumericT * result,
- unsigned int start_result,
- unsigned int inc_result,
- unsigned int size_result,
- NumericT beta)
-{
- for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
- i < nonzero_rows;
- i += gridDim.x * blockDim.x)
- {
- NumericT dot_prod = NumericT(0);
- unsigned int row_end = row_jumper[i+1];
- for (unsigned int j = row_jumper[i]; j < row_end; ++j)
- dot_prod += elements[j] * x[column_indices[j] * inc_x + start_x];
-
- unsigned int index = row_indices[i] * inc_result + start_result;
- if (beta != 0) result[index] += alpha * dot_prod;
- else result[index] = alpha * dot_prod;
- }
-}
-
-
-/** @brief Carries out matrix-vector multiplication with a compressed_compressed_matrix
-*
-* Implementation of the convenience expression result = prod(mat, vec);
-*
-* @param mat The matrix
-* @param vec The vector
-* @param result The result vector
-*/
-template<typename NumericT>
-void prod_impl(const viennacl::compressed_compressed_matrix<NumericT> & mat,
- const viennacl::vector_base<NumericT> & vec,
- NumericT alpha,
- viennacl::vector_base<NumericT> & result,
- NumericT beta)
-{
- if (beta < 0 || beta > 0)
- viennacl::linalg::cuda::av(result, result, beta, 1, false, false);
- else
- result.clear();
-
- compressed_compressed_matrix_vec_mul_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle1()),
- viennacl::cuda_arg<unsigned int>(mat.handle3()),
- viennacl::cuda_arg<unsigned int>(mat.handle2()),
- viennacl::cuda_arg<NumericT>(mat.handle()),
- static_cast<unsigned int>(mat.nnz1()),
- viennacl::cuda_arg(vec),
- static_cast<unsigned int>(vec.start()),
- static_cast<unsigned int>(vec.stride()),
- alpha,
- viennacl::cuda_arg(result),
- static_cast<unsigned int>(result.start()),
- static_cast<unsigned int>(result.stride()),
- static_cast<unsigned int>(result.size()),
- beta
- );
- VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_compressed_matrix_vec_mul_kernel");
-}
-
-//
-// Coordinate Matrix
-//
-
-
-namespace detail
-{
-
- template<typename NumericT>
- __global__ void coo_row_info_extractor( const unsigned int * coords, //(row_index, column_index)
- const NumericT * elements,
- const unsigned int * group_boundaries,
- NumericT * result,
- unsigned int option)
- {
- __shared__ unsigned int shared_rows[128];
- __shared__ NumericT inter_results[128];
-
- uint2 tmp;
- NumericT val;
- unsigned int last_index = blockDim.x - 1;
- unsigned int group_start = group_boundaries[blockIdx.x];
- unsigned int group_end = group_boundaries[blockIdx.x + 1];
- unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0; // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
-
- unsigned int local_index = 0;
-
- for (unsigned int k = 0; k < k_end; ++k)
- {
- local_index = group_start + k * blockDim.x + threadIdx.x;
-
- tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
- val = (local_index < group_end && (option != 3 || tmp.x == tmp.y) ) ? elements[local_index] : 0;
-
- //check for carry from previous loop run:
- if (threadIdx.x == 0 && k > 0)
- {
- if (tmp.x == shared_rows[last_index])
- {
- switch (option)
- {
- case 0: //inf-norm
- case 3: //diagonal entry
- val = max(val, fabs(inter_results[last_index]));
- break;
-
- case 1: //1-norm
- val = fabs(val) + inter_results[last_index];
- break;
-
- case 2: //2-norm
- val = sqrt(val * val + inter_results[last_index]);
- break;
-
- default:
- break;
- }
- }
- else
- {
- switch (option)
- {
- case 0: //inf-norm
- case 1: //1-norm
- case 3: //diagonal entry
- result[shared_rows[last_index]] = inter_results[last_index];
- break;
-
- case 2: //2-norm
- result[shared_rows[last_index]] = sqrt(inter_results[last_index]);
- default:
- break;
- }
- }
- }
-
- //segmented parallel reduction begin
- __syncthreads();
- shared_rows[threadIdx.x] = tmp.x;
- switch (option)
- {
- case 0:
- case 3:
- inter_results[threadIdx.x] = val;
- break;
- case 1:
- inter_results[threadIdx.x] = fabs(val);
- break;
- case 2:
- inter_results[threadIdx.x] = val * val;
- default:
- break;
- }
- __syncthreads();
-
- for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
- {
- NumericT left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
- __syncthreads();
- switch (option)
- {
- case 0: //inf-norm
- case 3: //diagonal entry
- inter_results[threadIdx.x] = max(inter_results[threadIdx.x], left);
- break;
-
- case 1: //1-norm
- inter_results[threadIdx.x] += left;
- break;
-
- case 2: //2-norm
- inter_results[threadIdx.x] += left;
- break;
-
- default:
- break;
- }
- __syncthreads();
- }
- //segmented parallel reduction end
-
- if (threadIdx.x != last_index &&
- shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1] &&
- inter_results[threadIdx.x] != 0)
- {
- result[tmp.x] = (option == 2) ? sqrt(inter_results[threadIdx.x]) : inter_results[threadIdx.x];
- }
-
- __syncthreads();
- } //for k
-
- if (local_index + 1 == group_end && inter_results[threadIdx.x] != 0)
- result[tmp.x] = (option == 2) ? sqrt(inter_results[threadIdx.x]) : inter_results[threadIdx.x];
- }
-
- template<typename NumericT, unsigned int AlignmentV>
- void row_info(coordinate_matrix<NumericT, AlignmentV> const & mat,
- vector_base<NumericT> & vec,
- viennacl::linalg::detail::row_info_types info_selector)
- {
- coo_row_info_extractor<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle12()),
- viennacl::cuda_arg<NumericT>(mat.handle()),
- viennacl::cuda_arg<unsigned int>(mat.handle3()),
- viennacl::cuda_arg(vec),
- static_cast<unsigned int>(info_selector)
- );
- VIENNACL_CUDA_LAST_ERROR_CHECK("coo_row_info_extractor");
- }
-
-} //namespace detail
-
-
-template<typename NumericT>
-__global__ void coordinate_matrix_vec_mul_kernel(const unsigned int * coords, //(row_index, column_index)
- const NumericT * elements,
- const unsigned int * group_boundaries,
- const NumericT * x,
- unsigned int start_x,
- unsigned int inc_x,
- NumericT alpha,
- NumericT * result,
- unsigned int start_result,
- unsigned int inc_result,
- NumericT beta)
-{
- __shared__ unsigned int shared_rows[128];
- __shared__ NumericT inter_results[128];
-
- uint2 tmp;
- NumericT val;
- unsigned int group_start = group_boundaries[blockIdx.x];
- unsigned int group_end = group_boundaries[blockIdx.x + 1];
- unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0; // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
-
- unsigned int local_index = 0;
-
- for (unsigned int k = 0; k < k_end; ++k)
- {
- local_index = group_start + k * blockDim.x + threadIdx.x;
-
- tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
- val = (local_index < group_end) ? elements[local_index] * x[tmp.y * inc_x + start_x] : 0;
-
- //check for carry from previous loop run:
- if (threadIdx.x == 0 && k > 0)
- {
- if (tmp.x == shared_rows[blockDim.x-1])
- val += inter_results[blockDim.x-1];
- else if (beta != 0)
- result[shared_rows[blockDim.x-1] * inc_result + start_result] += alpha * inter_results[blockDim.x-1];
- else
- result[shared_rows[blockDim.x-1] * inc_result + start_result] = alpha * inter_results[blockDim.x-1];
- }
-
- //segmented parallel reduction begin
- __syncthreads();
- shared_rows[threadIdx.x] = tmp.x;
- inter_results[threadIdx.x] = val;
- NumericT left = 0;
- __syncthreads();
-
- for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
- {
- left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
- __syncthreads();
- inter_results[threadIdx.x] += left;
- __syncthreads();
- }
- //segmented parallel reduction end
-
- if (local_index < group_end - 1 && threadIdx.x < blockDim.x-1 &&
- shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
- {
- if (beta != 0) result[tmp.x * inc_result + start_result] += alpha * inter_results[threadIdx.x];
- else result[tmp.x * inc_result + start_result] = alpha * inter_results[threadIdx.x];
- }
-
- __syncthreads();
- } //for k
-
- if (local_index + 1 == group_end) {
- if (beta != 0) result[tmp.x * inc_result + start_result] += alpha * inter_results[threadIdx.x];
- else result[tmp.x * inc_result + start_result] = alpha * inter_results[threadIdx.x];
- }
-}
-
-
-/** @brief Carries out matrix-vector multiplication with a coordinate_matrix
-*
-* Implementation of the convenience expression result = prod(mat, vec);
-*
-* @param mat The matrix
-* @param vec The vector
-* @param result The result vector
-*/
-template<typename NumericT, unsigned int AlignmentV>
-void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & mat,
- const viennacl::vector_base<NumericT> & vec,
- NumericT alpha,
- viennacl::vector_base<NumericT> & result,
- NumericT beta)
-{
- if (beta < 0 || beta > 0)
- viennacl::linalg::cuda::av(result, result, beta, 1, false, false);
- else
- result.clear();
-
- coordinate_matrix_vec_mul_kernel<<<64, 128>>>(viennacl::cuda_arg<unsigned int>(mat.handle12()),
- viennacl::cuda_arg<NumericT>(mat.handle()),
- viennacl::cuda_arg<unsigned int>(mat.handle3()),
- viennacl::cuda_arg(vec),
- static_cast<unsigned int>(vec.start()),
- static_cast<unsigned int>(vec.stride()),
- alpha,
- viennacl::cuda_arg(result),
- static_cast<unsigned int>(result.start()),
- static_cast<unsigned int>(result.stride()),
- beta
- );
- VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_vec_mul_kernel");
-}
-
-
-
-
-template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
-__global__ void coordinate_matrix_d_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index)
- const NumericT * elements,
- const unsigned int * group_boundaries,
- const NumericT * d_mat,
- unsigned int d_mat_row_start,
- unsigned int d_mat_col_start,
- unsigned int d_mat_row_inc,
- unsigned int d_mat_col_inc,
- unsigned int d_mat_row_size,
- unsigned int d_mat_col_size,
- unsigned int d_mat_internal_rows,
- unsigned int d_mat_internal_cols,
- NumericT * result,
- unsigned int result_row_start,
- unsigned int result_col_start,
- unsigned int result_row_inc,
- unsigned int result_col_inc,
- unsigned int result_row_size,
- unsigned int result_col_size,
- unsigned int result_internal_rows,
- unsigned int result_internal_cols)
-{
- __shared__ unsigned int shared_rows[128];
- __shared__ NumericT inter_results[128];
-
- uint2 tmp;
- NumericT val;
- unsigned int group_start = group_boundaries[blockIdx.x];
- unsigned int group_end = group_boundaries[blockIdx.x + 1];
- unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0; // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
-
- unsigned int local_index = 0;
-
- for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
- {
- for (unsigned int k = 0; k < k_end; ++k)
- {
- local_index = group_start + k * blockDim.x + threadIdx.x;
-
- tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
- val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(tmp.y, result_col,
- d_mat_row_start, d_mat_row_inc,
- d_mat_col_start, d_mat_col_inc,
- d_mat_internal_rows, d_mat_internal_cols) ] : 0;
-
- //check for carry from previous loop run:
- if (threadIdx.x == 0 && k > 0)
- {
- if (tmp.x == shared_rows[blockDim.x-1])
- val += inter_results[blockDim.x-1];
- else
- result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
- result_row_start, result_row_inc,
- result_col_start, result_col_inc,
- result_internal_rows, result_internal_cols)] = inter_results[blockDim.x-1];
- }
-
- //segmented parallel reduction begin
- __syncthreads();
- shared_rows[threadIdx.x] = tmp.x;
- inter_results[threadIdx.x] = val;
- NumericT left = 0;
- __syncthreads();
-
- for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
- {
- left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
- __syncthreads();
- inter_results[threadIdx.x] += left;
- __syncthreads();
- }
- //segmented parallel reduction end
-
- if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
- shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
- {
- result[ResultIndexT::apply(tmp.x, result_col,
- result_row_start, result_row_inc,
- result_col_start, result_col_inc,
- result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
- }
-
- __syncthreads();
- } //for k
-
- if (local_index + 1 == group_end)
- result[ResultIndexT::apply(tmp.x, result_col,
- result_row_start, result_row_inc,
- result_col_start, result_col_inc,
- result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
- }
-}
-
-
-/** @brief Carries out Compressed Matrix(COO)-Dense Matrix multiplication
-*
-* Implementation of the convenience expression result = prod(sp_mat, d_mat);
-*
-* @param sp_mat The Sparse Matrix (Coordinate format)
-* @param d_mat The Dense Matrix
-* @param result The Result Matrix
-*/
-template<typename NumericT, unsigned int AlignmentV>
-void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & sp_mat,
- const viennacl::matrix_base<NumericT> & d_mat,
- viennacl::matrix_base<NumericT> & result)
-{
- if (d_mat.row_major() && result.row_major())
- {
- coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<64, 128>>>
- (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
- viennacl::cuda_arg<NumericT>(sp_mat.handle()),
- viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),
-
- viennacl::cuda_arg(d_mat),
- static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
- static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
- static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
- static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
-
- viennacl::cuda_arg(result),
- static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
- static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
- static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
- static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
- );
- VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
- }
- else if (d_mat.row_major() && !result.row_major())
- {
- coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<64, 128>>>
- (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
- viennacl::cuda_arg<NumericT>(sp_mat.handle()),
- viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),
-
- viennacl::cuda_arg(d_mat),
- static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
- static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
- static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
- static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
-
- viennacl::cuda_arg(result),
- static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
- static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
- static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
- static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
- );
- VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
- }
- else if (!d_mat.row_major() && result.row_major())
- {
- coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<64, 128>>>
- (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
- viennacl::cuda_arg<NumericT>(sp_mat.handle()),
- viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),
-
- viennacl::cuda_arg(d_mat),
- static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
- static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
- static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
- static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
-
- viennacl::cuda_arg(result),
- static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
- static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
- static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
- static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
- );
- VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
- }
- else
- {
- coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<64, 128>>>
- (viennacl::cuda_arg<unsigned int>(sp_mat.handle12()),
- viennacl::cuda_arg<NumericT>(sp_mat.handle()),
- viennacl::cuda_arg<unsigned int>(sp_mat.handle3()),
-
- viennacl::cuda_arg(d_mat),
- static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
- static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
- static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
- static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
-
- viennacl::cuda_arg(result),
- static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
- static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
- static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
- static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
- );
- VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
- }
-
-}
-
-template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
-__global__ void coordinate_matrix_d_tr_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index)
- const NumericT * elements,
- const unsigned int * group_boundaries,
- const NumericT * d_mat,
- unsigned int d_mat_row_start,
- unsigned int d_mat_col_start,
- unsigned int d_mat_row_inc,
- unsigned int d_mat_col_inc,
- unsigned int d_mat_row_size,
- unsigned int d_mat_col_size,
- unsigned int d_mat_internal_rows,
- unsigned int d_mat_internal_cols,
- NumericT * result,
- unsigned int result_row_start,
- unsigned int result_col_start,
- unsigned int result_row_inc,
- unsigned int result_col_inc,
- unsigned int result_row_size,
- unsigned int result_col_size,
- unsigned int result_internal_rows,
- unsigned int result_internal_cols)
-{
- __shared__ unsigned int shared_rows[128];
- __shared__ NumericT inter_results[128];
-
- uint2 tmp;
- NumericT val;
- unsigned int group_start = group_boundaries[blockIdx.x];
- unsigned int group_end = group_boundaries[blockIdx.x + 1];
- unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0; // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
-
-
<TRUNCATED>