Posted to commits@mahout.apache.org by ap...@apache.org on 2016/06/08 21:40:07 UTC

[10/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/compressed_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/compressed_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/compressed_matrix.hpp
new file mode 100644
index 0000000..8645e7d
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/compressed_matrix.hpp
@@ -0,0 +1,1703 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_COMPRESSED_MATRIX_HPP
+#define VIENNACL_LINALG_OPENCL_KERNELS_COMPRESSED_MATRIX_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+#include "viennacl/linalg/opencl/common.hpp"
+
+/** @file viennacl/linalg/opencl/kernels/compressed_matrix.hpp
+ *  @brief OpenCL kernel file for compressed_matrix operations */
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+
+//////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////
+
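+// Usage sketch (illustrative only; the exact compilation plumbing depends on
+// the surrounding ViennaCL OpenCL context management): each generator below
+// appends the OpenCL C source of one kernel to 'source', once per numeric
+// type, e.g.
+//
+//   std::string source;
+//   generate_compressed_matrix_jacobi(source, "float");
+//   generate_compressed_matrix_vec_mul(source, "float", true);
+//   // compile, e.g. viennacl::ocl::current_context().add_program(source, "prog");
+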
+template<typename StringT>
+void generate_compressed_matrix_block_trans_lu_backward(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void block_trans_lu_backward( \n");
+  source.append("  __global const unsigned int * row_jumper_U,  \n");     //U part (note that U is transposed in memory)
+  source.append("  __global const unsigned int * column_indices_U, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements_U, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * diagonal_U, \n");
+  source.append("  __global const unsigned int * block_offsets, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
+  source.append("  unsigned int size) \n");
+  source.append("{ \n");
+  source.append("  unsigned int col_start = block_offsets[2*get_group_id(0)]; \n");
+  source.append("  unsigned int col_stop  = block_offsets[2*get_group_id(0)+1]; \n");
+  source.append("  unsigned int row_start; \n");
+  source.append("  unsigned int row_stop; \n");
+  source.append("  "); source.append(numeric_string); source.append(" result_entry = 0; \n");
+
+  source.append("  if (col_start >= col_stop) \n");
+  source.append("    return; \n");
+
+    //backward elimination, using U and diagonal_U
+  source.append("  for (unsigned int iter = 0; iter < col_stop - col_start; ++iter) \n");
+  source.append("  { \n");
+  source.append("    unsigned int col = (col_stop - iter) - 1; \n");
+  source.append("    result_entry = result[col] / diagonal_U[col]; \n");
+  source.append("    row_start = row_jumper_U[col]; \n");
+  source.append("    row_stop  = row_jumper_U[col + 1]; \n");
+  source.append("    for (unsigned int buffer_index = row_start + get_local_id(0); buffer_index < row_stop; buffer_index += get_local_size(0)) \n");
+  source.append("      result[column_indices_U[buffer_index]] -= result_entry * elements_U[buffer_index]; \n");
+  source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");
+  source.append("  } \n");
+
+    //divide result vector by diagonal:
+  source.append("  for (unsigned int col = col_start + get_local_id(0); col < col_stop; col += get_local_size(0)) \n");
+  source.append("    result[col] /= diagonal_U[col]; \n");
+  source.append("} \n");
+}
+
+template<typename StringT>
+void generate_compressed_matrix_block_trans_unit_lu_forward(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void block_trans_unit_lu_forward( \n");
+  source.append("  __global const unsigned int * row_jumper_L,  \n");     //L part (note that L is transposed in memory)
+  source.append("  __global const unsigned int * column_indices_L, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements_L, \n");
+  source.append("  __global const unsigned int * block_offsets, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
+  source.append("  unsigned int size) \n");
+  source.append("{ \n");
+  source.append("  unsigned int col_start = block_offsets[2*get_group_id(0)]; \n");
+  source.append("  unsigned int col_stop  = block_offsets[2*get_group_id(0)+1]; \n");
+  source.append("  unsigned int row_start = row_jumper_L[col_start]; \n");
+  source.append("  unsigned int row_stop; \n");
+  source.append("  "); source.append(numeric_string); source.append(" result_entry = 0; \n");
+
+  source.append("  if (col_start >= col_stop) \n");
+  source.append("    return; \n");
+
+    //forward elimination, using L:
+  source.append("  for (unsigned int col = col_start; col < col_stop; ++col) \n");
+  source.append("  { \n");
+  source.append("    result_entry = result[col]; \n");
+  source.append("    row_stop = row_jumper_L[col + 1]; \n");
+  source.append("    for (unsigned int buffer_index = row_start + get_local_id(0); buffer_index < row_stop; buffer_index += get_local_size(0)) \n");
+  source.append("      result[column_indices_L[buffer_index]] -= result_entry * elements_L[buffer_index]; \n");
+  source.append("    row_start = row_stop; \n"); //for next iteration (avoid unnecessary loads from GPU RAM)
+  source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");
+  source.append("  } \n");
+
+  source.append("}; \n");
+}
+
+namespace detail
+{
+  /** @brief Generate kernel for C = A * B with A being a compressed_matrix, B and C dense */
+  template<typename StringT>
+  void generate_compressed_matrix_dense_matrix_mult(StringT & source, std::string const & numeric_string,
+                                                    bool B_transposed, bool B_row_major, bool C_row_major)
+  {
+    source.append("__kernel void ");
+    source.append(viennacl::linalg::opencl::detail::sparse_dense_matmult_kernel_name(B_transposed, B_row_major, C_row_major));
+    source.append("( \n");
+    source.append("  __global const unsigned int * sp_mat_row_indices, \n");
+    source.append("  __global const unsigned int * sp_mat_col_indices, \n");
+    source.append("  __global const "); source.append(numeric_string); source.append(" * sp_mat_elements, \n");
+    source.append("  __global const "); source.append(numeric_string); source.append(" * d_mat, \n");
+    source.append("  unsigned int d_mat_row_start, \n");
+    source.append("  unsigned int d_mat_col_start, \n");
+    source.append("  unsigned int d_mat_row_inc, \n");
+    source.append("  unsigned int d_mat_col_inc, \n");
+    source.append("  unsigned int d_mat_row_size, \n");
+    source.append("  unsigned int d_mat_col_size, \n");
+    source.append("  unsigned int d_mat_internal_rows, \n");
+    source.append("  unsigned int d_mat_internal_cols, \n");
+    source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
+    source.append("  unsigned int result_row_start, \n");
+    source.append("  unsigned int result_col_start, \n");
+    source.append("  unsigned int result_row_inc, \n");
+    source.append("  unsigned int result_col_inc, \n");
+    source.append("  unsigned int result_row_size, \n");
+    source.append("  unsigned int result_col_size, \n");
+    source.append("  unsigned int result_internal_rows, \n");
+    source.append("  unsigned int result_internal_cols) { \n");
+
+      // distribute the sparse matrix rows across work-groups
+    source.append("  for (unsigned int row = get_group_id(0); row < result_row_size; row += get_num_groups(0)) { \n");
+
+    source.append("    unsigned int row_start = sp_mat_row_indices[row]; \n");
+    source.append("    unsigned int row_end = sp_mat_row_indices[row+1]; \n");
+
+        // distribute the result columns across the threads of a work-group
+    source.append("    for ( unsigned int col = get_local_id(0); col < result_col_size; col += get_local_size(0) ) { \n");
+
+    source.append("      "); source.append(numeric_string); source.append(" r = 0; \n");
+
+    source.append("      for (unsigned int k = row_start; k < row_end; k ++) { \n");
+
+    source.append("        unsigned int j = sp_mat_col_indices[k]; \n");
+    source.append("        "); source.append(numeric_string); source.append(" x = sp_mat_elements[k]; \n");
+
+    source.append("        "); source.append(numeric_string);
+    if (B_transposed && B_row_major)
+      source.append(" y = d_mat[ (d_mat_row_start + col * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start +   j * d_mat_col_inc ]; \n");
+    else if (B_transposed && !B_row_major)
+      source.append(" y = d_mat[ (d_mat_row_start + col * d_mat_row_inc)                       + (d_mat_col_start +  j * d_mat_col_inc) * d_mat_internal_rows ]; \n");
+    else if (!B_transposed && B_row_major)
+      source.append(" y = d_mat[ (d_mat_row_start +   j * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + col * d_mat_col_inc ]; \n");
+    else
+      source.append(" y = d_mat[ (d_mat_row_start +   j * d_mat_row_inc)                       + (d_mat_col_start + col * d_mat_col_inc) * d_mat_internal_rows ]; \n");
+    source.append("        r += x * y; \n");
+    source.append("      } \n");
+
+    if (C_row_major)
+      source.append("      result[ (result_row_start + row * result_row_inc) * result_internal_cols + result_col_start + col * result_col_inc ] = r; \n");
+    else
+      source.append("      result[ (result_row_start + row * result_row_inc)                        + (result_col_start + col * result_col_inc) * result_internal_rows ] = r; \n");
+    source.append("    } \n");
+    source.append("  } \n");
+
+    source.append("} \n");
+
+  }
+}
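+
+// Emits all eight variants of the sparse * dense kernel above, covering each
+// combination of (B transposed?) x (B row-major?) x (C row-major?).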
+template<typename StringT>
+void generate_compressed_matrix_dense_matrix_multiplication(StringT & source, std::string const & numeric_string)
+{
+  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false, false, false);
+  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false, false,  true);
+  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false,  true, false);
+  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false,  true,  true);
+
+  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true, false, false);
+  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true, false,  true);
+  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true,  true, false);
+  detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true,  true,  true);
+}
+
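+// Weighted Jacobi sweep: for each row i the generated kernel computes
+//   x_new[i] = w * (rhs[i] - sum_{j != i} A(i,j) * x_old[j]) / A(i,i)
+//            + (1 - w) * x_old[i]
+// with the 'weight' argument playing the role of w.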
+template<typename StringT>
+void generate_compressed_matrix_jacobi(StringT & source, std::string const & numeric_string)
+{
+
+ source.append(" __kernel void jacobi( \n");
+ source.append("  __global const unsigned int * row_indices, \n");
+ source.append("  __global const unsigned int * column_indices, \n");
+ source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append("  "); source.append(numeric_string); source.append(" weight, \n");
+ source.append("  __global const "); source.append(numeric_string); source.append(" * old_result, \n");
+ source.append("  __global "); source.append(numeric_string); source.append(" * new_result, \n");
+ source.append("  __global const "); source.append(numeric_string); source.append(" * rhs, \n");
+ source.append("  unsigned int size) \n");
+ source.append("  { \n");
+ source.append("   "); source.append(numeric_string); source.append(" sum, diag=1; \n");
+ source.append("   int col; \n");
+ source.append("   for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n");
+ source.append("   { \n");
+ source.append("     sum = 0; \n");
+ source.append("     for (unsigned int j = row_indices[i]; j<row_indices[i+1]; j++) \n");
+ source.append("     { \n");
+ source.append("       col = column_indices[j]; \n");
+ source.append("       if (i == col) \n");
+ source.append("         diag = elements[j]; \n");
+ source.append("       else \n");
+ source.append("         sum += elements[j] * old_result[col]; \n");
+ source.append("     } \n");
+ source.append("     new_result[i] = weight * (rhs[i]-sum) / diag + (1-weight) * old_result[i]; \n");
+ source.append("    } \n");
+ source.append("  } \n");
+
+}
+
+
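+// Note on the substitution kernels below (lu_*, unit_lu_*): the elimination
+// itself runs on thread 0 of the work-group while the remaining threads only
+// stage data into local memory, and barrier() synchronizes a single
+// work-group only, so these kernels presumably assume a launch with exactly
+// one work-group.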
+template<typename StringT>
+void generate_compressed_matrix_lu_backward(StringT & source, std::string const & numeric_string)
+{
+  // compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format
+  source.append("__kernel void lu_backward( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * vector, \n");
+  source.append("  unsigned int size) \n");
+  source.append("{ \n");
+  source.append("  __local unsigned int col_index_buffer[128]; \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");
+
+  source.append("  unsigned int nnz = row_indices[size]; \n");
+  source.append("  unsigned int current_row = size-1; \n");
+  source.append("  unsigned int row_at_window_start = size-1; \n");
+  source.append("  "); source.append(numeric_string); source.append(" current_vector_entry = vector[size-1]; \n");
+  source.append("  "); source.append(numeric_string); source.append(" diagonal_entry = 0; \n");
+  source.append("  unsigned int loop_end = ( (nnz - 1) / get_local_size(0)) * get_local_size(0); \n");
+  source.append("  unsigned int next_row = row_indices[size-1]; \n");
+
+  source.append("  unsigned int i = loop_end + get_local_id(0); \n");
+  source.append("  while (1) \n");
+  source.append("  { \n");
+      //load into shared memory (coalesced access):
+  source.append("    if (i < nnz) \n");
+  source.append("    { \n");
+  source.append("      element_buffer[get_local_id(0)] = elements[i]; \n");
+  source.append("      unsigned int tmp = column_indices[i]; \n");
+  source.append("      col_index_buffer[get_local_id(0)] = tmp; \n");
+  source.append("      vector_buffer[get_local_id(0)] = vector[tmp]; \n");
+  source.append("    } \n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+      //now a single thread does the remaining work in shared memory:
+  source.append("    if (get_local_id(0) == 0) \n");
+  source.append("    { \n");
+        // traverse through all the loaded data from back to front:
+  source.append("      for (unsigned int k2=0; k2<get_local_size(0); ++k2) \n");
+  source.append("      { \n");
+  source.append("        unsigned int k = (get_local_size(0) - k2) - 1; \n");
+
+  source.append("        if (i+k >= nnz) \n");
+  source.append("          continue; \n");
+
+  source.append("        if (col_index_buffer[k] > row_at_window_start) \n"); //use recently computed results
+  source.append("          current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
+  source.append("        else if (col_index_buffer[k] > current_row) \n"); //use buffered data
+  source.append("          current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
+  source.append("        else if (col_index_buffer[k] == current_row) \n");
+  source.append("          diagonal_entry = element_buffer[k]; \n");
+
+  source.append("        if (i+k == next_row) \n"); //current row is finished. Write back result
+  source.append("        { \n");
+  source.append("          vector[current_row] = current_vector_entry / diagonal_entry; \n");
+  source.append("          if (current_row > 0) //load next row's data \n");
+  source.append("          { \n");
+  source.append("            --current_row; \n");
+  source.append("            next_row = row_indices[current_row]; \n");
+  source.append("            current_vector_entry = vector[current_row]; \n");
+  source.append("          } \n");
+  source.append("        } \n");
+
+
+  source.append("      } \n"); // for k
+
+  source.append("      row_at_window_start = current_row; \n");
+  source.append("    } \n"); // if (get_local_id(0) == 0)
+
+  source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");
+
+  source.append("    if (i < get_local_size(0)) \n");
+  source.append("      break; \n");
+
+  source.append("    i -= get_local_size(0); \n");
+  source.append("  } \n"); //for i
+  source.append("} \n");
+
+}
+
+template<typename StringT>
+void generate_compressed_matrix_lu_forward(StringT & source, std::string const & numeric_string)
+{
+
+  // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
+  source.append("__kernel void lu_forward( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * vector, \n");
+  source.append("  unsigned int size) \n");
+  source.append("{ \n");
+  source.append("  __local unsigned int col_index_buffer[128]; \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");
+
+  source.append("  unsigned int nnz = row_indices[size]; \n");
+  source.append("  unsigned int current_row = 0; \n");
+  source.append("  unsigned int row_at_window_start = 0; \n");
+  source.append("  "); source.append(numeric_string); source.append(" current_vector_entry = vector[0]; \n");
+  source.append("  "); source.append(numeric_string); source.append(" diagonal_entry; \n");
+  source.append("  unsigned int loop_end = (nnz / get_local_size(0) + 1) * get_local_size(0); \n");
+  source.append("  unsigned int next_row = row_indices[1]; \n");
+
+  source.append("  for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
+  source.append("  { \n");
+      //load into shared memory (coalesced access):
+  source.append("    if (i < nnz) \n");
+  source.append("    { \n");
+  source.append("      element_buffer[get_local_id(0)] = elements[i]; \n");
+  source.append("      unsigned int tmp = column_indices[i]; \n");
+  source.append("      col_index_buffer[get_local_id(0)] = tmp; \n");
+  source.append("      vector_buffer[get_local_id(0)] = vector[tmp]; \n");
+  source.append("    } \n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+      //now a single thread does the remaining work in shared memory:
+  source.append("    if (get_local_id(0) == 0) \n");
+  source.append("    { \n");
+        // traverse through all the loaded data:
+  source.append("      for (unsigned int k=0; k<get_local_size(0); ++k) \n");
+  source.append("      { \n");
+  source.append("        if (current_row < size && i+k == next_row) \n"); //current row is finished. Write back result
+  source.append("        { \n");
+  source.append("          vector[current_row] = current_vector_entry / diagonal_entry; \n");
+  source.append("          ++current_row; \n");
+  source.append("          if (current_row < size) \n"); //load next row's data
+  source.append("          { \n");
+  source.append("            next_row = row_indices[current_row+1]; \n");
+  source.append("            current_vector_entry = vector[current_row]; \n");
+  source.append("          } \n");
+  source.append("        } \n");
+
+  source.append("        if (current_row < size && col_index_buffer[k] < current_row) \n"); //substitute
+  source.append("        { \n");
+  source.append("          if (col_index_buffer[k] < row_at_window_start) \n"); //use recently computed results
+  source.append("            current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
+  source.append("          else if (col_index_buffer[k] < current_row) \n"); //use buffered data
+  source.append("            current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
+  source.append("        } \n");
+  source.append("        else if (col_index_buffer[k] == current_row) \n");
+  source.append("          diagonal_entry = element_buffer[k]; \n");
+
+  source.append("      } \n"); // for k
+
+  source.append("      row_at_window_start = current_row; \n");
+  source.append("    } \n"); // if (get_local_id(0) == 0)
+
+  source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");
+  source.append("  } \n"); //for i
+  source.append("} \n");
+
+}
+
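+// The 'option' argument of row_info_extractor selects the per-row quantity:
+// 0 = inf-norm, 1 = 1-norm, 2 = 2-norm, 3 = diagonal entry.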
+template<typename StringT>
+void generate_compressed_matrix_row_info_extractor(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void row_info_extractor( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
+  source.append("  unsigned int size, \n");
+  source.append("  unsigned int option \n");
+  source.append("  ) \n");
+  source.append("{ \n");
+  source.append("  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0)) \n");
+  source.append("  { \n");
+  source.append("    "); source.append(numeric_string); source.append(" value = 0; \n");
+  source.append("    unsigned int row_end = row_indices[row+1]; \n");
+
+  source.append("    switch (option) \n");
+  source.append("    { \n");
+  source.append("      case 0: \n"); //inf-norm
+  source.append("        for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
+  source.append("          value = max(value, fabs(elements[i])); \n");
+  source.append("        break; \n");
+
+  source.append("      case 1: \n"); //1-norm
+  source.append("        for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
+  source.append("          value += fabs(elements[i]); \n");
+  source.append("        break; \n");
+
+  source.append("      case 2: \n"); //2-norm
+  source.append("        for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
+  source.append("          value += elements[i] * elements[i]; \n");
+  source.append("        value = sqrt(value); \n");
+  source.append("        break; \n");
+
+  source.append("      case 3: \n"); //diagonal entry
+  source.append("        for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
+  source.append("        { \n");
+  source.append("          if (column_indices[i] == row) \n");
+  source.append("          { \n");
+  source.append("            value = elements[i]; \n");
+  source.append("            break; \n");
+  source.append("          } \n");
+  source.append("        } \n");
+  source.append("        break; \n");
+
+  source.append("      default: \n");
+  source.append("        break; \n");
+  source.append("    } \n");
+  source.append("    result[row] = value; \n");
+  source.append("  } \n");
+  source.append("} \n");
+
+}
+
+template<typename StringT>
+void generate_compressed_matrix_trans_lu_backward(StringT & source, std::string const & numeric_string)
+{
+
+  // compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format
+  source.append("__kernel void trans_lu_backward( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * diagonal_entries, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * vector, \n");
+  source.append("  unsigned int size) \n");
+  source.append("{ \n");
+  source.append("  __local unsigned int row_index_lookahead[256]; \n");
+  source.append("  __local unsigned int row_index_buffer[256]; \n");
+
+  source.append("  unsigned int row_index; \n");
+  source.append("  unsigned int col_index; \n");
+  source.append("  "); source.append(numeric_string); source.append(" matrix_entry; \n");
+  source.append("  unsigned int nnz = row_indices[size]; \n");
+  source.append("  unsigned int row_at_window_start = size; \n");
+  source.append("  unsigned int row_at_window_end; \n");
+  source.append("  unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");
+
+  source.append("  for (unsigned int i2 = get_local_id(0); i2 < loop_end; i2 += get_local_size(0)) \n");
+  source.append("  { \n");
+  source.append("    unsigned int i = (nnz - i2) - 1; \n");
+  source.append("    col_index    = (i2 < nnz) ? column_indices[i] : 0; \n");
+  source.append("    matrix_entry = (i2 < nnz) ? elements[i]       : 0; \n");
+  source.append("    row_index_lookahead[get_local_id(0)] = (row_at_window_start >= get_local_id(0)) ? row_indices[row_at_window_start - get_local_id(0)] : 0; \n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  source.append("    if (i2 < nnz) \n");
+  source.append("    { \n");
+  source.append("      unsigned int row_index_dec = 0; \n");
+  source.append("      while (row_index_lookahead[row_index_dec] > i) \n");
+  source.append("        ++row_index_dec; \n");
+  source.append("      row_index = row_at_window_start - row_index_dec; \n");
+  source.append("      row_index_buffer[get_local_id(0)] = row_index; \n");
+  source.append("    } \n");
+  source.append("    else \n");
+  source.append("    { \n");
+  source.append("      row_index = size+1; \n");
+  source.append("      row_index_buffer[get_local_id(0)] = 0; \n");
+  source.append("    } \n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  source.append("    row_at_window_start = row_index_buffer[0]; \n");
+  source.append("    row_at_window_end   = row_index_buffer[get_local_size(0) - 1]; \n");
+
+      //backward elimination
+  source.append("    for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2) \n");
+  source.append("    { \n");
+  source.append("      unsigned int row = row_at_window_start - row2; \n");
+  source.append("      "); source.append(numeric_string); source.append(" result_entry = vector[row] / diagonal_entries[row]; \n");
+
+  source.append("      if ( (row_index == row) && (col_index < row) ) \n");
+  source.append("        vector[col_index] -= result_entry * matrix_entry; \n");
+
+  source.append("      barrier(CLK_GLOBAL_MEM_FENCE); \n");
+  source.append("    } \n");
+
+  source.append("    row_at_window_start = row_at_window_end; \n");
+  source.append("  } \n");
+
+    // final step: Divide vector by diagonal entries:
+  source.append("  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0)) \n");
+  source.append("    vector[i] /= diagonal_entries[i]; \n");
+  source.append("} \n");
+
+}
+
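+// The trans_* substitution kernels below operate on a matrix stored
+// transposed in CSR format (cf. the block_trans_* kernels above): each thread
+// holds one nonzero (col_index, matrix_entry), first recovers the row it
+// belongs to by scanning the row_indices window cached in
+// row_index_lookahead, and the rows of the window are then eliminated one by
+// one with a barrier between rows.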
+template<typename StringT>
+void generate_compressed_matrix_trans_lu_forward(StringT & source, std::string const & numeric_string)
+{
+
+  // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
+  source.append("__kernel void trans_lu_forward( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * diagonal_entries, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * vector, \n");
+  source.append("  unsigned int size) \n");
+  source.append("{ \n");
+  source.append("  __local unsigned int row_index_lookahead[256]; \n");
+  source.append("  __local unsigned int row_index_buffer[256]; \n");
+
+  source.append("  unsigned int row_index; \n");
+  source.append("  unsigned int col_index; \n");
+  source.append("  "); source.append(numeric_string); source.append(" matrix_entry; \n");
+  source.append("  unsigned int nnz = row_indices[size]; \n");
+  source.append("  unsigned int row_at_window_start = 0; \n");
+  source.append("  unsigned int row_at_window_end = 0; \n");
+  source.append("  unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");
+
+  source.append("  for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
+  source.append("  { \n");
+  source.append("    col_index    = (i < nnz) ? column_indices[i] : 0; \n");
+  source.append("    matrix_entry = (i < nnz) ? elements[i]       : 0; \n");
+  source.append("    row_index_lookahead[get_local_id(0)] = (row_at_window_start + get_local_id(0) < size) ? row_indices[row_at_window_start + get_local_id(0)] : nnz; \n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  source.append("    if (i < nnz) \n");
+  source.append("    { \n");
+  source.append("      unsigned int row_index_inc = 0; \n");
+  source.append("      while (i >= row_index_lookahead[row_index_inc + 1]) \n");
+  source.append("        ++row_index_inc; \n");
+  source.append("      row_index = row_at_window_start + row_index_inc; \n");
+  source.append("      row_index_buffer[get_local_id(0)] = row_index; \n");
+  source.append("    } \n");
+  source.append("    else \n");
+  source.append("    { \n");
+  source.append("      row_index = size+1; \n");
+  source.append("      row_index_buffer[get_local_id(0)] = size - 1; \n");
+  source.append("    } \n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  source.append("    row_at_window_start = row_index_buffer[0]; \n");
+  source.append("    row_at_window_end   = row_index_buffer[get_local_size(0) - 1]; \n");
+
+      //forward elimination
+  source.append("    for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row) \n");
+  source.append("    { \n");
+  source.append("      "); source.append(numeric_string); source.append(" result_entry = vector[row] / diagonal_entries[row]; \n");
+
+  source.append("      if ( (row_index == row) && (col_index > row) ) \n");
+  source.append("        vector[col_index] -= result_entry * matrix_entry; \n");
+
+  source.append("      barrier(CLK_GLOBAL_MEM_FENCE); \n");
+  source.append("    } \n");
+
+  source.append("    row_at_window_start = row_at_window_end; \n");
+  source.append("  } \n");
+
+    // final step: Divide vector by diagonal entries:
+  source.append("  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0)) \n");
+  source.append("    vector[i] /= diagonal_entries[i]; \n");
+  source.append("} \n");
+
+}
+
+template<typename StringT>
+void generate_compressed_matrix_trans_unit_lu_backward(StringT & source, std::string const & numeric_string)
+{
+
+  // compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format
+  source.append("__kernel void trans_unit_lu_backward( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * vector, \n");
+  source.append("  unsigned int size) \n");
+  source.append("{ \n");
+  source.append("  __local unsigned int row_index_lookahead[256]; \n");
+  source.append("  __local unsigned int row_index_buffer[256]; \n");
+
+  source.append("  unsigned int row_index; \n");
+  source.append("  unsigned int col_index; \n");
+  source.append("  "); source.append(numeric_string); source.append(" matrix_entry; \n");
+  source.append("  unsigned int nnz = row_indices[size]; \n");
+  source.append("  unsigned int row_at_window_start = size; \n");
+  source.append("  unsigned int row_at_window_end; \n");
+  source.append("  unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");
+
+  source.append("  for (unsigned int i2 = get_local_id(0); i2 < loop_end; i2 += get_local_size(0)) \n");
+  source.append("  { \n");
+  source.append("    unsigned int i = (nnz - i2) - 1; \n");
+  source.append("    col_index    = (i2 < nnz) ? column_indices[i] : 0; \n");
+  source.append("    matrix_entry = (i2 < nnz) ? elements[i]       : 0; \n");
+  source.append("    row_index_lookahead[get_local_id(0)] = (row_at_window_start >= get_local_id(0)) ? row_indices[row_at_window_start - get_local_id(0)] : 0; \n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  source.append("    if (i2 < nnz) \n");
+  source.append("    { \n");
+  source.append("      unsigned int row_index_dec = 0; \n");
+  source.append("      while (row_index_lookahead[row_index_dec] > i) \n");
+  source.append("        ++row_index_dec; \n");
+  source.append("      row_index = row_at_window_start - row_index_dec; \n");
+  source.append("      row_index_buffer[get_local_id(0)] = row_index; \n");
+  source.append("    } \n");
+  source.append("    else \n");
+  source.append("    { \n");
+  source.append("      row_index = size+1; \n");
+  source.append("      row_index_buffer[get_local_id(0)] = 0; \n");
+  source.append("    } \n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  source.append("    row_at_window_start = row_index_buffer[0]; \n");
+  source.append("    row_at_window_end   = row_index_buffer[get_local_size(0) - 1]; \n");
+
+      //backward elimination
+  source.append("    for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2) \n");
+  source.append("    { \n");
+  source.append("      unsigned int row = row_at_window_start - row2; \n");
+  source.append("      "); source.append(numeric_string); source.append(" result_entry = vector[row]; \n");
+
+  source.append("      if ( (row_index == row) && (col_index < row) ) \n");
+  source.append("        vector[col_index] -= result_entry * matrix_entry; \n");
+
+  source.append("      barrier(CLK_GLOBAL_MEM_FENCE); \n");
+  source.append("    } \n");
+
+  source.append("    row_at_window_start = row_at_window_end; \n");
+  source.append("  } \n");
+  source.append("} \n");
+
+}
+
+
+template<typename StringT>
+void generate_compressed_matrix_trans_unit_lu_forward(StringT & source, std::string const & numeric_string)
+{
+
+  // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
+  source.append("__kernel void trans_unit_lu_forward( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * vector, \n");
+  source.append("  unsigned int size) \n");
+  source.append("{ \n");
+  source.append("  __local unsigned int row_index_lookahead[256]; \n");
+  source.append("  __local unsigned int row_index_buffer[256]; \n");
+
+  source.append("  unsigned int row_index; \n");
+  source.append("  unsigned int col_index; \n");
+  source.append("  "); source.append(numeric_string); source.append(" matrix_entry; \n");
+  source.append("  unsigned int nnz = row_indices[size]; \n");
+  source.append("  unsigned int row_at_window_start = 0; \n");
+  source.append("  unsigned int row_at_window_end = 0; \n");
+  source.append("  unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");
+
+  source.append("  for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
+  source.append("  { \n");
+  source.append("    col_index    = (i < nnz) ? column_indices[i] : 0; \n");
+  source.append("    matrix_entry = (i < nnz) ? elements[i]       : 0; \n");
+  source.append("    row_index_lookahead[get_local_id(0)] = (row_at_window_start + get_local_id(0) < size) ? row_indices[row_at_window_start + get_local_id(0)] : nnz; \n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  source.append("    if (i < nnz) \n");
+  source.append("    { \n");
+  source.append("      unsigned int row_index_inc = 0; \n");
+  source.append("      while (i >= row_index_lookahead[row_index_inc + 1]) \n");
+  source.append("        ++row_index_inc; \n");
+  source.append("      row_index = row_at_window_start + row_index_inc; \n");
+  source.append("      row_index_buffer[get_local_id(0)] = row_index; \n");
+  source.append("    } \n");
+  source.append("    else \n");
+  source.append("    { \n");
+  source.append("      row_index = size+1; \n");
+  source.append("      row_index_buffer[get_local_id(0)] = size - 1; \n");
+  source.append("    } \n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  source.append("    row_at_window_start = row_index_buffer[0]; \n");
+  source.append("    row_at_window_end   = row_index_buffer[get_local_size(0) - 1]; \n");
+
+      //forward elimination
+  source.append("    for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row) \n");
+  source.append("    { \n");
+  source.append("      "); source.append(numeric_string); source.append(" result_entry = vector[row]; \n");
+
+  source.append("      if ( (row_index == row) && (col_index > row) ) \n");
+  source.append("        vector[col_index] -= result_entry * matrix_entry; \n");
+
+  source.append("      barrier(CLK_GLOBAL_MEM_FENCE); \n");
+  source.append("    } \n");
+
+  source.append("    row_at_window_start = row_at_window_end; \n");
+  source.append("  } \n");
+  source.append("} \n");
+
+}
+
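+// 'Slow' reference variant: rows are processed strictly one after another,
+// with the threads of the work-group splitting the entries of the current
+// row among themselves; simpler than the windowed variant above, at the cost
+// of parallelism across rows.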
+template<typename StringT>
+void generate_compressed_matrix_trans_unit_lu_forward_slow(StringT & source, std::string const & numeric_string)
+{
+
+  // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
+  source.append("__kernel void trans_unit_lu_forward_slow( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * vector, \n");
+  source.append("  unsigned int size) \n");
+  source.append("{ \n");
+  source.append("  for (unsigned int row = 0; row < size; ++row) \n");
+  source.append("  { \n");
+  source.append("    "); source.append(numeric_string); source.append(" result_entry = vector[row]; \n");
+
+  source.append("    unsigned int row_start = row_indices[row]; \n");
+  source.append("    unsigned int row_stop  = row_indices[row + 1]; \n");
+  source.append("    for (unsigned int entry_index = row_start + get_local_id(0); entry_index < row_stop; entry_index += get_local_size(0)) \n");
+  source.append("    { \n");
+  source.append("      unsigned int col_index = column_indices[entry_index]; \n");
+  source.append("      if (col_index > row) \n");
+  source.append("        vector[col_index] -= result_entry * elements[entry_index]; \n");
+  source.append("    } \n");
+
+  source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");
+  source.append("  } \n");
+  source.append("} \n");
+
+}
+
+template<typename StringT>
+void generate_compressed_matrix_unit_lu_backward(StringT & source, std::string const & numeric_string)
+{
+
+  // compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format
+  source.append("__kernel void unit_lu_backward( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * vector, \n");
+  source.append("  unsigned int size) \n");
+  source.append("{ \n");
+  source.append("  __local  unsigned int col_index_buffer[128]; \n");
+  source.append("  __local  "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
+  source.append("  __local  "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");
+
+  source.append("  unsigned int nnz = row_indices[size]; \n");
+  source.append("  unsigned int current_row = size-1; \n");
+  source.append("  unsigned int row_at_window_start = size-1; \n");
+  source.append("  "); source.append(numeric_string); source.append(" current_vector_entry = vector[size-1]; \n");
+  source.append("  unsigned int loop_end = ( (nnz - 1) / get_local_size(0)) * get_local_size(0); \n");
+  source.append("  unsigned int next_row = row_indices[size-1]; \n");
+
+  source.append("  unsigned int i = loop_end + get_local_id(0); \n");
+  source.append("  while (1) \n");
+  source.append("  { \n");
+      //load into shared memory (coalesced access):
+  source.append("    if (i < nnz) \n");
+  source.append("    { \n");
+  source.append("      element_buffer[get_local_id(0)] = elements[i]; \n");
+  source.append("      unsigned int tmp = column_indices[i]; \n");
+  source.append("      col_index_buffer[get_local_id(0)] = tmp; \n");
+  source.append("      vector_buffer[get_local_id(0)] = vector[tmp]; \n");
+  source.append("    } \n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+      //now a single thread does the remaining work in shared memory:
+  source.append("    if (get_local_id(0) == 0) \n");
+  source.append("    { \n");
+      // traverse through all the loaded data from back to front:
+  source.append("      for (unsigned int k2=0; k2<get_local_size(0); ++k2) \n");
+  source.append("      { \n");
+  source.append("        unsigned int k = (get_local_size(0) - k2) - 1; \n");
+
+  source.append("        if (i+k >= nnz) \n");
+  source.append("          continue; \n");
+
+  source.append("        if (col_index_buffer[k] > row_at_window_start) \n"); //use recently computed results
+  source.append("          current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
+  source.append("        else if (col_index_buffer[k] > current_row) \n"); //use buffered data
+  source.append("          current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
+
+  source.append("        if (i+k == next_row) \n"); //current row is finished. Write back result
+  source.append("        { \n");
+  source.append("          vector[current_row] = current_vector_entry; \n");
+  source.append("          if (current_row > 0) \n"); //load next row's data
+  source.append("          { \n");
+  source.append("            --current_row; \n");
+  source.append("            next_row = row_indices[current_row]; \n");
+  source.append("            current_vector_entry = vector[current_row]; \n");
+  source.append("          } \n");
+  source.append("        } \n");
+
+
+  source.append("      } \n"); // for k
+
+  source.append("      row_at_window_start = current_row; \n");
+  source.append("    } \n"); // if (get_local_id(0) == 0)
+
+  source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");
+
+  source.append("    if (i < get_local_size(0)) \n");
+  source.append("      break; \n");
+
+  source.append("    i -= get_local_size(0); \n");
+  source.append("  } \n"); //for i
+  source.append("} \n");
+
+}
+
+template<typename StringT>
+void generate_compressed_matrix_unit_lu_forward(StringT & source, std::string const & numeric_string)
+{
+
+  // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
+  source.append("__kernel void unit_lu_forward( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * vector, \n");
+  source.append("  unsigned int size) \n");
+  source.append("{ \n");
+  source.append("  __local  unsigned int col_index_buffer[128]; \n");
+  source.append("  __local  "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
+  source.append("  __local  "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");
+
+  source.append("  unsigned int nnz = row_indices[size]; \n");
+  source.append("  unsigned int current_row = 0; \n");
+  source.append("  unsigned int row_at_window_start = 0; \n");
+  source.append("  "); source.append(numeric_string); source.append(" current_vector_entry = vector[0]; \n");
+  source.append("  unsigned int loop_end = (nnz / get_local_size(0) + 1) * get_local_size(0); \n");
+  source.append("  unsigned int next_row = row_indices[1]; \n");
+
+  source.append("  for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
+  source.append("  { \n");
+      //load into shared memory (coalesced access):
+  source.append("    if (i < nnz) \n");
+  source.append("    { \n");
+  source.append("      element_buffer[get_local_id(0)] = elements[i]; \n");
+  source.append("      unsigned int tmp = column_indices[i]; \n");
+  source.append("      col_index_buffer[get_local_id(0)] = tmp; \n");
+  source.append("      vector_buffer[get_local_id(0)] = vector[tmp]; \n");
+  source.append("    } \n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+      //now a single thread does the remaining work in shared memory:
+  source.append("    if (get_local_id(0) == 0) \n");
+  source.append("    { \n");
+        // traverse through all the loaded data:
+  source.append("      for (unsigned int k=0; k<get_local_size(0); ++k) \n");
+  source.append("      { \n");
+  source.append("        if (i+k == next_row) \n"); //current row is finished. Write back result
+  source.append("        { \n");
+  source.append("          vector[current_row] = current_vector_entry; \n");
+  source.append("          ++current_row; \n");
+  source.append("          if (current_row < size) //load next row's data \n");
+  source.append("          { \n");
+  source.append("            next_row = row_indices[current_row+1]; \n");
+  source.append("            current_vector_entry = vector[current_row]; \n");
+  source.append("          } \n");
+  source.append("        } \n");
+
+  source.append("        if (current_row < size && col_index_buffer[k] < current_row) \n"); //substitute
+  source.append("        { \n");
+  source.append("          if (col_index_buffer[k] < row_at_window_start) \n"); //use recently computed results
+  source.append("            current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
+  source.append("          else if (col_index_buffer[k] < current_row) \n"); //use buffered data
+  source.append("            current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
+  source.append("        } \n");
+
+  source.append("      } \n"); // for k
+
+  source.append("      row_at_window_start = current_row; \n");
+  source.append("    } \n"); // if (get_local_id(0) == 0)
+
+  source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");
+  source.append("  } //for i \n");
+  source.append("} \n");
+
+}
+
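+// NVIDIA-oriented SpMV: each row is handled by a subwarp of 'subwarp_size'
+// threads (assumed a power of two by the reduction below). Every thread
+// accumulates a partial dot product, the subwarp combines the partials with
+// an XOR-based butterfly reduction in local memory, and lane 0 writes the
+// result.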
+template<typename StringT>
+void generate_compressed_matrix_vec_mul_nvidia(StringT & source, std::string const & numeric_string, unsigned int subwarp_size, bool with_alpha_beta)
+{
+  std::stringstream ss;
+  ss << subwarp_size;
+
+  if (with_alpha_beta)
+    source.append("__kernel void vec_mul_nvidia_alpha_beta( \n");
+  else
+    source.append("__kernel void vec_mul_nvidia( \n");
+  source.append("    __global const unsigned int * row_indices, \n");
+  source.append("    __global const unsigned int * column_indices, \n");
+  source.append("  __global const unsigned int * row_blocks, \n");
+  source.append("    __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  unsigned int num_blocks, \n");
+  source.append("    __global const "); source.append(numeric_string); source.append(" * x, \n");
+  source.append("    uint4 layout_x, \n");
+  if (with_alpha_beta) { source.append("    "); source.append(numeric_string); source.append(" alpha, \n"); }
+  source.append("    __global "); source.append(numeric_string); source.append(" * result, \n");
+  source.append("    uint4 layout_result \n");
+  if (with_alpha_beta) { source.append("    , "); source.append(numeric_string); source.append(" beta \n"); }
+  source.append(") { \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" shared_elements[256]; \n");
+
+  source.append("  const unsigned int id_in_row = get_local_id(0) % " + ss.str() + "; \n");
+  source.append("  const unsigned int block_increment = get_local_size(0) * ((layout_result.z - 1) / (get_global_size(0)) + 1); \n");
+  source.append("  const unsigned int block_start = get_group_id(0) * block_increment; \n");
+  source.append("  const unsigned int block_stop  = min(block_start + block_increment, layout_result.z); \n");
+
+  source.append("  for (unsigned int row  = block_start + get_local_id(0) / " + ss.str() + "; \n");
+  source.append("                    row  < block_stop; \n");
+  source.append("                    row += get_local_size(0) / " + ss.str() + ") \n");
+  source.append("  { \n");
+  source.append("    "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
+  source.append("    unsigned int row_end = row_indices[row+1]; \n");
+  source.append("    for (unsigned int i = row_indices[row] + id_in_row; i < row_end; i += " + ss.str() + ") \n");
+  source.append("      dot_prod += elements[i] * x[column_indices[i] * layout_x.y + layout_x.x]; \n");
+
+  source.append("    shared_elements[get_local_id(0)] = dot_prod; \n");
+  source.append("    #pragma unroll \n");
+  source.append("    for (unsigned int k = 1; k < " + ss.str() + "; k *= 2) \n");
+  source.append("      shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) ^ k]; \n");
+
+  source.append("    if (id_in_row == 0) \n");
+  if (with_alpha_beta)
+    source.append("      result[row * layout_result.y + layout_result.x] = alpha * shared_elements[get_local_id(0)] + ((beta != 0) ? beta * result[row * layout_result.y + layout_result.x] : 0); \n");
+  else
+    source.append("      result[row * layout_result.y + layout_result.x] = shared_elements[get_local_id(0)]; \n");
+  source.append("  } \n");
+  source.append("} \n");
+
+}
+
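+// CSR-adaptive SpMV: row_blocks partitions the rows such that the nonzeros
+// of each block are expected to fit the 1024-entry shared buffer. Depending
+// on how many rows a block spans, one of three strategies is used:
+//   > 4 rows: CSR stream (one thread sums each row),
+//   2-4 rows: CSR stream with a local reduction per row,
+//   1 row:    classic CSR vector with a tree reduction.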
+template<typename StringT>
+void generate_compressed_matrix_vec_mul(StringT & source, std::string const & numeric_string, bool with_alpha_beta)
+{
+  if (with_alpha_beta)
+    source.append("__kernel void vec_mul_alpha_beta( \n");
+  else
+    source.append("__kernel void vec_mul( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const unsigned int * row_blocks, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  unsigned int num_blocks, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * x, \n");
+  source.append("  uint4 layout_x, \n");
+  if (with_alpha_beta) { source.append("  "); source.append(numeric_string); source.append(" alpha, \n"); }
+  source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
+  source.append("  uint4 layout_result \n");
+  if (with_alpha_beta) { source.append("  , "); source.append(numeric_string); source.append(" beta \n"); }
+  source.append(") { \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" shared_elements[1024]; \n");
+
+  source.append("  unsigned int row_start = row_blocks[get_group_id(0)]; \n");
+  source.append("  unsigned int row_stop  = row_blocks[get_group_id(0) + 1]; \n");
+  source.append("  unsigned int rows_to_process = row_stop - row_start; \n");
+  source.append("  unsigned int element_start = row_indices[row_start]; \n");
+  source.append("  unsigned int element_stop = row_indices[row_stop]; \n");
+
+  source.append("  if (rows_to_process > 4) { \n"); // CSR stream
+      // load to shared buffer:
+  source.append("    for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
+  source.append("      shared_elements[i - element_start] = elements[i] * x[column_indices[i] * layout_x.y + layout_x.x]; \n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+      // use one thread per row to sum:
+  source.append("    for (unsigned int row = row_start + get_local_id(0); row < row_stop; row += get_local_size(0)) { \n");
+  source.append("      "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
+  source.append("      unsigned int thread_row_start = row_indices[row]     - element_start; \n");
+  source.append("      unsigned int thread_row_stop  = row_indices[row + 1] - element_start; \n");
+  source.append("      for (unsigned int i = thread_row_start; i < thread_row_stop; ++i) \n");
+  source.append("        dot_prod += shared_elements[i]; \n");
+  if (with_alpha_beta)
+    source.append("      result[row * layout_result.y + layout_result.x] = alpha * dot_prod + ((beta != 0) ? beta * result[row * layout_result.y + layout_result.x] : 0); \n");
+  else
+    source.append("      result[row * layout_result.y + layout_result.x] = dot_prod; \n");
+  source.append("    } \n");
+  source.append("  } \n");
+
+      // use multiple threads for the summation
+  source.append("  else if (rows_to_process > 1) \n"); // CSR stream with local reduction
+  source.append("  {\n");
+      // load to shared buffer:
+  source.append("    for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0))\n");
+  source.append("      shared_elements[i - element_start] = elements[i] * x[column_indices[i] * layout_x.y + layout_x.x];\n");
+
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+    // sum each row separately using a reduction:
+  source.append("    for (unsigned int row = row_start; row < row_stop; ++row)\n");
+  source.append("    {\n");
+  source.append("      unsigned int current_row_start = row_indices[row]     - element_start;\n");
+  source.append("      unsigned int current_row_stop  = row_indices[row + 1] - element_start;\n");
+  source.append("      unsigned int thread_base_id  = current_row_start + get_local_id(0);\n");
+
+      // sum whatever exceeds the current buffer:
+  source.append("      for (unsigned int j = thread_base_id + get_local_size(0); j < current_row_stop; j += get_local_size(0))\n");
+  source.append("        shared_elements[thread_base_id] += shared_elements[j];\n");
+
+      // reduction
+  source.append("      for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n");
+  source.append("      {\n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE);\n");
+  source.append("        if (get_local_id(0) < stride && thread_base_id < current_row_stop)\n");
+  source.append("          shared_elements[thread_base_id] += (thread_base_id + stride < current_row_stop) ? shared_elements[thread_base_id+stride] : 0;\n");
+  source.append("      }\n");
+  source.append("      "); source.append(numeric_string); source.append(" row_result = 0; \n");
+  source.append("      if (current_row_stop > current_row_start)\n");
+  source.append("        row_result = shared_elements[current_row_start]; \n");
+  source.append("      if (get_local_id(0) == 0)\n");
+  if (with_alpha_beta)
+    source.append("        result[row * layout_result.y + layout_result.x] = alpha * row_result + ((beta != 0) ? beta * result[row * layout_result.y + layout_result.x] : 0);\n");
+  else
+    source.append("        result[row * layout_result.y + layout_result.x] = row_result;\n");
+  source.append("    }\n");
+  source.append("  }\n");
+
+
+  source.append("  else  \n"); // CSR vector for a single row
+  source.append("  { \n");
+      // load and sum to shared buffer:
+  source.append("    shared_elements[get_local_id(0)] = 0; \n");
+  source.append("    for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
+  source.append("      shared_elements[get_local_id(0)] += elements[i] * x[column_indices[i] * layout_x.y + layout_x.x]; \n");
+
+      // reduction to obtain final result
+  source.append("    for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n");
+  source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("      if (get_local_id(0) < stride) \n");
+  source.append("        shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) + stride]; \n");
+  source.append("    } \n");
+
+  source.append("    if (get_local_id(0) == 0) \n");
+  if (with_alpha_beta)
+    source.append("      result[row_start * layout_result.y + layout_result.x] = alpha * shared_elements[0] + ((beta != 0) ? beta * result[row_start * layout_result.y + layout_result.x] : 0); \n");
+  else
+    source.append("      result[row_start * layout_result.y + layout_result.x] = shared_elements[0]; \n");
+  source.append("  } \n");
+
+  source.append("} \n");
+
+}
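+
+// A note on the kernel generated above (the CSR-adaptive scheme): the host-side
+// row_blocks array is expected to group consecutive rows so that their nonzeros fit
+// into the 1024-entry shared buffer, and each work group then picks one of three paths:
+//   - more than 4 rows in the block: CSR stream, one work item accumulates each row;
+//   - 2 to 4 rows: CSR stream with a work-group reduction per row;
+//   - a single (heavy) row: CSR vector, the entire work group reduces that one row.
+// Illustrative example (values assumed): row_blocks = {0, 60, 61} assigns rows 0..59
+// to work group 0 (CSR stream) and the single dense row 60 to work group 1 (CSR vector).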
+
+
+template<typename StringT>
+void generate_compressed_matrix_vec_mul4(StringT & source, std::string const & numeric_string, bool with_alpha_beta)
+{
+  if (with_alpha_beta)
+    source.append("__kernel void vec_mul4_alpha_beta( \n");
+  else
+    source.append("__kernel void vec_mul4( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const uint4 * column_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append("4 * elements, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * x, \n");
+  source.append("  uint4 layout_x, \n");
+  if (with_alpha_beta) { source.append("  "); source.append(numeric_string); source.append(" alpha, \n"); }
+  source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
+  source.append("  uint4 layout_result \n");
+  if (with_alpha_beta) { source.append("  , "); source.append(numeric_string); source.append(" beta \n"); }
+  source.append(") { \n");
+  source.append("  "); source.append(numeric_string); source.append(" dot_prod; \n");
+  source.append("  unsigned int start, next_stop; \n");
+  source.append("  uint4 col_idx; \n");
+  source.append("  "); source.append(numeric_string); source.append("4 tmp_vec; \n");
+  source.append("  "); source.append(numeric_string); source.append("4 tmp_entries; \n");
+
+  source.append("  for (unsigned int row = get_global_id(0); row < layout_result.z; row += get_global_size(0)) \n");
+  source.append("  { \n");
+  source.append("    dot_prod = 0; \n");
+  source.append("    start = row_indices[row] / 4; \n");
+  source.append("    next_stop = row_indices[row+1] / 4; \n");
+
+  source.append("    for (unsigned int i = start; i < next_stop; ++i) \n");
+  source.append("    { \n");
+  source.append("      col_idx = column_indices[i]; \n");
+
+  source.append("      tmp_entries = elements[i]; \n");
+  source.append("      tmp_vec.x = x[col_idx.x * layout_x.y + layout_x.x]; \n");
+  source.append("      tmp_vec.y = x[col_idx.y * layout_x.y + layout_x.x]; \n");
+  source.append("      tmp_vec.z = x[col_idx.z * layout_x.y + layout_x.x]; \n");
+  source.append("      tmp_vec.w = x[col_idx.w * layout_x.y + layout_x.x]; \n");
+
+  source.append("      dot_prod += dot(tmp_entries, tmp_vec); \n");
+  source.append("    } \n");
+  if (with_alpha_beta)
+    source.append("    result[row * layout_result.y + layout_result.x] = alpha * dot_prod + ((beta != 0) ? beta * result[row * layout_result.y + layout_result.x] : 0); \n");
+  else
+    source.append("    result[row * layout_result.y + layout_result.x] = dot_prod; \n");
+  source.append("  } \n");
+  source.append("} \n");
+}
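+
+// For NumericT == float and with_alpha_beta == false, the generator above emits a
+// kernel of the form (whitespace condensed):
+//
+//   __kernel void vec_mul4(__global const unsigned int * row_indices,
+//                          __global const uint4 * column_indices,
+//                          __global const float4 * elements,
+//                          __global const float * x,  uint4 layout_x,
+//                          __global float * result,   uint4 layout_result) { ... }
+//
+// Since row offsets are divided by 4, the CSR arrays are assumed to be padded so that
+// every row offset is a multiple of 4 (e.g. with zero entries, which leave dot_prod unchanged).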
+
+template<typename StringT>
+void generate_compressed_matrix_vec_mul8(StringT & source, std::string const & numeric_string, bool with_alpha_beta)
+{
+  if (with_alpha_beta)
+    source.append("__kernel void vec_mul8_alpha_beta( \n");
+  else
+    source.append("__kernel void vec_mul8( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const uint8 * column_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append("8 * elements, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * x, \n");
+  source.append("  uint4 layout_x, \n");
+  if (with_alpha_beta) { source.append("  "); source.append(numeric_string); source.append(" alpha, \n"); }
+  source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
+  source.append("  uint4 layout_result \n");
+  if (with_alpha_beta) { source.append(" , "); source.append(numeric_string); source.append(" beta \n"); }
+  source.append(") { \n");
+  source.append("  "); source.append(numeric_string); source.append(" dot_prod; \n");
+  source.append("  unsigned int start, next_stop; \n");
+  source.append("  uint8 col_idx; \n");
+  source.append("  "); source.append(numeric_string); source.append("8 tmp_vec; \n");
+  source.append("  "); source.append(numeric_string); source.append("8 tmp_entries; \n");
+
+  source.append("  for (unsigned int row = get_global_id(0); row < layout_result.z; row += get_global_size(0)) \n");
+  source.append("  { \n");
+  source.append("    dot_prod = 0; \n");
+  source.append("    start = row_indices[row] / 8; \n");
+  source.append("    next_stop = row_indices[row+1] / 8; \n");
+
+  source.append("    for (unsigned int i = start; i < next_stop; ++i) \n");
+  source.append("    { \n");
+  source.append("      col_idx = column_indices[i]; \n");
+
+  source.append("      tmp_entries = elements[i]; \n");
+  source.append("      tmp_vec.s0 = x[col_idx.s0 * layout_x.y + layout_x.x]; \n");
+  source.append("      tmp_vec.s1 = x[col_idx.s1 * layout_x.y + layout_x.x]; \n");
+  source.append("      tmp_vec.s2 = x[col_idx.s2 * layout_x.y + layout_x.x]; \n");
+  source.append("      tmp_vec.s3 = x[col_idx.s3 * layout_x.y + layout_x.x]; \n");
+  source.append("      tmp_vec.s4 = x[col_idx.s4 * layout_x.y + layout_x.x]; \n");
+  source.append("      tmp_vec.s5 = x[col_idx.s5 * layout_x.y + layout_x.x]; \n");
+  source.append("      tmp_vec.s6 = x[col_idx.s6 * layout_x.y + layout_x.x]; \n");
+  source.append("      tmp_vec.s7 = x[col_idx.s7 * layout_x.y + layout_x.x]; \n");
+
+  source.append("      dot_prod += dot(tmp_entries.lo, tmp_vec.lo); \n");
+  source.append("      dot_prod += dot(tmp_entries.hi, tmp_vec.hi); \n");
+  source.append("    } \n");
+  if (with_alpha_beta)
+    source.append("    result[row * layout_result.y + layout_result.x] = alpha * dot_prod + ((beta != 0) ? beta * result[row * layout_result.y + layout_result.x] : 0); \n");
+  else
+    source.append("    result[row * layout_result.y + layout_result.x] = dot_prod; \n");
+  source.append("  } \n");
+  source.append("} \n");
+}
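+
+// vec_mul8 follows the same padded-CSR convention with blocks of 8 entries. Because
+// OpenCL's dot() builtin is only defined for vector widths up to 4, the 8-wide
+// accumulation is split into dot(lo, lo) + dot(hi, hi) above.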
+
+template<typename StringT>
+void generate_compressed_matrix_vec_mul_cpu(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void vec_mul_cpu( \n");
+  source.append("  __global const unsigned int * row_indices, \n");
+  source.append("  __global const unsigned int * column_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * vector, \n");
+  source.append("  "); source.append(numeric_string); source.append(" alpha, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * result, \n");
+  source.append("  "); source.append(numeric_string); source.append(" beta, \n");
+  source.append("  unsigned int size) \n");
+  source.append("{ \n");
+  source.append("  unsigned int work_per_item = max((uint) (size / get_global_size(0)), (uint) 1); \n");
+  source.append("  unsigned int row_start = get_global_id(0) * work_per_item; \n");
+  source.append("  unsigned int row_stop  = min( (uint) ((get_global_id(0) + 1) * work_per_item), (uint) size); \n");
+  source.append("  for (unsigned int row = row_start; row < row_stop; ++row) \n");
+  source.append("  { \n");
+  source.append("    "); source.append(numeric_string); source.append(" dot_prod = ("); source.append(numeric_string); source.append(")0; \n");
+  source.append("    unsigned int row_end = row_indices[row+1]; \n");
+  source.append("    for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
+  source.append("      dot_prod += elements[i] * vector[column_indices[i]]; \n");
+  source.append("    result[row] = alpha * dot_prod + ((beta != 0) ? beta * result[row] : 0); \n");
+  source.append("  } \n");
+  source.append("} \n");
+
+}
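+
+// Work partitioning example for vec_mul_cpu (illustrative numbers): size = 1000 rows
+// and get_global_size(0) = 128 give work_per_item = (999 / 128) + 1 = 8, so work item
+// 0 handles rows 0..7, item 124 handles rows 992..999, and items 125..127 find
+// row_start >= row_stop and fall through immediately.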
+
+
+
+/** @brief OpenCL kernel for the first stage of sparse matrix-matrix multiplication.
+  *
+  * Each work item determines the maximum number of nonzeros per row within its assigned rows of A; a work-group max-reduction then yields one value per group.
+  **/
+template<typename StringT>
+void generate_compressed_matrix_compressed_matrix_prod_1(StringT & source)
+{
+  source.append("__kernel void spgemm_stage1( \n");
+  source.append("  __global const unsigned int * A_row_indices, \n");
+  source.append("  __global const unsigned int * A_column_indices, \n");
+  source.append("  unsigned int A_size1, \n");
+  source.append("  __global unsigned int * group_nnz_array) \n");
+  source.append("{ \n");
+  source.append("  unsigned int work_per_item = max((uint) ((A_size1 - 1) / get_global_size(0) + 1), (uint) 1); \n");
+  source.append("  unsigned int row_start = get_global_id(0) * work_per_item; \n");
+  source.append("  unsigned int row_stop  = min( (uint) ((get_global_id(0) + 1) * work_per_item), (uint) A_size1); \n");
+  source.append("  unsigned int max_A_nnz = 0; \n");
+  source.append("  for (unsigned int row = row_start; row < row_stop; ++row) \n");
+  source.append("    max_A_nnz = max(max_A_nnz, A_row_indices[row + 1] - A_row_indices[row]); \n");
+
+    // store each work item's maximum in the shared buffer:
+  source.append("  __local unsigned int shared_nnz[256]; \n");
+  source.append("  shared_nnz[get_local_id(0)] = max_A_nnz; \n");
+
+    // max-reduction to obtain the final result per work group
+  source.append("  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n");
+  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("    if (get_local_id(0) < stride) \n");
+  source.append("      shared_nnz[get_local_id(0)] = max(shared_nnz[get_local_id(0)], shared_nnz[get_local_id(0) + stride]); \n");
+  source.append("  } \n");
+
+  source.append("  if (get_local_id(0) == 0) \n");
+  source.append("    group_nnz_array[get_group_id(0)] = shared_nnz[0]; \n");
+  source.append("} \n");
+}
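+
+// Each work group contributes a single value, e.g. group_nnz_array = {5, 17, 9} for
+// three groups implies a maximum row length of 17 after a final max-reduction on the
+// host; this maximum presumably steers the choice between the direct spgemm path and
+// the RMerge-style splitting implemented by the kernels below.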
+
+
+/** @brief OpenCL kernel for decomposing the factor A in C = A * B such that A = A_2 * G_1, with G_1 containing at most 32 nonzeros per row
+  *
+  * Needed for the RMerge split stage.
+  **/
+template<typename StringT>
+void generate_compressed_matrix_compressed_matrix_prod_decompose_1(StringT & source)
+{
+  source.append("__kernel void spgemm_decompose_1( \n");
+  source.append("  __global const unsigned int * A_row_indices, \n");
+  source.append("  unsigned int A_size1, \n");
+  source.append("  unsigned int max_per_row, \n");
+  source.append("  __global unsigned int * chunks_per_row) \n");
+  source.append("{ \n");
+  source.append("  for (unsigned int row = get_global_id(0); row < A_size1; row += get_global_size(0)) {\n");
+  source.append("    unsigned int num_entries = A_row_indices[row+1] - A_row_indices[row]; \n");
+  source.append("    chunks_per_row[row] = (num_entries < max_per_row) ? 1 : ((num_entries - 1) / max_per_row + 1); \n");
+  source.append("  } \n");
+  source.append("} \n");
+}
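+
+// Worked example: a row of A with num_entries = 70 and max_per_row = 32 gives
+// chunks_per_row = (70 - 1) / 32 + 1 = 3, i.e. the row is split into chunks of
+// 32, 32, and 6 nonzeros, while a row with 20 entries remains a single chunk.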
+
+
+/** @brief OpenCL kernel for filling A_2 in the decomposition A = A_2 * G_1, with G_1 containing at most 32 nonzeros per row
+  *
+  * Needed for the RMerge split stage.
+  **/
+template<typename StringT>
+void generate_compressed_matrix_compressed_matrix_prod_A2(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void spgemm_A2( \n");
+  source.append("  __global unsigned int *A2_row_indices, \n");
+  source.append("  __global unsigned int *A2_col_indices, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" *A2_elements, \n");
+  source.append("  unsigned int A2_size1, \n");
+  source.append("  __global const unsigned int *new_row_buffer) \n");
+  source.append("{ \n");
+  source.append("  for (unsigned int i = get_global_id(0); i < A2_size1; i += get_global_size(0)) {\n");
+  source.append("    unsigned int index_start = new_row_buffer[i]; \n");
+  source.append("    unsigned int index_stop  = new_row_buffer[i+1]; \n");
+
+  source.append("    A2_row_indices[i] = index_start; \n");
+
+  source.append("    for (unsigned int j = index_start; j < index_stop; ++j) { \n");
+  source.append("      A2_col_indices[j] = j; \n");
+  source.append("      A2_elements[j] = 1; \n");
+  source.append("    } \n");
+  source.append("  } \n");
+
+  source.append("  if (get_global_id(0) == 0) \n");
+  source.append("    A2_row_indices[A2_size1] = new_row_buffer[A2_size1]; \n");
+  source.append("} \n");
+}
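+
+// A2 acts as a selector matrix: row i carries ones exactly in columns
+// new_row_buffer[i] .. new_row_buffer[i+1]-1 (one per chunk of the original row i),
+// so the product A2 * G1 re-assembles A. A row split into 3 chunks therefore maps
+// to a row of A2 with exactly 3 unit entries.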
+
+/** @brief OpenCL kernel for filling G_1 in the decomposition A = A_2 * G_1, with G_1 containing at most 32 nonzeros per row
+  *
+  * Needed for the RMerge split stage.
+  **/
+template<typename StringT>
+void generate_compressed_matrix_compressed_matrix_prod_G1(StringT & source, std::string const & numeric_string)
+{
+  source.append("__kernel void spgemm_G1( \n");
+  source.append("  __global unsigned int *G1_row_indices, \n");
+  source.append("  __global unsigned int *G1_col_indices, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" *G1_elements, \n");
+  source.append("  unsigned int G1_size1, \n");
+  source.append("  __global const unsigned int *A_row_indices, \n");
+  source.append("  __global const unsigned int *A_col_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" *A_elements, \n");
+  source.append("  unsigned int A_size1, \n");
+  source.append("  unsigned int A_nnz, \n");
+  source.append("  unsigned int max_per_row, \n");
+  source.append("  __global const unsigned int *new_row_buffer) \n");
+  source.append("{ \n");
+
+  // Part 1: Copy column indices and entries:
+  source.append("  for (unsigned int i = get_global_id(0); i < A_nnz; i += get_global_size(0)) {\n");
+  source.append("    G1_col_indices[i] = A_col_indices[i]; \n");
+  source.append("    G1_elements[i]    = A_elements[i]; \n");
+  source.append("  } \n");
+
+  // Part 2: Derive new row indices:
+  source.append("  for (unsigned int i = get_global_id(0); i < A_size1; i += get_global_size(0)) {\n");
+  source.append("    unsigned int old_start = A_row_indices[i]; \n");
+  source.append("    unsigned int new_start = new_row_buffer[i]; \n");
+  source.append("    unsigned int row_chunks = new_row_buffer[i+1] - new_start; \n");
+
+  source.append("    for (unsigned int j=0; j<row_chunks; ++j) \n");
+  source.append("      G1_row_indices[new_start + j] = old_start + j * max_per_row; \n");
+  source.append("  } \n");
+
+  // write last entry in row_buffer with thread 0:
+  source.append("  if (get_global_id(0) == 0) \n");
+  source.append("    G1_row_indices[G1_size1] = A_row_indices[A_size1]; \n");
+  source.append("} \n");
+}
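+
+// Worked example for the row-index fix-up above: if row i of A starts at
+// old_start = 100 with 70 nonzeros, max_per_row = 32, and new_start = 40, the loop
+// writes G1_row_indices[40..42] = {100, 132, 164}, i.e. the row becomes three G1
+// rows of 32, 32, and 6 entries.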
+
+
+
+/** @brief OpenCL kernel for the second stage of sparse matrix-matrix multiplication.
+  *
+  * Computes the exact sparsity pattern of A*B.
+  * The result array C_row_indices contains the number of nonzeros in each row.
+  **/
+template<typename StringT>
+void generate_compressed_matrix_compressed_matrix_prod_2(StringT & source)
+{
+  source.append("__attribute__((reqd_work_group_size(32, 1, 1))) \n");
+  source.append("__kernel void spgemm_stage2( \n");
+  source.append("  __global const unsigned int * A_row_indices, \n");
+  source.append("  __global const unsigned int * A_col_indices, \n");
+  source.append("  unsigned int A_size1, \n");
+  source.append("  __global const unsigned int * B_row_indices, \n");
+  source.append("  __global const unsigned int * B_col_indices, \n");
+  source.append("  unsigned int B_size2, \n");
+  source.append("  __global unsigned int * C_row_indices) \n");
+  source.append("{ \n");
+  source.append("  unsigned int work_per_group = max((uint) ((A_size1 - 1) / get_num_groups(0) + 1), (uint) 1); \n");
+  source.append("  unsigned int row_C_start = get_group_id(0) * work_per_group; \n");
+  source.append("  unsigned int row_C_stop  = min( (uint) ((get_group_id(0) + 1) * work_per_group), (uint) A_size1); \n");
+  source.append("  __local unsigned int shared_front[32]; \n");
+
+  source.append("  for (unsigned int row_C = row_C_start; row_C < row_C_stop; ++row_C) \n");
+  source.append("  { \n");
+  source.append("    unsigned int row_A_start = A_row_indices[row_C]; \n");
+  source.append("    unsigned int row_A_end   = A_row_indices[row_C+1]; \n");
+
+  source.append("    unsigned int my_row_B = row_A_start + get_local_id(0); \n");
+  source.append("    unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B] : 0; \n");
+  source.append("    unsigned int row_B_start = (my_row_B < row_A_end) ? B_row_indices[row_B_index] : 0; \n");
+  source.append("    unsigned int row_B_end   = (my_row_B < row_A_end) ? B_row_indices[row_B_index + 1] : 0; \n");
+
+  source.append("    unsigned int num_nnz = 0; \n");
+  source.append("    if (row_A_end - row_A_start > 1) { \n"); // zero or no row can be processed faster
+
+  source.append("      unsigned int current_front_index = (row_B_start < row_B_end) ? B_col_indices[row_B_start] : B_size2; \n");
+  source.append("      while (1) { \n");
+
+  // determine minimum index via reduction:
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        shared_front[get_local_id(0)] = current_front_index; \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (get_local_id(0) < 16) shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 16]); \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (get_local_id(0) < 8)  shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 8]); \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (get_local_id(0) < 4)  shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 4]); \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (get_local_id(0) < 2)  shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 2]); \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (get_local_id(0) < 1)  shared_front[get_local_id(0)] = min(shared_front[get_local_id(0)], shared_front[get_local_id(0) + 1]); \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  source.append("        if (shared_front[0] == B_size2) break; \n");
+
+  // update front:
+  source.append("        if (current_front_index == shared_front[0]) { \n");
+  source.append("          ++row_B_start; \n");
+  source.append("          current_front_index = (row_B_start < row_B_end) ? B_col_indices[row_B_start] : B_size2; \n");
+  source.append("        } \n");
+
+  source.append("        ++num_nnz;  \n");
+  source.append("      }  \n");
+  source.append("    } else { num_nnz = row_B_end - row_B_start; }\n");
+
+  // write number of entries found:
+  source.append("    if (get_local_id(0) == 0) \n");
+  source.append("      C_row_indices[row_C] = num_nnz; \n");
+
+  source.append("  } \n");
+
+  source.append("} \n");
+
+}
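+
+// Merge illustration for spgemm_stage2 (values assumed): with three active B-row
+// fronts at column indices {3, 7, 3} and a fourth, exhausted front at B_size2, the
+// min-reduction yields 3; the two work items sitting on column 3 advance their
+// fronts and num_nnz is incremented once. Each while-iteration thus retires exactly
+// one distinct output column, so num_nnz ends up as the row length of C.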
+
+
+/** @brief OpenCL kernel for the third stage of sparse matrix-matrix multiplication.
+  *
+  * Computes A*B into C with known sparsity pattern (obtained from stages 1 and 2).
+  **/
+template<typename StringT>
+void generate_compressed_matrix_compressed_matrix_prod_3(StringT & source, std::string const & numeric_string)
+{
+  source.append("__attribute__((reqd_work_group_size(32, 1, 1))) \n");
+  source.append("__kernel void spgemm_stage3( \n");
+  source.append("  __global const unsigned int * A_row_indices, \n");
+  source.append("  __global const unsigned int * A_col_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * A_elements, \n");
+  source.append("  unsigned int A_size1, \n");
+  source.append("  __global const unsigned int * B_row_indices, \n");
+  source.append("  __global const unsigned int * B_col_indices, \n");
+  source.append("  __global const "); source.append(numeric_string); source.append(" * B_elements, \n");
+  source.append("  unsigned int B_size2, \n");
+  source.append("  __global unsigned int * C_row_indices, \n");
+  source.append("  __global unsigned int * C_col_indices, \n");
+  source.append("  __global "); source.append(numeric_string); source.append(" * C_elements) \n");
+  source.append("{ \n");
+  source.append("  unsigned int work_per_group = max((uint) ((A_size1 - 1) / get_num_groups(0) + 1), (uint) 1); \n");
+  source.append("  unsigned int row_C_start = get_group_id(0) * work_per_group; \n");
+  source.append("  unsigned int row_C_stop  = min( (uint) ((get_group_id(0) + 1) * work_per_group), (uint) A_size1); \n");
+  source.append("  __local unsigned int shared_front[32]; \n");
+  source.append("  __local "); source.append(numeric_string); source.append(" shared_front_values[32]; \n");
+  source.append("  unsigned int local_id = get_local_id(0); \n");
+
+  source.append("  for (unsigned int row_C = row_C_start; row_C < row_C_stop; ++row_C) \n");
+  source.append("  { \n");
+  source.append("    unsigned int row_A_start = A_row_indices[row_C]; \n");
+  source.append("    unsigned int row_A_end   = A_row_indices[row_C+1]; \n");
+
+  source.append("    unsigned int my_row_B = row_A_start + ((row_A_end - row_A_start > 1) ? local_id : 0); \n"); // single row is a special case
+  source.append("    unsigned int row_B_index = (my_row_B < row_A_end) ? A_col_indices[my_row_B]        : 0; \n");
+  source.append("    unsigned int row_B_start = (my_row_B < row_A_end) ? B_row_indices[row_B_index]     : 0; \n");
+  source.append("    unsigned int row_B_end   = (my_row_B < row_A_end) ? B_row_indices[row_B_index + 1] : 0; \n");
+
+  source.append("    "); source.append(numeric_string); source.append(" val_A = (my_row_B < row_A_end) ? A_elements[my_row_B] : 0; \n");
+  source.append("    unsigned int index_in_C = C_row_indices[row_C] + local_id; \n");
+
+  source.append("    if (row_A_end - row_A_start > 1) { \n"); // zero or no row can be processed faster
+
+  source.append("      unsigned int current_front_index = (row_B_start < row_B_end) ? B_col_indices[row_B_start] : B_size2; \n");
+  source.append("      "); source.append(numeric_string); source.append(" current_front_value = (row_B_start < row_B_end) ? B_elements[row_B_start]    : 0; \n");
+
+  source.append("      unsigned int index_buffer = 0; \n");
+  source.append("      "); source.append(numeric_string); source.append(" value_buffer = 0; \n");
+  source.append("      unsigned int buffer_size = 0; \n");
+
+  source.append("      while (1) { \n");
+
+  // determine minimum index via reduction:
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        shared_front[local_id] = current_front_index; \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (local_id < 16) shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 16]); \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (local_id < 8)  shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 8]); \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (local_id < 4)  shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 4]); \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (local_id < 2)  shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 2]); \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (local_id < 1)  shared_front[local_id] = min(shared_front[local_id], shared_front[local_id + 1]); \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  source.append("        if (shared_front[0] == B_size2) break; \n");
+
+  // compute output value via reduction:
+  source.append("        shared_front_values[local_id] = (current_front_index == shared_front[0]) ? val_A * current_front_value : 0; \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (local_id < 16) shared_front_values[local_id] += shared_front_values[local_id + 16]; \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (local_id < 8)  shared_front_values[local_id] += shared_front_values[local_id + 8]; \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (local_id < 4)  shared_front_values[local_id] += shared_front_values[local_id + 4]; \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (local_id < 2)  shared_front_values[local_id] += shared_front_values[local_id + 2]; \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+  source.append("        if (local_id < 1)  shared_front_values[local_id] += shared_front_values[local_id + 1]; \n");
+  source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+  // update front:
+  source.append("        if (current_front_index == shared_front[0]) { \n");
+  source.append("          ++row_B_start; \n");
+  source.append("          current_front_index = (row_B_start < row_B_end) ? B_col_indices[row_B_start] : B_size2; \n");
+  source.append("          current_front_value = (row_B_start < row_B_end) ? B_elements[row_B_start]    : 0; \n");
+  source.append("        } \n");
+
+  // write current front to register buffer:
+  source.append("        index_buffer = (local_id == buffer_size) ? shared_front[0]        : index_buffer;  \n");
+  source.append("        value_buffer = (local_id == buffer_size) ? shared_front_values[0] : value_buffer;  \n");
+  source.append("        ++buffer_size;  \n");
+
+  // flush register buffer via a coalesced write once full:
+  source.append("        if (buffer_size == get_local_size(0)) {  \n");
+  source.append("          C_col_indices[index_in_C] = index_buffer; \n");
+  source.append("          C_elements[index_in_C]    = value_buffer; \n");
+  source.append("        } \n");
+
+  // the following should be in the previous if-conditional, but a bug in NVIDIA drivers 34x.yz requires this slight rewrite
+  source.append("          index_in_C += (buffer_size == get_local_size(0)) ? get_local_size(0) : 0; \n");
+  source.append("          buffer_size = (buffer_size == get_local_size(0)) ?                0  : buffer_size; \n");
+
+  source.append("      }  \n");
+
+  // write remaining entries in register buffer:
+  source.append("      if (local_id < buffer_size) {  \n");
+  source.append("        C_col_indices[index_in_C] = index_buffer; \n");
+  source.append("        C_elements[index_in_C]    = value_buffer; \n");
+  source.append("      } \n");
+
+  // copy to C in coalesced manner:
+  source.append("    } else { \n");
+  source.append("      for (unsigned int i = row_B_start + local_id; i < row_B_end; i += get_local_size(0)) { \n");
+  source.append("        C_col_indices[index_in_C] = B_col_indices[i]; \n");
+  source.append("        C_elements[index_in_C]    = val_A * B_elements[i]; \n");
+  source.append("        index_in_C += get_local_size(0); \n");
+  source.append("      } \n");
+  source.append("    } \n");
+
+  source.append("  } \n");
+
+  source.append("} \n");
+
+}
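+
+// The register buffer in spgemm_stage3 works as follows: every merge step produces
+// one (column, value) pair in shared_front[0] / shared_front_values[0], and the work
+// item whose local_id equals the current buffer_size keeps that pair in registers.
+// After 32 steps (the required work group size) each item holds one pair and a
+// single coalesced write flushes the buffer to C_col_indices / C_elements; any
+// partially filled buffer is written out after the merge loop terminates.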
+
+
+template<typename StringT>
+void generate_compressed_matrix_compressed_matrix_prod(StringT & source, std::string const & numeric_string)
+{
+  generate_compressed_matrix_compressed_matrix_prod_1(source);
+  generate_compressed_matrix_compressed_matrix_prod_decompose_1(source);
+  generate_compressed_matrix_compressed_matrix_prod_A2(source, numeric_string);
+  generate_compressed_matrix_compressed_matrix_prod_G1(source, numeric_string);
+  generate_compressed_matrix_compressed_matrix_prod_2(source);
+  generate_compressed_matrix_compressed_matrix_prod_3(source, numeric_string);
+}
+
+template<typename StringT>
+void generate_compressed_matrix_assign_to_dense(StringT & source, std::string const & numeric_string)
+{
+
+ source.append(" __kernel void assign_to_dense( \n");
+ source.append("  __global const unsigned int * row_indices, \n");
+ source.append("  __global const unsigned int * column_indices, \n");
+ source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
+ source.append("  __global "); source.append(numeric_string); source.append(" * B, \n");
+ source.append("  unsigned int B_row_start, \n");
+ source.append("  unsigned int B_col_start, \n");
+ source.append("  unsigned int B_row_inc, \n");
+ source.append("  unsigned int B_col_inc, \n");
+ source.append("  unsigned int B_row_size, \n");
+ source.append("  unsigned int B_col_size, \n");
+ source.append("  unsigned int B_internal_rows, \n");
+ source.append("  unsigned int B_internal_cols) { \n");
+
+ source.append("   for (unsigned int i = get_global_id(0); i < B_row_size; i += get_global_size(0)) \n");
+ source.append("   { \n");
+ source.append("     unsigned int row_end = row_indices[i+1]; \n");
+ source.append("     for (unsigned int j = row_indices[i]; j<row_end; j++) \n");
+ source.append("     { \n");
+ source.append("       B[(B_row_start + i * B_row_inc) * B_internal_cols + B_col_start + column_indices[j] * B_col_inc] = elements[j]; \n");
+ source.append("     } \n");
+ source.append("   } \n");
+ source.append("  } \n");
+
+}
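+
+// assign_to_dense assumes a row-major dense matrix B with B_internal_rows x
+// B_internal_cols allocated entries (internal padding), while B_row_start/B_col_start
+// and B_row_inc/B_col_inc address a sub-matrix range; positions not present in the
+// sparsity pattern are left untouched rather than being zeroed.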
+
+
+//////////////////////////// Part 2: Main kernel class ////////////////////////////////////
+
+/** @brief Main kernel class for generating OpenCL kernels for compressed_matrix (except solvers). */
+template<typename NumericT>
+struct compressed_matrix
+{
+  static std::string program_name()
+  {
+    return viennacl::ocl::type_to_string<NumericT>::apply() + "_compressed_matrix";
+  }
+
+  static void init(viennacl::ocl::context & ctx)
+  {
+    static std::map<cl_context, bool> init_done;
+    if (!init_done[ctx.handle().get()])
+    {
+      viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
+      std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
+
+      std::string source;
+      source.reserve(1024);
+
+      viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
+
+      if (numeric_string == "float" || numeric_string == "double")
+      {
+        generate_compressed_matrix_jacobi(source, numeric_string);
+      }
+      generate_compressed_matrix_dense_matrix_multiplication(source, numeric_string);
+      generate_compressed_matrix_row_info_extractor(source, numeric_string);
+      if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id)
+      {
+        generate_compressed_matrix_vec_mul_nvidia(source, numeric_string, 16, true);
+        generate_compressed_matrix_vec_mul_nvidia(source, numeric_string, 16, false);
+      }
+      gene

<TRUNCATED>