You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ap...@apache.org on 2016/06/08 21:40:09 UTC
[12/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/bisect.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/bisect.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/bisect.hpp
new file mode 100644
index 0000000..64c12b0
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/bisect.hpp
@@ -0,0 +1,2645 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_BISECT_HPP_
+#define VIENNACL_LINALG_OPENCL_KERNELS_BISECT_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/opencl/kernels/bisect.hpp
+    @brief OpenCL kernels for the bisection algorithm for eigenvalues
+
+    Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+    the creation of derivative works is allowed by including the following statement:
+    "This software contains source code provided by NVIDIA Corporation."
+*/
+
+
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+#include "viennacl/linalg/opencl/common.hpp"
+
+// declaration, forward
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+  template <typename StringType>
+  void generate_bisect_kernel_config(StringType & source)
+  {
+    /* Global configuration parameter */
+    source.append("     #define  VIENNACL_BISECT_MAX_THREADS_BLOCK                256\n");
+    source.append("     #define  VIENNACL_BISECT_MAX_SMALL_MATRIX                 256\n");
+    source.append("     #define  VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX   256\n");
+    source.append("     #define  VIENNACL_BISECT_MIN_ABS_INTERVAL                 5.0e-37\n");
+
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Compute the next lower power of two of n
+  // n    number for which next higher power of two is seeked
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template <typename StringType>
+  void generate_bisect_kernel_floorPow2(StringType & source, std::string const & numeric_string)
+  {
+  source.append("       \n");
+  source.append("     inline int  \n");
+  source.append("     floorPow2(int n)  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0);  \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+
+
+      // early out if already power of two
+  source.append("         if (0 == (n & (n-1)))  \n");
+  source.append("         {  \n");
+  source.append("             return n;  \n");
+  source.append("         }  \n");
+
+  source.append("         int exp;  \n");
+  source.append("         frexp(( "); source.append(numeric_string); source.append(" )n, &exp);  \n");
+  source.append("         return (1 << (exp - 1));  \n");
+  source.append("     }  \n");
+
+  }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Compute the next higher power of two of n
+  // n  number for which next higher power of two is seeked
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template <typename StringType>
+  void generate_bisect_kernel_ceilPow2(StringType & source, std::string const & numeric_string)
+  {
+  source.append("       \n");
+  source.append("     inline int  \n");
+  source.append("     ceilPow2(int n)  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0); \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+
+
+      // early out if already power of two
+  source.append("         if (0 == (n & (n-1)))  \n");
+  source.append("         {  \n");
+  source.append("             return n;  \n");
+  source.append("         }  \n");
+
+  source.append("         int exp;  \n");
+  source.append("         frexp(( "); source.append(numeric_string); source.append(" )n, &exp);  \n");
+  source.append("         return (1 << exp);  \n");
+  source.append("     }  \n");
+  }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Compute midpoint of interval [\a left, \a right] avoiding overflow if possible
+  //
+  // left     left  / lower limit of interval
+  // right    right / upper limit of interval
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template <typename StringType>
+  void generate_bisect_kernel_computeMidpoint(StringType & source, std::string const & numeric_string)
+  {
+  source.append("       \n");
+  source.append("     inline "); source.append(numeric_string); source.append(" \n");
+  source.append("     computeMidpoint(const "); source.append(numeric_string); source.append(" left,\n");
+  source.append("       const "); source.append(numeric_string); source.append("  right)  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0); \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+  source.append("          "); source.append(numeric_string); source.append("  mid;  \n");
+
+  source.append("         if (sign(left) == sign(right))  \n");
+  source.append("         {  \n");
+  source.append("             mid = left + (right - left) * 0.5f;  \n");
+  source.append("         }  \n");
+  source.append("         else  \n");
+  source.append("         {  \n");
+  source.append("             mid = (left + right) * 0.5f;  \n");
+  source.append("         }  \n");
+
+  source.append("         return mid;  \n");
+  source.append("     }  \n");
+
+  }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Check if interval converged and store appropriately
+  //
+  // addr           address where to store the information of the interval
+  // s_left         shared memory storage for left interval limits
+  // s_right        shared memory storage for right interval limits
+  // s_left_count   shared memory storage for number of eigenvalues less than left interval limits
+  // s_right_count  shared memory storage for number of eigenvalues less than right interval limits
+  // left           lower limit of interval
+  // right          upper limit of interval
+  // left_count     eigenvalues less than \a left
+  // right_count    eigenvalues less than \a right
+  // precision      desired precision for eigenvalues
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename StringType>
+  void generate_bisect_kernel_storeInterval(StringType & source, std::string const & numeric_string)
+  {
+  source.append("     \n");
+  source.append("     void  \n");
+  source.append("     storeInterval(unsigned int addr,  \n");
+  source.append("                   __local "); source.append(numeric_string); source.append(" * s_left,   \n");
+  source.append("                   __local "); source.append(numeric_string); source.append(" * s_right,  \n");
+  source.append("                   __local unsigned int * s_left_count,  \n");
+  source.append("                   __local unsigned int * s_right_count,  \n");
+  source.append("                    "); source.append(numeric_string); source.append(" left,   \n");
+  source.append("                    "); source.append(numeric_string); source.append(" right,  \n");
+  source.append("                   unsigned int left_count, \n");
+  source.append("                   unsigned int right_count,  \n");
+  source.append("                    "); source.append(numeric_string); source.append("  precision)  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0); \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+
+  source.append("         s_left_count[addr] = left_count;  \n");
+  source.append("         s_right_count[addr] = right_count;  \n");
+
+      // check if interval converged
+  source.append("          "); source.append(numeric_string); source.append(" t0 = fabs(right - left);  \n");
+  source.append("          "); source.append(numeric_string); source.append(" t1 = max(fabs(left), fabs(right)) * precision;  \n");
+
+  source.append("         if (t0 <= max(( "); source.append(numeric_string); source.append(" )VIENNACL_BISECT_MIN_ABS_INTERVAL, t1))  \n");
+  source.append("         {  \n");
+          // compute mid point
+  source.append("              "); source.append(numeric_string); source.append(" lambda = computeMidpoint(left, right);  \n");
+
+          // mark as converged
+  source.append("             s_left[addr] = lambda;  \n");
+  source.append("             s_right[addr] = lambda;  \n");
+  source.append("         }  \n");
+  source.append("         else  \n");
+  source.append("         {  \n");
+
+          // store current limits
+  source.append("             s_left[addr] = left;  \n");
+  source.append("             s_right[addr] = right;  \n");
+  source.append("         }  \n");
+
+  source.append("     }  \n");
+
+  }
+
+  template<typename StringType>
+  void generate_bisect_kernel_storeIntervalShort(StringType & source, std::string const & numeric_string)
+  {
+  source.append("     \n");
+  source.append("     void  \n");
+  source.append("     storeIntervalShort(unsigned int addr,  \n");
+  source.append("                   __local "); source.append(numeric_string); source.append(" * s_left,   \n");
+  source.append("                   __local "); source.append(numeric_string); source.append(" * s_right,  \n");
+  source.append("                   __local unsigned short * s_left_count,  \n");
+  source.append("                   __local unsigned short * s_right_count,  \n");
+  source.append("                    "); source.append(numeric_string); source.append(" left,   \n");
+  source.append("                    "); source.append(numeric_string); source.append(" right,  \n");
+  source.append("                   unsigned int left_count, \n");
+  source.append("                   unsigned int right_count,  \n");
+  source.append("                    "); source.append(numeric_string); source.append("  precision)  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0); \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+
+  source.append("         s_left_count[addr] = left_count;  \n");
+  source.append("         s_right_count[addr] = right_count;  \n");
+
+      // check if interval converged
+  source.append("          "); source.append(numeric_string); source.append(" t0 = fabs(right - left);  \n");
+  source.append("          "); source.append(numeric_string); source.append(" t1 = max(fabs(left), fabs(right)) * precision;  \n");
+
+  source.append("         if (t0 <= max(( "); source.append(numeric_string); source.append(" )VIENNACL_BISECT_MIN_ABS_INTERVAL, t1))  \n");
+  source.append("         {  \n");
+          // compute mid point
+  source.append("              "); source.append(numeric_string); source.append(" lambda = computeMidpoint(left, right);  \n");
+
+          // mark as converged
+  source.append("             s_left[addr] = lambda;  \n");
+  source.append("             s_right[addr] = lambda;  \n");
+  source.append("         }  \n");
+  source.append("         else  \n");
+  source.append("         {  \n");
+
+          // store current limits
+  source.append("             s_left[addr] = left;  \n");
+  source.append("             s_right[addr] = right;  \n");
+  source.append("         }  \n");
+
+  source.append("     }  \n");
+
+
+  }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Compute number of eigenvalues that are smaller than x given a symmetric,
+  // real, and tridiagonal matrix
+  //
+  // g_d                   diagonal elements stored in global memory
+  // g_s                   superdiagonal elements stored in global memory
+  // n                     size of matrix
+  // x                     value for which the number of eigenvalues that are smaller is sought
+  // tid                   thread identified (e.g. threadIdx.x or gtid)
+  // num_intervals_active  number of active intervals / threads that currently process an interval
+  // s_d                   scratch space to store diagonal entries of the tridiagonal matrix in shared memory
+  // s_s                   scratch space to store superdiagonal entries of the tridiagonal matrix in shared memory
+  // converged             flag if the current thread is already converged (that is count does not have to be computed)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template <typename StringType>
+  void generate_bisect_kernel_computeNumSmallerEigenvals(StringType & source, std::string const & numeric_string)
+  {
+  source.append("       \n");
+  source.append("     inline unsigned int  \n");
+  source.append("     computeNumSmallerEigenvals(__global "); source.append(numeric_string); source.append(" *g_d,   \n");
+  source.append("                                __global "); source.append(numeric_string); source.append(" *g_s,   \n");
+  source.append("                                const unsigned int n,  \n");
+  source.append("                                const "); source.append(numeric_string); source.append(" x,         \n");
+  source.append("                                const unsigned int tid,  \n");
+  source.append("                                const unsigned int num_intervals_active,  \n");
+  source.append("                                __local "); source.append(numeric_string); source.append(" *s_d,  \n");
+  source.append("                                __local "); source.append(numeric_string); source.append(" *s_s,  \n");
+  source.append("                                unsigned int converged  \n");
+  source.append("                               )  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0); \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+
+
+  source.append("          "); source.append(numeric_string); source.append(" delta = 1.0f;  \n");
+  source.append("         unsigned int count = 0;  \n");
+
+  source.append("         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+      // read data into shared memory
+  source.append("         if (lcl_id < n)  \n");
+  source.append("         {  \n");
+  source.append("             s_d[lcl_id] = *(g_d + lcl_id);  \n");
+  source.append("             s_s[lcl_id] = *(g_s + lcl_id - 1);  \n");
+  source.append("         }  \n");
+
+  source.append("         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+      // perform loop only for active threads
+  source.append("         if ((tid < num_intervals_active) && (0 == converged))  \n");
+  source.append("         {  \n");
+
+          // perform (optimized) Gaussian elimination to determine the number
+          // of eigenvalues that are smaller than n
+  source.append("             for (unsigned int k = 0; k < n; ++k)  \n");
+  source.append("             {  \n");
+  source.append("                 delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta;  \n");
+  source.append("                 count += (delta < 0) ? 1 : 0;  \n");
+  source.append("             }  \n");
+
+  source.append("         } \n"); // end if thread currently processing an interval
+
+  source.append("         return count;  \n");
+  source.append("     }  \n");
+
+  }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Compute number of eigenvalues that are smaller than x given a symmetric,
+  // real, and tridiagonal matrix
+  //
+  // g_d                   diagonal elements stored in global memory
+  // g_s                   superdiagonal elements stored in global memory
+  // n                     size of matrix
+  // x                     value for which the number of eigenvalues that are smaller is seeked
+  // tid                   thread identified (e.g. threadIdx.x or gtid)
+  // num_intervals_active  number of active intervals / threads that currently process an interval
+  // s_d                   scratch space to store diagonal entries of the tridiagonal matrix in shared memory
+  // s_s                   scratch space to store superdiagonal entries of the tridiagonal matrix in shared memory
+  // converged             flag if the current thread is already converged (that is count does not have to be computed)
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template <typename StringType>
+  void generate_bisect_kernel_computeNumSmallerEigenvalsLarge(StringType & source, std::string const & numeric_string)
+  {
+  source.append("       \n");
+  source.append("     inline unsigned int  \n");
+  source.append("     computeNumSmallerEigenvalsLarge(__global "); source.append(numeric_string); source.append(" *g_d,   \n");
+  source.append("                                __global "); source.append(numeric_string); source.append(" *g_s,   \n");
+  source.append("                                const unsigned int n,  \n");
+  source.append("                                const "); source.append(numeric_string); source.append(" x,         \n");
+  source.append("                                const unsigned int tid,  \n");
+  source.append("                                const unsigned int num_intervals_active,  \n");
+  source.append("                                __local "); source.append(numeric_string); source.append(" *s_d,  \n");
+  source.append("                                __local "); source.append(numeric_string); source.append(" *s_s,  \n");
+  source.append("                                unsigned int converged  \n");
+  source.append("                               )  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0); \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+
+  source.append("          "); source.append(numeric_string); source.append(" delta = 1.0f;  \n");
+  source.append("         unsigned int count = 0;  \n");
+
+  source.append("         unsigned int rem = n;  \n");
+
+      // do until whole diagonal and superdiagonal has been loaded and processed
+  source.append("         for (unsigned int i = 0; i < n; i += lcl_sz)  \n");
+  source.append("         {  \n");
+
+  source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+          // read new chunk of data into shared memory
+  source.append("             if ((i + lcl_id) < n)  \n");
+  source.append("             {  \n");
+
+  source.append("                 s_d[lcl_id] = *(g_d + i + lcl_id);  \n");
+  source.append("                 s_s[lcl_id] = *(g_s + i + lcl_id - 1);  \n");
+  source.append("             }  \n");
+
+  source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+
+  source.append("             if (tid < num_intervals_active)  \n");
+  source.append("             {  \n");
+
+              // perform (optimized) Gaussian elimination to determine the number
+              // of eigenvalues that are smaller than n
+  source.append("                 for (unsigned int k = 0; k < min(rem,lcl_sz); ++k)  \n");
+  source.append("                 {  \n");
+  source.append("                     delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta;  \n");
+                  // delta = (abs( delta) < (1.0e-10)) ? -(1.0e-10) : delta;
+  source.append("                     count += (delta < 0) ? 1 : 0;  \n");
+  source.append("                 }  \n");
+
+  source.append("             } \n"); // end if thread currently processing an interval
+
+  source.append("             rem -= lcl_sz;  \n");
+  source.append("         }  \n");
+
+  source.append("         return count;  \n");
+  source.append("     }  \n");
+
+
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Store all non-empty intervals resulting from the subdivision of the interval
+  // currently processed by the thread
+  //
+  // addr                     base address for storing intervals
+  // num_threads_active       number of threads / intervals in current sweep
+  // s_left                   shared memory storage for left interval limits
+  // s_right                  shared memory storage for right interval limits
+  // s_left_count             shared memory storage for number of eigenvalues less than left interval limits
+  // s_right_count            shared memory storage for number of eigenvalues less than right interval limits
+  // left                     lower limit of interval
+  // mid                      midpoint of interval
+  // right                    upper limit of interval
+  // left_count               eigenvalues less than \a left
+  // mid_count                eigenvalues less than \a mid
+  // right_count              eigenvalues less than \a right
+  // precision                desired precision for eigenvalues
+  // compact_second_chunk     shared mem flag if second chunk is used and ergo requires compaction
+  // s_compaction_list_exc    helper array for stream compaction, s_compaction_list_exc[tid] = 1 when the thread generated two child intervals
+  // is_active_interval       mark is thread has a second non-empty child interval
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename StringType>
+  void generate_bisect_kernel_storeNonEmptyIntervals(StringType & source, std::string const & numeric_string)
+  {
+  source.append("       \n");
+  source.append("     void  \n");
+  source.append("     storeNonEmptyIntervals(unsigned int addr,  \n");
+  source.append("                            const unsigned int num_threads_active,  \n");
+  source.append("                            __local "); source.append(numeric_string); source.append(" *s_left,   \n");
+  source.append("                            __local "); source.append(numeric_string); source.append(" *s_right,  \n");
+  source.append("                            __local unsigned int *s_left_count,  \n");
+  source.append("                            __local unsigned int *s_right_count,  \n");
+  source.append("                             "); source.append(numeric_string); source.append(" left, \n ");
+  source.append("                             "); source.append(numeric_string); source.append(" mid,  \n");
+  source.append("                             "); source.append(numeric_string); source.append(" right,\n");
+  source.append("                            const unsigned int left_count,  \n");
+  source.append("                            const unsigned int mid_count,  \n");
+  source.append("                            const unsigned int right_count,  \n");
+  source.append("                             "); source.append(numeric_string); source.append(" precision,  \n");
+  source.append("                            __local unsigned int *compact_second_chunk,  \n");
+  source.append("                            __local unsigned int *s_compaction_list_exc,  \n");
+  source.append("                            unsigned int *is_active_second)  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0); \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+
+      // check if both child intervals are valid
+  source.append("          \n");
+  source.append("         if ((left_count != mid_count) && (mid_count != right_count))  \n");
+  source.append("         {  \n");
+
+          // store the left interval
+  source.append("             storeInterval(addr, s_left, s_right, s_left_count, s_right_count,  \n");
+  source.append("                           left, mid, left_count, mid_count, precision);  \n");
+
+          // mark that a second interval has been generated, only stored after
+          // stream compaction of second chunk
+  source.append("             *is_active_second = 1;  \n");
+  source.append("             s_compaction_list_exc[lcl_id] = 1;  \n");
+  source.append("             *compact_second_chunk = 1;  \n");
+  source.append("         }  \n");
+  source.append("         else  \n");
+  source.append("         {  \n");
+
+          // only one non-empty child interval
+
+          // mark that no second child
+  source.append("             *is_active_second = 0;  \n");
+  source.append("             s_compaction_list_exc[lcl_id] = 0;  \n");
+
+          // store the one valid child interval
+  source.append("             if (left_count != mid_count)  \n");
+  source.append("             {  \n");
+  source.append("                 storeInterval(addr, s_left, s_right, s_left_count, s_right_count,  \n");
+  source.append("                               left, mid, left_count, mid_count, precision);  \n");
+  source.append("             }  \n");
+  source.append("             else  \n");
+  source.append("             {  \n");
+  source.append("                 storeInterval(addr, s_left, s_right, s_left_count, s_right_count,  \n");
+  source.append("                               mid, right, mid_count, right_count, precision);  \n");
+  source.append("             }  \n");
+
+  source.append("         }  \n");
+  source.append("     }  \n");
+
+  }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  //! Store all non-empty intervals resulting from the subdivision of the interval
+  //! currently processed by the thread
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template <typename StringType>
+  void generate_bisect_kernel_storeNonEmptyIntervalsLarge(StringType & source, std::string const & numeric_string)
+  {
+      source.append("       \n");
+      source.append("     void  \n");
+      source.append("     storeNonEmptyIntervalsLarge(unsigned int addr,  \n");
+      source.append("                            const unsigned int num_threads_active,  \n");
+      source.append("                            __local "); source.append(numeric_string); source.append(" *s_left,   \n");
+      source.append("                            __local "); source.append(numeric_string); source.append(" *s_right,  \n");
+      source.append("                            __local unsigned short *s_left_count,  \n");
+      source.append("                            __local unsigned short *s_right_count,  \n");
+      source.append("                             "); source.append(numeric_string); source.append(" left, \n ");
+      source.append("                             "); source.append(numeric_string); source.append(" mid,  \n");
+      source.append("                             "); source.append(numeric_string); source.append(" right,\n");
+      source.append("                            const unsigned int left_count,  \n");
+      source.append("                            const unsigned int mid_count,  \n");
+      source.append("                            const unsigned int right_count,  \n");
+      source.append("                             "); source.append(numeric_string); source.append(" epsilon,  \n");
+      source.append("                            __local unsigned int *compact_second_chunk,  \n");
+      source.append("                            __local unsigned short *s_compaction_list,  \n");
+      source.append("                            unsigned int *is_active_second)  \n");
+      source.append("     {  \n");
+      source.append("         uint glb_id = get_global_id(0); \n");
+      source.append("         uint grp_id = get_group_id(0); \n");
+      source.append("         uint grp_nm = get_num_groups(0); \n");
+      source.append("         uint lcl_id = get_local_id(0); \n");
+      source.append("         uint lcl_sz = get_local_size(0); \n");
+
+          // check if both child intervals are valid
+      source.append("         if ((left_count != mid_count) && (mid_count != right_count))  \n");
+      source.append("         {  \n");
+
+      source.append("             storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count,  \n");
+      source.append("                           left, mid, left_count, mid_count, epsilon);  \n");
+
+      source.append("             *is_active_second = 1;  \n");
+      source.append("             s_compaction_list[lcl_id] = 1;  \n");
+      source.append("             *compact_second_chunk = 1;  \n");
+      source.append("         }  \n");
+      source.append("         else  \n");
+      source.append("         {  \n");
+
+              // only one non-empty child interval
+
+              // mark that no second child
+      source.append("             *is_active_second = 0;  \n");
+      source.append("             s_compaction_list[lcl_id] = 0;  \n");
+
+              // store the one valid child interval
+      source.append("             if (left_count != mid_count)  \n");
+      source.append("             {  \n");
+      source.append("                 storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count,  \n");
+      source.append("                               left, mid, left_count, mid_count, epsilon);  \n");
+      source.append("             }  \n");
+      source.append("             else  \n");
+      source.append("             {  \n");
+      source.append("                 storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count,  \n");
+      source.append("                               mid, right, mid_count, right_count, epsilon);  \n");
+      source.append("             }  \n");
+      source.append("         }  \n");
+      source.append("     }  \n");
+  }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Create indices for compaction, that is process \a s_compaction_list_exc
+  // which is 1 for intervals that generated a second child and 0 otherwise
+  // and create for each of the non-zero elements the index where the new
+  // interval belongs to in a compact representation of all generated second children
+  //
+  // s_compaction_list_exc      list containing the flags which threads generated two children
+  // num_threads_compaction     number of threads to employ for compaction
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template<typename StringType>
+  void generate_bisect_kernel_createIndicesCompaction(StringType & source)
+  {
+  source.append("       \n");
+  source.append("     void  \n");
+  source.append("     createIndicesCompaction(__local unsigned int *s_compaction_list_exc,  \n");
+  source.append("                             unsigned int num_threads_compaction)  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0); \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+
+
+  source.append("         unsigned int offset = 1;  \n");
+  source.append("         const unsigned int tid = lcl_id;  \n");
+     // if(tid == 0)
+       // printf("num_threads_compaction = %u\n", num_threads_compaction);
+
+      // higher levels of scan tree
+  source.append("         for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)  \n");
+  source.append("         {  \n");
+
+  source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+  source.append("             if (tid < d)  \n");
+  source.append("             {  \n");
+
+  source.append("                 unsigned int  ai = offset*(2*tid+1)-1;  \n");
+  source.append("                 unsigned int  bi = offset*(2*tid+2)-1;  \n");
+  source.append("              \n");
+  source.append("                 s_compaction_list_exc[bi] =   s_compaction_list_exc[bi]  \n");
+  source.append("                                               + s_compaction_list_exc[ai];  \n");
+  source.append("             }  \n");
+
+  source.append("             offset <<= 1;  \n");
+  source.append("         }  \n");
+
+      // traverse down tree: first down to level 2 across
+  source.append("         for (int d = 2; d < num_threads_compaction; d <<= 1)  \n");
+  source.append("         {  \n");
+
+  source.append("             offset >>= 1;  \n");
+  source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+  source.append("             if (tid < (d-1))  \n");
+  source.append("             {  \n");
+
+  source.append("                 unsigned int  ai = offset*(tid+1) - 1;  \n");
+  source.append("                 unsigned int  bi = ai + (offset >> 1);  \n");
+
+  source.append("                 s_compaction_list_exc[bi] =   s_compaction_list_exc[bi]  \n");
+  source.append("                                               + s_compaction_list_exc[ai];  \n");
+  source.append("             }  \n");
+  source.append("         }  \n");
+
+  source.append("         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+  source.append("     }  \n");
+  }
+
+
+  template<typename StringType>
+  void generate_bisect_kernel_createIndicesCompactionShort(StringType & source)
+  {
+  source.append("       \n");
+  source.append("     void  \n");
+  source.append("     createIndicesCompactionShort(__local unsigned short *s_compaction_list_exc,  \n");
+  source.append("                             unsigned int num_threads_compaction)  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0); \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+
+
+  source.append("         unsigned int offset = 1;  \n");
+  source.append("         const unsigned int tid = lcl_id;  \n");
+
+      // higher levels of scan tree
+  source.append("         for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)  \n");
+  source.append("         {  \n");
+
+  source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+  source.append("             if (tid < d)  \n");
+  source.append("             {  \n");
+
+  source.append("                 unsigned int  ai = offset*(2*tid+1)-1;  \n");
+  source.append("                 unsigned int  bi = offset*(2*tid+2)-1;  \n");
+  source.append("              \n");
+  source.append("                 s_compaction_list_exc[bi] =   s_compaction_list_exc[bi]  \n");
+  source.append("                                               + s_compaction_list_exc[ai];  \n");
+  source.append("             }  \n");
+
+  source.append("             offset <<= 1;  \n");
+  source.append("         }  \n");
+
+      // traverse down tree: first down to level 2 across
+  source.append("         for (int d = 2; d < num_threads_compaction; d <<= 1)  \n");
+  source.append("         {  \n");
+
+  source.append("             offset >>= 1;  \n");
+  source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+  source.append("             if (tid < (d-1))  \n");
+  source.append("             {  \n");
+
+  source.append("                 unsigned int  ai = offset*(tid+1) - 1;  \n");
+  source.append("                 unsigned int  bi = ai + (offset >> 1);  \n");
+
+  source.append("                 s_compaction_list_exc[bi] =   s_compaction_list_exc[bi]  \n");
+  source.append("                                               + s_compaction_list_exc[ai];  \n");
+  source.append("             }  \n");
+  source.append("         }  \n");
+
+  source.append("         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+  source.append("     }  \n");
+  }
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Perform stream compaction for second child intervals
+  //
+  // s_left              shared memory storage for left interval limits
+  // s_right             shared memory storage for right interval limits
+  // s_left_count        shared memory storage for number of eigenvalues less than left interval limits
+  // s_right_count       shared memory storage for number of eigenvalues less than right interval limits
+  // mid                 midpoint of current interval (left of new interval)
+  // right               upper limit of interval
+  // mid_count           eigenvalues less than \a mid
+  // s_compaction_list   list containing the indices where the data has to be stored
+  // num_threads_active  number of active threads / intervals
+  // is_active_interval  mark is thread has a second non-empty child interval
+  ///////////////////////////////////////////////////////////////////////////////
+
+
+  template<typename StringType>
+  void generate_bisect_kernel_compactIntervals(StringType & source, std::string const & numeric_string)
+  {
+  source.append("       \n");
+  source.append("     void  \n");
+  source.append("     compactIntervals(__local "); source.append(numeric_string); source.append(" *s_left,  \n");
+  source.append("                      __local "); source.append(numeric_string); source.append(" *s_right, \n");
+  source.append("                      __local unsigned int *s_left_count, \n");
+  source.append("                      __local unsigned int *s_right_count,  \n");
+  source.append("                       "); source.append(numeric_string); source.append(" mid,  \n");
+  source.append("                       "); source.append(numeric_string); source.append(" right, \n");
+  source.append("                      unsigned int mid_count, unsigned int right_count,  \n");
+  source.append("                      __local unsigned int *s_compaction_list,  \n");
+  source.append("                      unsigned int num_threads_active,  \n");
+  source.append("                      unsigned int is_active_second)  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0); \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+
+  source.append("         const unsigned int tid = lcl_id;  \n");
+
+      // perform compaction / copy data for all threads where the second
+      // child is not dead
+  source.append("         if ((tid < num_threads_active) && (1 == is_active_second))  \n");
+  source.append("         {  \n");
+  source.append("             unsigned int addr_w = num_threads_active + s_compaction_list[tid];  \n");
+  source.append("             s_left[addr_w] = mid;  \n");
+  source.append("             s_right[addr_w] = right;  \n");
+  source.append("             s_left_count[addr_w] = mid_count;  \n");
+  source.append("             s_right_count[addr_w] = right_count;  \n");
+  source.append("         }  \n");
+  source.append("     }  \n");
+  }
+
+
+
+
+  template<typename StringType>
+  void generate_bisect_kernel_compactIntervalsShort(StringType & source, std::string const & numeric_string)
+  {
+  source.append("       \n");
+  source.append("     void  \n");
+  source.append("     compactIntervalsShort(__local "); source.append(numeric_string); source.append(" *s_left,  \n");
+  source.append("                      __local "); source.append(numeric_string); source.append(" *s_right,  \n");
+  source.append("                      __local unsigned short *s_left_count, \n");
+  source.append("                      __local unsigned short *s_right_count,  \n");
+  source.append("                      "); source.append(numeric_string); source.append(" mid,   \n");
+  source.append("                      "); source.append(numeric_string); source.append(" right, \n");
+  source.append("                      unsigned int mid_count, unsigned int right_count,  \n");
+  source.append("                      __local unsigned short *s_compaction_list,  \n");
+  source.append("                      unsigned int num_threads_active,  \n");
+  source.append("                      unsigned int is_active_second)  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0); \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+
+  source.append("         const unsigned int tid = lcl_id;  \n");
+
+      // perform compaction / copy data for all threads where the second
+      // child is not dead
+  source.append("         if ((tid < num_threads_active) && (1 == is_active_second))  \n");
+  source.append("         {  \n");
+  source.append("             unsigned int addr_w = num_threads_active + s_compaction_list[tid];  \n");
+  source.append("             s_left[addr_w] = mid;  \n");
+  source.append("             s_right[addr_w] = right;  \n");
+  source.append("             s_left_count[addr_w] = mid_count;  \n");
+  source.append("             s_right_count[addr_w] = right_count;  \n");
+  source.append("         }  \n");
+  source.append("     }  \n");
+  }
+
+
+
+  template<typename StringType>
+  void generate_bisect_kernel_storeIntervalConverged(StringType & source, std::string const & numeric_string)
+  {
+  source.append("       \n");
+  source.append("     void  \n");
+  source.append("     storeIntervalConverged( __local "); source.append(numeric_string); source.append(" *s_left,   \n");
+  source.append("                             __local "); source.append(numeric_string); source.append(" *s_right,   \n");
+  source.append("                            __local unsigned int *s_left_count, \n");
+  source.append("                            __local unsigned int *s_right_count,  \n");
+  source.append("                            "); source.append(numeric_string); source.append(" *left,   \n");
+  source.append("                            "); source.append(numeric_string); source.append(" *mid,   \n");
+  source.append("                            "); source.append(numeric_string); source.append(" *right,   \n");
+  source.append("                            unsigned int *left_count,     \n");
+  source.append("                            unsigned int *mid_count,      \n");
+  source.append("                            unsigned int *right_count,     \n");
+  source.append("                            __local unsigned int *s_compaction_list_exc,  \n");
+  source.append("                            __local unsigned int *compact_second_chunk,  \n");
+  source.append("                            const unsigned int num_threads_active,  \n");
+  source.append("                            unsigned int *is_active_second)  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0); \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+
+  source.append("         const unsigned int tid = lcl_id;  \n");
+  source.append("         const unsigned int multiplicity = *right_count - *left_count;  \n");
+      // check multiplicity of eigenvalue
+  source.append("         if (1 == multiplicity)  \n");
+  source.append("         {  \n");
+
+          // just re-store intervals, simple eigenvalue
+  source.append("             s_left[tid] = *left;  \n");
+  source.append("             s_right[tid] = *right;  \n");
+  source.append("             s_left_count[tid] = *left_count;  \n");
+  source.append("             s_right_count[tid] = *right_count;  \n");
+  source.append("             \n");
+
+          // mark that no second child / clear
+  source.append("             *is_active_second = 0;  \n");
+  source.append("             s_compaction_list_exc[tid] = 0;  \n");
+  source.append("         }  \n");
+  source.append("         else  \n");
+  source.append("         {  \n");
+
+          // number of eigenvalues after the split less than mid
+  source.append("             *mid_count = *left_count + (multiplicity >> 1);  \n");
+
+          // store left interval
+  source.append("             s_left[tid] = *left;  \n");
+  source.append("             s_right[tid] = *right;  \n");
+  source.append("             s_left_count[tid] = *left_count;  \n");
+  source.append("             s_right_count[tid] = *mid_count;  \n");
+  source.append("             *mid = *left;  \n");
+
+          // mark that second child interval exists
+  source.append("             *is_active_second = 1;  \n");
+  source.append("             s_compaction_list_exc[tid] = 1;  \n");
+  source.append("             *compact_second_chunk = 1;  \n");
+  source.append("         }  \n");
+  source.append("     }  \n");
+  }
+
+
+
+
+
+  template<typename StringType>
+  void generate_bisect_kernel_storeIntervalConvergedShort(StringType & source, std::string const & numeric_string)
+  {
+  source.append("       \n");
+  source.append("     void  \n");
+  source.append("     storeIntervalConvergedShort(__local "); source.append(numeric_string); source.append(" *s_left,   \n");
+  source.append("                             __local "); source.append(numeric_string); source.append(" *s_right,   \n");
+  source.append("                            __local unsigned short *s_left_count, \n");
+  source.append("                            __local unsigned short *s_right_count,  \n");
+  source.append("                            "); source.append(numeric_string); source.append(" *left,   \n");
+  source.append("                            "); source.append(numeric_string); source.append(" *mid,   \n");
+  source.append("                            "); source.append(numeric_string); source.append(" *right,   \n");
+  source.append("                            unsigned int *left_count,     \n");
+  source.append("                            unsigned int *mid_count,      \n");
+  source.append("                            unsigned int *right_count,     \n");
+  source.append("                            __local unsigned short *s_compaction_list_exc,  \n");
+  source.append("                            __local unsigned int *compact_second_chunk,  \n");
+  source.append("                            const unsigned int num_threads_active,  \n");
+  source.append("                            unsigned int *is_active_second)  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0); \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+
+  source.append("         const unsigned int tid = lcl_id;  \n");
+  source.append("         const unsigned int multiplicity = *right_count - *left_count;  \n");
+      // check multiplicity of eigenvalue
+  source.append("         if (1 == multiplicity)  \n");
+  source.append("         {  \n");
+
+          // just re-store intervals, simple eigenvalue
+  source.append("             s_left[tid] = *left;  \n");
+  source.append("             s_right[tid] = *right;  \n");
+  source.append("             s_left_count[tid] = *left_count;  \n");
+  source.append("             s_right_count[tid] = *right_count;  \n");
+  source.append("             \n");
+
+          // mark that no second child / clear
+  source.append("             *is_active_second = 0;  \n");
+  source.append("             s_compaction_list_exc[tid] = 0;  \n");
+  source.append("         }  \n");
+  source.append("         else  \n");
+  source.append("         {  \n");
+
+          // number of eigenvalues after the split less than mid
+  source.append("             *mid_count = *left_count + (multiplicity >> 1);  \n");
+
+          // store left interval
+  source.append("             s_left[tid] = *left;  \n");
+  source.append("             s_right[tid] = *right;  \n");
+  source.append("             s_left_count[tid] = *left_count;  \n");
+  source.append("             s_right_count[tid] = *mid_count;  \n");
+  source.append("             *mid = *left;  \n");
+
+          // mark that second child interval exists
+  source.append("             *is_active_second = 1;  \n");
+  source.append("             s_compaction_list_exc[tid] = 1;  \n");
+  source.append("             *compact_second_chunk = 1;  \n");
+  source.append("         }  \n");
+  source.append("     }  \n");
+  }
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Subdivide interval if active and not already converged
+  //
+  // tid                    id of thread
+  // s_left                 shared memory storage for left interval limits
+  // s_right                shared memory storage for right interval limits
+  // s_left_count           shared memory storage for number of eigenvalues less than left interval limits
+  // s_right_count          shared memory storage for number of eigenvalues less than right interval limits
+  // num_threads_active     number of active threads in warp
+  // left                   lower limit of interval
+  // right                  upper limit of interval
+  // left_count             eigenvalues less than \a left
+  // right_count            eigenvalues less than \a right
+  // all_threads_converged  shared memory flag if all threads are converged
+  ///////////////////////////////////////////////////////////////////////////////
+
+
+  template<typename StringType>
+  void generate_bisect_kernel_subdivideActiveInterval(StringType & source, std::string const & numeric_string)
+  {
+  source.append("       \n");
+  source.append("     void  \n");
+  source.append("     subdivideActiveIntervalMulti(const unsigned int tid,  \n");
+  source.append("                             __local "); source.append(numeric_string); source.append(" *s_left,    \n");
+  source.append("                             __local "); source.append(numeric_string); source.append(" *s_right,   \n");
+  source.append("                             __local unsigned int *s_left_count,   \n");
+  source.append("                             __local unsigned int *s_right_count,  \n");
+  source.append("                             const unsigned int num_threads_active,  \n");
+  source.append("                              "); source.append(numeric_string); source.append(" *left,   \n");
+  source.append("                              "); source.append(numeric_string); source.append(" *right,   \n");
+  source.append("                             unsigned int *left_count, unsigned int *right_count,  \n");
+  source.append("                              "); source.append(numeric_string); source.append(" *mid,    \n");
+  source.append("                              __local unsigned int *all_threads_converged)  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0); \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+
+      // for all active threads
+  source.append("         if (tid < num_threads_active)  \n");
+  source.append("         {  \n");
+
+  source.append("             *left = s_left[tid];  \n");
+  source.append("             *right = s_right[tid];  \n");
+  source.append("             *left_count = s_left_count[tid];  \n");
+  source.append("             *right_count = s_right_count[tid];  \n");
+
+          // check if thread already converged
+  source.append("             if (*left != *right)  \n");
+  source.append("             {  \n");
+
+  source.append("                 *mid = computeMidpoint(*left, *right);  \n");
+  source.append("                 *all_threads_converged = 0;  \n");
+  source.append("             }  \n");
+  source.append("             else if ((*right_count - *left_count) > 1)  \n");
+  source.append("             {  \n");
+              // mark as not converged if multiple eigenvalues enclosed
+              // duplicate interval in storeIntervalsConverged()
+  source.append("                 *all_threads_converged = 0;  \n");
+  source.append("             }  \n");
+
+  source.append("         }    \n");
+  // end for all active threads
+  source.append("     }  \n");
+  }
+
+
+  template<typename StringType>
+  void generate_bisect_kernel_subdivideActiveIntervalShort(StringType & source, std::string const & numeric_string)
+  {
+  source.append("       \n");
+  source.append("     void  \n");
+  source.append("     subdivideActiveIntervalShort(const unsigned int tid,  \n");
+  source.append("                             __local "); source.append(numeric_string); source.append(" *s_left,    \n");
+  source.append("                             __local "); source.append(numeric_string); source.append(" *s_right,   \n");
+  source.append("                             __local unsigned short *s_left_count,   \n");
+  source.append("                             __local unsigned short *s_right_count,  \n");
+  source.append("                             const unsigned int num_threads_active,  \n");
+  source.append("                             "); source.append(numeric_string); source.append(" *left,   \n");
+  source.append("                             "); source.append(numeric_string); source.append(" *right,   \n");
+  source.append("                             unsigned int *left_count, unsigned int *right_count,  \n");
+  source.append("                             "); source.append(numeric_string); source.append(" *mid,    \n");
+  source.append("                             __local unsigned int *all_threads_converged)  \n");
+  source.append("     {  \n");
+  source.append("         uint glb_id = get_global_id(0); \n");
+  source.append("         uint grp_id = get_group_id(0); \n");
+  source.append("         uint grp_nm = get_num_groups(0); \n");
+  source.append("         uint lcl_id = get_local_id(0); \n");
+  source.append("         uint lcl_sz = get_local_size(0); \n");
+
+      // for all active threads
+  source.append("         if (tid < num_threads_active)  \n");
+  source.append("         {  \n");
+
+  source.append("             *left = s_left[tid];  \n");
+  source.append("             *right = s_right[tid];  \n");
+  source.append("             *left_count = s_left_count[tid];  \n");
+  source.append("             *right_count = s_right_count[tid];  \n");
+
+          // check if thread already converged
+  source.append("             if (*left != *right)  \n");
+  source.append("             {  \n");
+
+  source.append("                 *mid = computeMidpoint(*left, *right);  \n");
+  source.append("                 *all_threads_converged = 0;  \n");
+  source.append("             }  \n");
+
+  source.append("         }    \n");
+  // end for all active threads
+  source.append("     }  \n");
+  }
+
+  // end of utilities
+  // start of kernels
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Bisection to find eigenvalues of a real, symmetric, and tridiagonal matrix
+  //
+  // g_d             diagonal elements in global memory
+  // g_s             superdiagonal elements in global elements (stored so that the element *(g_s - 1) can be accessed an equals 0
+  // n               size of matrix
+  // lg              lower bound of input interval (e.g. Gerschgorin interval)
+  // ug              upper bound of input interval (e.g. Gerschgorin interval)
+  // lg_eig_count    number of eigenvalues that are smaller than \a lg
+  // lu_eig_count    number of eigenvalues that are smaller than \a lu
+  // epsilon         desired accuracy of eigenvalues to compute
+  ////////////////////////////////////////////////////////////////////////////////
+  ///
+  template <typename StringType>
+  void generate_bisect_kernel_bisectKernel(StringType & source, std::string const & numeric_string)
+  {
+      source.append("     __kernel  \n");
+      source.append("     void  \n");
+      source.append("     bisectKernelSmall(__global "); source.append(numeric_string); source.append(" *g_d,   \n");
+      source.append("                  __global "); source.append(numeric_string); source.append(" *g_s,   \n");
+      source.append("                  const unsigned int n,  \n");
+      source.append("                  __global "); source.append(numeric_string); source.append(" *g_left,   \n");
+      source.append("                  __global "); source.append(numeric_string); source.append(" *g_right,  \n");
+      source.append("                  __global unsigned int *g_left_count, __global unsigned int *g_right_count,  \n");
+      source.append("                  const "); source.append(numeric_string); source.append(" lg,  \n");
+      source.append("                  const "); source.append(numeric_string); source.append(" ug,  \n");
+      source.append("                  const unsigned int lg_eig_count, const unsigned int ug_eig_count, \n");
+      source.append("                  "); source.append(numeric_string); source.append(" epsilon  \n");
+      source.append("                 )  \n");
+      source.append("     {  \n");
+      source.append("         g_s = g_s + 1; \n");
+      source.append("         uint glb_id = get_global_id(0); \n");
+      source.append("         uint grp_id = get_group_id(0); \n");
+      source.append("         uint grp_nm = get_num_groups(0); \n");
+      source.append("         uint lcl_id = get_local_id(0); \n");
+      source.append("         uint lcl_sz = get_local_size(0); \n");
+
+          // intervals (store left and right because the subdivision tree is in general
+          // not dense
+      source.append("         __local "); source.append(numeric_string); source.append(" s_left[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];  \n");
+      source.append("         __local "); source.append(numeric_string); source.append(" s_right[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];  \n");
+
+          // number of eigenvalues that are smaller than s_left / s_right
+          // (correspondence is realized via indices)
+      source.append("         __local  unsigned int  s_left_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];  \n");
+      source.append("         __local  unsigned int  s_right_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];  \n");
+
+          // helper for stream compaction
+      source.append("         __local  unsigned int  \n");
+      source.append("           s_compaction_list[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX + 1];  \n");
+
+          // state variables for whole block
+          // if 0 then compaction of second chunk of child intervals is not necessary
+          // (because all intervals had exactly one non-dead child)
+      source.append("         __local  unsigned int compact_second_chunk;  \n");
+      source.append("         __local  unsigned int all_threads_converged;  \n");
+
+          // number of currently active threads
+      source.append("         __local  unsigned int num_threads_active;  \n");
+
+          // number of threads to use for stream compaction
+      source.append("         __local  unsigned int num_threads_compaction;  \n");
+
+          // helper for exclusive scan
+      source.append("         __local unsigned int *s_compaction_list_exc = s_compaction_list + 1;  \n");
+
+
+          // variables for currently processed interval
+          // left and right limit of active interval
+      source.append("          "); source.append(numeric_string); source.append(" left = 0.0f;  \n");
+      source.append("          "); source.append(numeric_string); source.append(" right = 0.0f;  \n");
+      source.append("         unsigned int left_count = 0;  \n");
+      source.append("         unsigned int right_count = 0;  \n");
+          // midpoint of active interval
+      source.append("          "); source.append(numeric_string); source.append(" mid = 0.0f;  \n");
+          // number of eigenvalues smaller then mid
+      source.append("         unsigned int mid_count = 0;  \n");
+          // affected from compaction
+      source.append("         unsigned int  is_active_second = 0;  \n");
+
+      source.append("         s_compaction_list[lcl_id] = 0;  \n");
+      source.append("         s_left[lcl_id] = 0.0;  \n");
+      source.append("         s_right[lcl_id] = 0.0;  \n");
+      source.append("         s_left_count[lcl_id] = 0;  \n");
+      source.append("         s_right_count[lcl_id] = 0;  \n");
+
+      source.append("         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+          // set up initial configuration
+      source.append("         if (0 == lcl_id)  \n");
+      source.append("         {  \n");
+      source.append("             s_left[0] = lg;  \n");
+      source.append("             s_right[0] = ug;  \n");
+      source.append("             s_left_count[0] = lg_eig_count;  \n");
+      source.append("             s_right_count[0] = ug_eig_count;  \n");
+
+      source.append("             compact_second_chunk = 0;  \n");
+      source.append("             num_threads_active = 1;  \n");
+
+      source.append("             num_threads_compaction = 1;  \n");
+      source.append("         }  \n");
+
+          // for all active threads read intervals from the last level
+          // the number of (worst case) active threads per level l is 2^l
+
+      source.append("         while (true)  \n");
+      source.append("         {  \n");
+
+      source.append("             all_threads_converged = 1;  \n");
+      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+      source.append("             is_active_second = 0;  \n");
+      source.append("             subdivideActiveIntervalMulti(lcl_id,  \n");
+      source.append("                                     s_left, s_right, s_left_count, s_right_count,  \n");
+      source.append("                                     num_threads_active,  \n");
+      source.append("                                     &left, &right, &left_count, &right_count,  \n");
+      source.append("                                     &mid, &all_threads_converged);  \n");
+   //   source.append("             output[lcl_id] = s_left;  \n");
+      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+              // check if done
+      source.append("             if (1 == all_threads_converged)  \n");
+      source.append("             {  \n");
+      source.append("                 break;  \n");
+      source.append("             }  \n");
+
+      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+              // compute number of eigenvalues smaller than mid
+              // use all threads for reading the necessary matrix data from global
+              // memory
+              // use s_left and s_right as scratch space for diagonal and
+              // superdiagonal of matrix
+      source.append("             mid_count = computeNumSmallerEigenvals(g_d, g_s, n, mid,  \n");
+      source.append("                                                    lcl_id, num_threads_active,  \n");
+      source.append("                                                    s_left, s_right,  \n");
+      source.append("                                                    (left == right));  \n");
+
+      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+              // store intervals
+              // for all threads store the first child interval in a continuous chunk of
+              // memory, and the second child interval -- if it exists -- in a second
+              // chunk; it is likely that all threads reach convergence up to
+              // \a epsilon at the same level; furthermore, for higher level most / all
+              // threads will have only one child, storing the first child compactly will
+              // (first) avoid to perform a compaction step on the first chunk, (second)
+              // make it for higher levels (when all threads / intervals have
+              // exactly one child)  unnecessary to perform a compaction of the second
+              // chunk
+      source.append("             if (lcl_id < num_threads_active)  \n");
+      source.append("             {  \n");
+
+      source.append("                 if (left != right)  \n");
+      source.append("                 {  \n");
+
+                      // store intervals
+      source.append("                     storeNonEmptyIntervals(lcl_id, num_threads_active,  \n");
+      source.append("                                            s_left, s_right, s_left_count, s_right_count,  \n");
+      source.append("                                            left, mid, right,  \n");
+      source.append("                                            left_count, mid_count, right_count,  \n");
+      source.append("                                            epsilon, &compact_second_chunk,  \n");
+      source.append("                                            s_compaction_list_exc,  \n");
+      source.append("                                            &is_active_second);  \n");
+      source.append("                 }  \n");
+      source.append("                 else  \n");
+      source.append("                 {  \n");
+
+      source.append("                     storeIntervalConverged(s_left, s_right, s_left_count, s_right_count,  \n");
+      source.append("                                            &left, &mid, &right,  \n");
+      source.append("                                            &left_count, &mid_count, &right_count,  \n");
+      source.append("                                            s_compaction_list_exc, &compact_second_chunk,  \n");
+      source.append("                                            num_threads_active,  \n");
+      source.append("                                            &is_active_second);  \n");
+      source.append("                 }  \n");
+      source.append("             }  \n");
+
+              // necessary so that compact_second_chunk is up-to-date
+      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+              // perform compaction of chunk where second children are stored
+              // scan of (num_threads_actieigenvaluesve / 2) elements, thus at most
+              // (num_threads_active / 4) threads are needed
+      source.append("             if (compact_second_chunk > 0)  \n");
+      source.append("             {  \n");
+
+      source.append("                 createIndicesCompaction(s_compaction_list_exc, num_threads_compaction);  \n");
+
+      source.append("                 compactIntervals(s_left, s_right, s_left_count, s_right_count,  \n");
+      source.append("                                  mid, right, mid_count, right_count,  \n");
+      source.append("                                  s_compaction_list, num_threads_active,  \n");
+      source.append("                                  is_active_second);  \n");
+      source.append("             }  \n");
+
+      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+      source.append("             if (0 == lcl_id)  \n");
+      source.append("             {  \n");
+
+                  // update number of active threads with result of reduction
+      source.append("                 num_threads_active += s_compaction_list[num_threads_active];  \n");
+
+      source.append("                 num_threads_compaction = ceilPow2(num_threads_active);  \n");
+
+      source.append("                 compact_second_chunk = 0;  \n");
+      source.append("             }  \n");
+
+      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+      source.append("         }  \n");
+
+      source.append("         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+          // write resulting intervals to global mem
+          // for all threads write if they have been converged to an eigenvalue to
+          // a separate array
+
+          // at most n valid intervals
+      source.append("         if (lcl_id < n)  \n");
+      source.append("         {  \n");
+              // intervals converged so left and right limit are identical
+      source.append("             g_left[lcl_id]  = s_left[lcl_id];  \n");
+              // left count is sufficient to have global order
+      source.append("             g_left_count[lcl_id]  = s_left_count[lcl_id];  \n");
+      source.append("         }  \n");
+      source.append("     }  \n");
+     }
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Perform second step of bisection algorithm for large matrices for intervals that after the first step contained more than one eigenvalue
+  //
+  // g_d              diagonal elements of symmetric, tridiagonal matrix
+  // g_s              superdiagonal elements of symmetric, tridiagonal matrix
+  // n                matrix size
+  // blocks_mult      start addresses of blocks of intervals that are processed by one block of threads, each of the intervals contains more than one eigenvalue
+  // blocks_mult_sum  total number of eigenvalues / singleton intervals in one block of intervals
+  // g_left           left limits of intervals
+  // g_right          right limits of intervals
+  // g_left_count     number of eigenvalues less than left limits
+  // g_right_count    number of eigenvalues less than right limits
+  // g_lambda         final eigenvalue
+  // g_pos            index of eigenvalue (in ascending order)
+  // precision         desired precision of eigenvalues
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template <typename StringType>
+  void generate_bisect_kernel_bisectKernelLarge_MultIntervals(StringType & source, std::string const & numeric_string)
+  {
+      source.append("     __kernel  \n");
+      source.append("     void  \n");
+      source.append("     bisectKernelLarge_MultIntervals(__global "); source.append(numeric_string); source.append(" *g_d,   \n");
+      source.append("                                     __global "); source.append(numeric_string); source.append(" *g_s,   \n");
+      source.append("                                     const unsigned int n,  \n");
+      source.append("                                     __global unsigned int *blocks_mult,  \n");
+      source.append("                                     __global unsigned int *blocks_mult_sum,  \n");
+      source.append("                                     __global "); source.append(numeric_string); source.append(" *g_left,   \n");
+      source.append("                                     __global "); source.append(numeric_string); source.append(" *g_right,  \n");
+      source.append("                                     __global unsigned int *g_left_count,  \n");
+      source.append("                                     __global unsigned int *g_right_count,  \n");
+      source.append("                                     __global  "); source.append(numeric_string); source.append(" *g_lambda, \n");
+      source.append("                                     __global unsigned int *g_pos,  \n");
+      source.append("                                     "); source.append(numeric_string); source.append(" precision  \n");
+      source.append("                                    )  \n");
+      source.append("     {  \n");
+      source.append("         g_s = g_s + 1; \n");
+      source.append("         uint glb_id = get_global_id(0); \n");
+      source.append("         uint grp_id = get_group_id(0); \n");
+      source.append("         uint grp_nm = get_num_groups(0); \n");
+      source.append("         uint lcl_id = get_local_id(0); \n");
+      source.append("         uint lcl_sz = get_local_size(0); \n");
+
+      source.append("       const unsigned int tid = lcl_id;  \n");
+
+          // left and right limits of interval
+      source.append("         __local "); source.append(numeric_string); source.append(" s_left[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];  \n");
+      source.append("         __local "); source.append(numeric_string); source.append(" s_right[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];  \n");
+
+          // number of eigenvalues smaller than interval limits
+      source.append("         __local  unsigned int  s_left_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];  \n");
+      source.append("         __local  unsigned int  s_right_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];  \n");
+
+          // helper array for chunk compaction of second chunk
+      source.append("         __local  unsigned int  s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];  \n");
+          // compaction list helper for exclusive scan
+      source.append("         __local unsigned int *s_compaction_list_exc = s_compaction_list + 1;  \n");
+
+          // flag if all threads are converged
+      source.append("         __local  unsigned int  all_threads_converged;  \n");
+          // number of active threads
+      source.append("         __local  unsigned int  num_threads_active;  \n");
+          // number of threads to employ for compaction
+      source.append("         __local  unsigned int  num_threads_compaction;  \n");
+          // flag if second chunk has to be compacted
+      source.append("         __local  unsigned int compact_second_chunk;  \n");
+
+          // parameters of block of intervals processed by this block of threads
+      source.append("         __local  unsigned int  c_block_start;  \n");
+      source.append("         __local  unsigned int  c_block_end;  \n");
+      source.append("         __local  unsigned int  c_block_offset_output;  \n");
+
+          // midpoint of currently active interval of the thread
+      source.append("         "); source.append(numeric_string); source.append(" mid = 0.0f;  \n");
+          // number of eigenvalues smaller than \a mid
+      source.append("         unsigned int  mid_count = 0;  \n");
+          // current interval parameter
+      source.append("         "); source.append(numeric_string); source.append(" left = 0.0f;  \n");
+      source.append("         "); source.append(numeric_string); source.append(" right = 0.0f;  \n");
+      source.append("         unsigned int  left_count = 0;  \n");
+      source.append("         unsigned int  right_count = 0;  \n");
+          // helper for compaction, keep track which threads have a second child
+      source.append("         unsigned int  is_active_second = 0;  \n");
+
+      source.append("         barrier(CLK_LOCAL_MEM_FENCE);            \n");
+
+          // initialize common start conditions
+      source.append("         if (0 == tid)  \n");
+      source.append("         {  \n");
+
+      source.append("             c_block_start = blocks_mult[grp_id];  \n");
+      source.append("             c_block_end = blocks_mult[grp_id + 1];  \n");
+      source.append("             c_block_offset_output = blocks_mult_sum[grp_id];  \n");
+      source.append("               \n");
+
+      source.append("             num_threads_active = c_block_end - c_block_start;  \n");
+      source.append("             s_compaction_list[0] = 0;  \n");
+      source.append("             num_threads_compaction = ceilPow2(num_threads_active);  \n");
+
+      source.append("             all_threads_converged = 1;  \n");
+      source.append("             compact_second_chunk = 0;  \n");
+      source.append("         }  \n");
+      source.append("          s_left_count [tid] = 42;  \n");
+      source.append("          s_right_count[tid] = 42;  \n");
+      source.append("          s_left_count [tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;  \n");
+      source.append("          s_right_count[tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;  \n");
+      source.append("           \n");
+      source.append("         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+      source.append("           \n");
+
+          // read data into shared memory
+      source.append("         if (tid < num_threads_active)  \n");
+      source.append("         {  \n");
+
+      source.append("             s_left[tid]  = g_left[c_block_start + tid];  \n");
+      source.append("             s_right[tid] = g_right[c_block_start + tid];  \n");
+      source.append("             s_left_count[tid]  = g_left_count[c_block_start + tid];  \n");
+      source.append("             s_right_count[tid] = g_right_count[c_block_start + tid];  \n");
+      source.append("               \n");
+      source.append("         }  \n");
+      source.append("        \n");
+      source.append("         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+      source.append("         unsigned int iter = 0;  \n");
+          // do until all threads converged
+      source.append("         while (true)  \n");
+      source.append("         {  \n");
+      source.append("             iter++;  \n");
+              //for (int iter=0; iter < 0; iter++) {
+      source.append("             s_compaction_list[lcl_id] = 0;  \n");
+      source.append("             s_compaction_list[lcl_id + lcl_sz] = 0;  \n");
+      source.append("             s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;  \n");
+
+              // subdivide interval if currently active and not already converged
+      source.append("             subdivideActiveIntervalMulti(tid, s_left, s_right,  \n");
+      source.append("                                     s_left_count, s_right_count,  \n");
+      source.append("                                     num_threads_active,  \n");
+      source.append("                                     &left, &right, &left_count, &right_count,  \n");
+      source.append("                                     &mid, &all_threads_converged);  \n");
+      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+              // stop if all eigenvalues have been found
+      source.append("             if (1 == all_threads_converged)  \n");
+      source.append("             {  \n");
+      source.append("                  \n");
+      source.append("                 break;  \n");
+      source.append("             }  \n");
+
+              // compute number of eigenvalues smaller than mid for active and not
+              // converged intervals, use all threads for loading data from gmem and
+              // s_left and s_right as scratch space to store the data load from gmem
+              // in shared memory
+      source.append("             mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n,  \n");
+      source.append("                                                         mid, tid, num_threads_active,  \n");
+      source.append("                                                         s_left, s_right,  \n");
+      source.append("                                                         (left == right));  \n");
+      source.append("                                                \n");
+      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+      source.append("             if (tid < num_threads_active)  \n");
+      source.append("             {  \n");
+      source.append("                   \n");
+                  // store intervals
+      source.append("                 if (left != right)  \n");
+      source.append("                 {  \n");
+
+      source.append("                     storeNonEmptyIntervals(tid, num_threads_active,  \n");
+      source.append("                                            s_left, s_right, s_left_count, s_right_count,  \n");
+      source.append("                                            left, mid, right,  \n");
+      source.append("                                            left_count, mid_count, right_count,  \n");
+      source.append("                                            precision, &compact_second_chunk,  \n");
+      source.append("                                            s_compaction_list_exc,  \n");
+      source.append("                                            &is_active_second);  \n");
+      source.append("                      \n");
+      source.append("                 }  \n");
+      source.append("                 else  \n");
+      source.append("                 {  \n");
+
+      source.append("                     storeIntervalConverged(s_left, s_right, s_left_count, s_right_count,  \n");
+      source.append("                                            &left, &mid, &right,  \n");
+      source.append("                                            &left_count, &mid_count, &right_count,  \n");
+      source.append("                                            s_compaction_list_exc, &compact_second_chunk,  \n");
+      source.append("                                            num_threads_active,  \n");
+      source.append("                                            &is_active_second);  \n");
+      source.append("                   \n");
+      source.append("                 }  \n");
+      source.append("             }  \n");
+
+      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+              // compact second chunk of intervals if any of the threads generated
+              // two child intervals
+      source.append("             if (1 == compact_second_chunk)  \n");
+      source.append("             {  \n");
+
+      source.append("                 createIndicesCompaction(s_compaction_list_exc, num_threads_compaction);  \n");
+      source.append("                 compactIntervals(s_left, s_right, s_left_count, s_right_count,  \n");
+      source.append("                                  mid, right, mid_count, right_count,  \n");
+      source.append("                                  s_compaction_list, num_threads_active,  \n");
+      source.append("                                  is_active_second);  \n");
+      source.append("             }  \n");
+
+      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+              // update state variables
+      source.append("             if (0 == tid)  \n");
+      source.append("             {  \n");
+      source.append("                 num_threads_active += s_compaction_list[num_threads_active];  \n");
+      source.append("                 num_threads_compaction = ceilPow2(num_threads_active);  \n");
+
+      source.append("                 compact_second_chunk = 0;  \n");
+      source.append("                 all_threads_converged = 1;  \n");
+      source.append("             }  \n");
+
+      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+              // clear
+      source.append("             s_compaction_list_exc[lcl_id] = 0;  \n");
+      source.append("             s_compaction_list_exc[lcl_id + lcl_sz] = 0;   \n");
+      source.append("               \n");
+      source.append("             if (num_threads_compaction > lcl_sz)              \n");
+      source.append("             {  \n");
+      source.append("               break;  \n");
+      source.append("             }  \n");
+
+
+      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
+
+      source.append("    } \n"); // end until all threads converged
+
+          // write data back to global memory
+      source.append("         if (tid < num_threads_active)  \n");
+      source.append("         {  \n");
+
+      source.append("             unsigned int addr = c_block_offset_output + tid;  \n");
+      source.append("               \n");
+      source.append("             g_lambda[addr]  = s_left[tid];  \n");
+      source.append("             g_pos[addr]   = s_right_count[tid];  \n");
+      source.append("         }  \n");
+      source.append("     }  \n");
+  }
+
+
+  ////////////////////////////////////////////////////////////////////////////////
+  // Determine eigenvalues for large matrices for intervals that after the first step contained one eigenvalue
+  //
+  // g_d            diagonal elements of symmetric, tridiagonal matrix
+  // g_s            superdiagonal elements of symmetric, tridiagonal matrix
+  // n              matrix size
+  // num_intervals  total number of intervals containing one eigenvalue after the first step
+  // g_left         left interval limits
+  // g_right        right interval limits
+  // g_pos          index of interval / number of intervals that are smaller than right interval limit
+  // precision      desired precision of eigenvalues
+  ////////////////////////////////////////////////////////////////////////////////
+
+  template <typename StringType>
+  void generate_bisect_kernel_bisectKernelLarge_OneIntervals(StringType & source, std::string const & numeric_string)
+  {
+      source.append("     __kernel  \n");
+      source.append("     void  \n");
+      source.append("     bisectKernelLarge_OneIntervals(__global "); source.append(numeric_string); source.append(" *g_d,   \n");
+      source.append("                                    __global "); source.append(numeric_string); source.append(" *g_s,    \n");
+      source.append("                                    const unsigned int n,  \n");
+      source.append("                                    unsigned int num_intervals,  \n");
+      source.append("                                    __global "); source.append(numeric_string); source.append(" *g_left,  \n");
+      source.append("                                    __global "); source.append(numeric_string); source.append(" *g_right,  \n");
+      source.append("                                    __global unsigned int *g_pos,  \n");
+      source.append("                                    "); source.append(numeric_string); source.append(" precision)  \n");
+      source.append("     {  \n");
+      source.append("         g_s = g_s + 1; \n");
+      source.append("         uint glb_id = get_global_id(0); \n");
+      source.append("         uint grp_id = get_group_id(0); \n");
+      source.append("         uint grp_nm = get_num_groups(0); \n");
+      source.append("         uint lcl_id = get_local_id(0); \n");
+      source.append("         uint lcl_sz = get_local_size(0); \n");
+      source.append("         const unsigned int gtid = (lcl_sz * grp_id) + lcl_id;  \n");
+      source.append("         __loca

<TRUNCATED>