You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ap...@apache.org on 2016/06/08 21:40:09 UTC
[12/51] [partial] mahout git commit: (nojira) add native-viennaCL
module to codebase. closes apache/mahout#241
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/bisect.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/bisect.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/bisect.hpp
new file mode 100644
index 0000000..64c12b0
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/bisect.hpp
@@ -0,0 +1,2645 @@
+#ifndef VIENNACL_LINALG_OPENCL_KERNELS_BISECT_HPP_
+#define VIENNACL_LINALG_OPENCL_KERNELS_BISECT_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/opencl/kernels/bisect.hpp
+ @brief OpenCL kernels for the bisection algorithm for eigenvalues
+
+ Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+ the creation of derivative works is allowed by including the following statement:
+ "This software contains source code provided by NVIDIA Corporation."
+*/
+
+
+
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/ocl/utils.hpp"
+
+#include "viennacl/linalg/opencl/common.hpp"
+
+// declaration, forward
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace kernels
+{
+ template <typename StringType>
+ void generate_bisect_kernel_config(StringType & source)
+ {
+ /* Global configuration parameter */
+ source.append(" #define VIENNACL_BISECT_MAX_THREADS_BLOCK 256\n");
+ source.append(" #define VIENNACL_BISECT_MAX_SMALL_MATRIX 256\n");
+ source.append(" #define VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX 256\n");
+ source.append(" #define VIENNACL_BISECT_MIN_ABS_INTERVAL 5.0e-37\n");
+
+ }
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Compute the next lower power of two of n
+ // n number for which next higher power of two is seeked
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template <typename StringType>
+ void generate_bisect_kernel_floorPow2(StringType & source, std::string const & numeric_string)
+ {
+ source.append(" \n");
+ source.append(" inline int \n");
+ source.append(" floorPow2(int n) \n");
+ source.append(" { \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint grp_id = get_group_id(0); \n");
+ source.append(" uint grp_nm = get_num_groups(0); \n");
+ source.append(" uint lcl_id = get_local_id(0); \n");
+ source.append(" uint lcl_sz = get_local_size(0); \n");
+
+
+ // early out if already power of two
+ source.append(" if (0 == (n & (n-1))) \n");
+ source.append(" { \n");
+ source.append(" return n; \n");
+ source.append(" } \n");
+
+ source.append(" int exp; \n");
+ source.append(" frexp(( "); source.append(numeric_string); source.append(" )n, &exp); \n");
+ source.append(" return (1 << (exp - 1)); \n");
+ source.append(" } \n");
+
+ }
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Compute the next higher power of two of n
+ // n number for which next higher power of two is seeked
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template <typename StringType>
+ void generate_bisect_kernel_ceilPow2(StringType & source, std::string const & numeric_string)
+ {
+ source.append(" \n");
+ source.append(" inline int \n");
+ source.append(" ceilPow2(int n) \n");
+ source.append(" { \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint grp_id = get_group_id(0); \n");
+ source.append(" uint grp_nm = get_num_groups(0); \n");
+ source.append(" uint lcl_id = get_local_id(0); \n");
+ source.append(" uint lcl_sz = get_local_size(0); \n");
+
+
+ // early out if already power of two
+ source.append(" if (0 == (n & (n-1))) \n");
+ source.append(" { \n");
+ source.append(" return n; \n");
+ source.append(" } \n");
+
+ source.append(" int exp; \n");
+ source.append(" frexp(( "); source.append(numeric_string); source.append(" )n, &exp); \n");
+ source.append(" return (1 << exp); \n");
+ source.append(" } \n");
+ }
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Compute midpoint of interval [\a left, \a right] avoiding overflow if possible
+ //
+ // left left / lower limit of interval
+ // right right / upper limit of interval
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template <typename StringType>
+ void generate_bisect_kernel_computeMidpoint(StringType & source, std::string const & numeric_string)
+ {
+ source.append(" \n");
+ source.append(" inline "); source.append(numeric_string); source.append(" \n");
+ source.append(" computeMidpoint(const "); source.append(numeric_string); source.append(" left,\n");
+ source.append(" const "); source.append(numeric_string); source.append(" right) \n");
+ source.append(" { \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint grp_id = get_group_id(0); \n");
+ source.append(" uint grp_nm = get_num_groups(0); \n");
+ source.append(" uint lcl_id = get_local_id(0); \n");
+ source.append(" uint lcl_sz = get_local_size(0); \n");
+ source.append(" "); source.append(numeric_string); source.append(" mid; \n");
+
+ source.append(" if (sign(left) == sign(right)) \n");
+ source.append(" { \n");
+ source.append(" mid = left + (right - left) * 0.5f; \n");
+ source.append(" } \n");
+ source.append(" else \n");
+ source.append(" { \n");
+ source.append(" mid = (left + right) * 0.5f; \n");
+ source.append(" } \n");
+
+ source.append(" return mid; \n");
+ source.append(" } \n");
+
+ }
+
+
  ////////////////////////////////////////////////////////////////////////////////
  // Check if interval converged and store appropriately
  //
  // addr           address where to store the information of the interval
  // s_left         shared memory storage for left interval limits
  // s_right        shared memory storage for right interval limits
  // s_left_count   shared memory storage for number of eigenvalues less than left interval limits
  // s_right_count  shared memory storage for number of eigenvalues less than right interval limits
  // left           lower limit of interval
  // right          upper limit of interval
  // left_count     eigenvalues less than \a left
  // right_count    eigenvalues less than \a right
  // precision      desired precision for eigenvalues
  ////////////////////////////////////////////////////////////////////////////////

  /** @brief Emits the OpenCL helper 'storeInterval' (unsigned int count arrays): stores the interval
   *         limits and counts at slot 'addr'; an interval narrower than the precision threshold is
   *         collapsed to its midpoint, which marks it as converged (left == right).
   *
   * @param source          string-like sink receiving the generated OpenCL code
   * @param numeric_string  scalar type name ("float" or "double") of interval limits
   */
  template<typename StringType>
  void generate_bisect_kernel_storeInterval(StringType & source, std::string const & numeric_string)
  {
    source.append(" \n");
    source.append(" void \n");
    source.append(" storeInterval(unsigned int addr, \n");
    source.append(" __local "); source.append(numeric_string); source.append(" * s_left, \n");
    source.append(" __local "); source.append(numeric_string); source.append(" * s_right, \n");
    source.append(" __local unsigned int * s_left_count, \n");
    source.append(" __local unsigned int * s_right_count, \n");
    source.append(" "); source.append(numeric_string); source.append(" left, \n");
    source.append(" "); source.append(numeric_string); source.append(" right, \n");
    source.append(" unsigned int left_count, \n");
    source.append(" unsigned int right_count, \n");
    source.append(" "); source.append(numeric_string); source.append(" precision) \n");
    source.append(" { \n");
    source.append(" uint glb_id = get_global_id(0); \n");
    source.append(" uint grp_id = get_group_id(0); \n");
    source.append(" uint grp_nm = get_num_groups(0); \n");
    source.append(" uint lcl_id = get_local_id(0); \n");
    source.append(" uint lcl_sz = get_local_size(0); \n");

    source.append(" s_left_count[addr] = left_count; \n");
    source.append(" s_right_count[addr] = right_count; \n");

    // check if interval converged: width below both the absolute floor and the
    // relative tolerance max(|left|,|right|) * precision
    source.append(" "); source.append(numeric_string); source.append(" t0 = fabs(right - left); \n");
    source.append(" "); source.append(numeric_string); source.append(" t1 = max(fabs(left), fabs(right)) * precision; \n");

    source.append(" if (t0 <= max(( "); source.append(numeric_string); source.append(" )VIENNACL_BISECT_MIN_ABS_INTERVAL, t1)) \n");
    source.append(" { \n");
    // compute mid point
    source.append(" "); source.append(numeric_string); source.append(" lambda = computeMidpoint(left, right); \n");

    // mark as converged: both limits collapse onto the eigenvalue estimate
    source.append(" s_left[addr] = lambda; \n");
    source.append(" s_right[addr] = lambda; \n");
    source.append(" } \n");
    source.append(" else \n");
    source.append(" { \n");

    // store current limits unchanged; interval will be subdivided further
    source.append(" s_left[addr] = left; \n");
    source.append(" s_right[addr] = right; \n");
    source.append(" } \n");

    source.append(" } \n");

  }
+
  /** @brief Emits the OpenCL helper 'storeIntervalShort' — identical to 'storeInterval' except
   *         that the count arrays are 'unsigned short' (used by the small-/large-matrix kernels
   *         to halve local-memory usage).
   *
   * @param source          string-like sink receiving the generated OpenCL code
   * @param numeric_string  scalar type name ("float" or "double") of interval limits
   */
  template<typename StringType>
  void generate_bisect_kernel_storeIntervalShort(StringType & source, std::string const & numeric_string)
  {
    source.append(" \n");
    source.append(" void \n");
    source.append(" storeIntervalShort(unsigned int addr, \n");
    source.append(" __local "); source.append(numeric_string); source.append(" * s_left, \n");
    source.append(" __local "); source.append(numeric_string); source.append(" * s_right, \n");
    source.append(" __local unsigned short * s_left_count, \n");
    source.append(" __local unsigned short * s_right_count, \n");
    source.append(" "); source.append(numeric_string); source.append(" left, \n");
    source.append(" "); source.append(numeric_string); source.append(" right, \n");
    source.append(" unsigned int left_count, \n");
    source.append(" unsigned int right_count, \n");
    source.append(" "); source.append(numeric_string); source.append(" precision) \n");
    source.append(" { \n");
    source.append(" uint glb_id = get_global_id(0); \n");
    source.append(" uint grp_id = get_group_id(0); \n");
    source.append(" uint grp_nm = get_num_groups(0); \n");
    source.append(" uint lcl_id = get_local_id(0); \n");
    source.append(" uint lcl_sz = get_local_size(0); \n");

    source.append(" s_left_count[addr] = left_count; \n");
    source.append(" s_right_count[addr] = right_count; \n");

    // check if interval converged (same criterion as storeInterval)
    source.append(" "); source.append(numeric_string); source.append(" t0 = fabs(right - left); \n");
    source.append(" "); source.append(numeric_string); source.append(" t1 = max(fabs(left), fabs(right)) * precision; \n");

    source.append(" if (t0 <= max(( "); source.append(numeric_string); source.append(" )VIENNACL_BISECT_MIN_ABS_INTERVAL, t1)) \n");
    source.append(" { \n");
    // compute mid point
    source.append(" "); source.append(numeric_string); source.append(" lambda = computeMidpoint(left, right); \n");

    // mark as converged
    source.append(" s_left[addr] = lambda; \n");
    source.append(" s_right[addr] = lambda; \n");
    source.append(" } \n");
    source.append(" else \n");
    source.append(" { \n");

    // store current limits
    source.append(" s_left[addr] = left; \n");
    source.append(" s_right[addr] = right; \n");
    source.append(" } \n");

    source.append(" } \n");


  }
+
+
  ////////////////////////////////////////////////////////////////////////////////
  // Compute number of eigenvalues that are smaller than x given a symmetric,
  // real, and tridiagonal matrix
  //
  // g_d                   diagonal elements stored in global memory
  // g_s                   superdiagonal elements stored in global memory
  // n                     size of matrix
  // x                     value for which the number of eigenvalues that are smaller is sought
  // tid                   thread identifier (e.g. threadIdx.x or gtid)
  // num_intervals_active  number of active intervals / threads that currently process an interval
  // s_d                   scratch space to store diagonal entries of the tridiagonal matrix in shared memory
  // s_s                   scratch space to store superdiagonal entries of the tridiagonal matrix in shared memory
  // converged             flag if the current thread is already converged (that is count does not have to be computed)
  ////////////////////////////////////////////////////////////////////////////////

  /** @brief Emits the OpenCL helper 'computeNumSmallerEigenvals': Sturm-sequence count of
   *         eigenvalues below x for a matrix small enough that one work-group load covers it.
   *
   * @param source          string-like sink receiving the generated OpenCL code
   * @param numeric_string  scalar type name ("float" or "double") of the matrix entries
   */
  template <typename StringType>
  void generate_bisect_kernel_computeNumSmallerEigenvals(StringType & source, std::string const & numeric_string)
  {
    source.append(" \n");
    source.append(" inline unsigned int \n");
    source.append(" computeNumSmallerEigenvals(__global "); source.append(numeric_string); source.append(" *g_d, \n");
    source.append(" __global "); source.append(numeric_string); source.append(" *g_s, \n");
    source.append(" const unsigned int n, \n");
    source.append(" const "); source.append(numeric_string); source.append(" x, \n");
    source.append(" const unsigned int tid, \n");
    source.append(" const unsigned int num_intervals_active, \n");
    source.append(" __local "); source.append(numeric_string); source.append(" *s_d, \n");
    source.append(" __local "); source.append(numeric_string); source.append(" *s_s, \n");
    source.append(" unsigned int converged \n");
    source.append(" ) \n");
    source.append(" { \n");
    source.append(" uint glb_id = get_global_id(0); \n");
    source.append(" uint grp_id = get_group_id(0); \n");
    source.append(" uint grp_nm = get_num_groups(0); \n");
    source.append(" uint lcl_id = get_local_id(0); \n");
    source.append(" uint lcl_sz = get_local_size(0); \n");


    source.append(" "); source.append(numeric_string); source.append(" delta = 1.0f; \n");
    source.append(" unsigned int count = 0; \n");

    source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");

    // read data into shared memory
    // NOTE(review): for lcl_id == 0 the emitted code reads g_s[-1]; presumably the caller
    // passes a superdiagonal pointer with a leading padding element — confirm against callers.
    source.append(" if (lcl_id < n) \n");
    source.append(" { \n");
    source.append(" s_d[lcl_id] = *(g_d + lcl_id); \n");
    source.append(" s_s[lcl_id] = *(g_s + lcl_id - 1); \n");
    source.append(" } \n");

    source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");

    // perform loop only for active threads
    source.append(" if ((tid < num_intervals_active) && (0 == converged)) \n");
    source.append(" { \n");

    // perform (optimized) Gaussian elimination to determine the number
    // of eigenvalues that are smaller than x (Sturm sequence sign changes)
    source.append(" for (unsigned int k = 0; k < n; ++k) \n");
    source.append(" { \n");
    source.append(" delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta; \n");
    source.append(" count += (delta < 0) ? 1 : 0; \n");
    source.append(" } \n");

    source.append(" } \n"); // end if thread currently processing an interval

    source.append(" return count; \n");
    source.append(" } \n");

  }
+
+
  ////////////////////////////////////////////////////////////////////////////////
  // Compute number of eigenvalues that are smaller than x given a symmetric,
  // real, and tridiagonal matrix
  //
  // g_d                   diagonal elements stored in global memory
  // g_s                   superdiagonal elements stored in global memory
  // n                     size of matrix
  // x                     value for which the number of eigenvalues that are smaller is sought
  // tid                   thread identifier (e.g. threadIdx.x or gtid)
  // num_intervals_active  number of active intervals / threads that currently process an interval
  // s_d                   scratch space to store diagonal entries of the tridiagonal matrix in shared memory
  // s_s                   scratch space to store superdiagonal entries of the tridiagonal matrix in shared memory
  // converged             flag if the current thread is already converged (that is count does not have to be computed)
  ////////////////////////////////////////////////////////////////////////////////

  /** @brief Emits the OpenCL helper 'computeNumSmallerEigenvalsLarge': like
   *         'computeNumSmallerEigenvals', but streams the matrix through local memory in
   *         work-group-sized chunks so matrices larger than one local buffer can be handled.
   *
   * @param source          string-like sink receiving the generated OpenCL code
   * @param numeric_string  scalar type name ("float" or "double") of the matrix entries
   */
  template <typename StringType>
  void generate_bisect_kernel_computeNumSmallerEigenvalsLarge(StringType & source, std::string const & numeric_string)
  {
    source.append(" \n");
    source.append(" inline unsigned int \n");
    source.append(" computeNumSmallerEigenvalsLarge(__global "); source.append(numeric_string); source.append(" *g_d, \n");
    source.append(" __global "); source.append(numeric_string); source.append(" *g_s, \n");
    source.append(" const unsigned int n, \n");
    source.append(" const "); source.append(numeric_string); source.append(" x, \n");
    source.append(" const unsigned int tid, \n");
    source.append(" const unsigned int num_intervals_active, \n");
    source.append(" __local "); source.append(numeric_string); source.append(" *s_d, \n");
    source.append(" __local "); source.append(numeric_string); source.append(" *s_s, \n");
    source.append(" unsigned int converged \n");
    source.append(" ) \n");
    source.append(" { \n");
    source.append(" uint glb_id = get_global_id(0); \n");
    source.append(" uint grp_id = get_group_id(0); \n");
    source.append(" uint grp_nm = get_num_groups(0); \n");
    source.append(" uint lcl_id = get_local_id(0); \n");
    source.append(" uint lcl_sz = get_local_size(0); \n");

    source.append(" "); source.append(numeric_string); source.append(" delta = 1.0f; \n");
    source.append(" unsigned int count = 0; \n");

    source.append(" unsigned int rem = n; \n");

    // do until whole diagonal and superdiagonal has been loaded and processed
    source.append(" for (unsigned int i = 0; i < n; i += lcl_sz) \n");
    source.append(" { \n");

    source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");

    // read new chunk of data into shared memory
    // NOTE(review): for i + lcl_id == 0 this reads g_s[-1], as in the small-matrix variant —
    // presumably relies on a leading padding element; confirm against callers.
    source.append(" if ((i + lcl_id) < n) \n");
    source.append(" { \n");

    source.append(" s_d[lcl_id] = *(g_d + i + lcl_id); \n");
    source.append(" s_s[lcl_id] = *(g_s + i + lcl_id - 1); \n");
    source.append(" } \n");

    source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");


    source.append(" if (tid < num_intervals_active) \n");
    source.append(" { \n");

    // perform (optimized) Gaussian elimination to determine the number
    // of eigenvalues that are smaller than x; only min(rem, lcl_sz) entries
    // of the current chunk are valid
    source.append(" for (unsigned int k = 0; k < min(rem,lcl_sz); ++k) \n");
    source.append(" { \n");
    source.append(" delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta; \n");
    // delta = (abs( delta) < (1.0e-10)) ? -(1.0e-10) : delta;
    source.append(" count += (delta < 0) ? 1 : 0; \n");
    source.append(" } \n");

    source.append(" } \n"); // end if thread currently processing an interval

    source.append(" rem -= lcl_sz; \n");
    source.append(" } \n");

    source.append(" return count; \n");
    source.append(" } \n");


  }
+
  ////////////////////////////////////////////////////////////////////////////////
  // Store all non-empty intervals resulting from the subdivision of the interval
  // currently processed by the thread
  //
  // addr                   base address for storing intervals
  // num_threads_active     number of threads / intervals in current sweep
  // s_left                 shared memory storage for left interval limits
  // s_right                shared memory storage for right interval limits
  // s_left_count           shared memory storage for number of eigenvalues less than left interval limits
  // s_right_count          shared memory storage for number of eigenvalues less than right interval limits
  // left                   lower limit of interval
  // mid                    midpoint of interval
  // right                  upper limit of interval
  // left_count             eigenvalues less than \a left
  // mid_count              eigenvalues less than \a mid
  // right_count            eigenvalues less than \a right
  // precision              desired precision for eigenvalues
  // compact_second_chunk   shared mem flag if second chunk is used and ergo requires compaction
  // s_compaction_list_exc  helper array for stream compaction, s_compaction_list_exc[tid] = 1 when the thread generated two child intervals
  // is_active_second       marks whether the thread has a second non-empty child interval
  ////////////////////////////////////////////////////////////////////////////////

  /** @brief Emits the OpenCL helper 'storeNonEmptyIntervals' (unsigned int count arrays):
   *         after a bisection step, stores the left child interval in place and flags a
   *         second non-empty child for later stream compaction; empty children are dropped.
   *
   * @param source          string-like sink receiving the generated OpenCL code
   * @param numeric_string  scalar type name ("float" or "double") of interval limits
   */
  template<typename StringType>
  void generate_bisect_kernel_storeNonEmptyIntervals(StringType & source, std::string const & numeric_string)
  {
    source.append(" \n");
    source.append(" void \n");
    source.append(" storeNonEmptyIntervals(unsigned int addr, \n");
    source.append(" const unsigned int num_threads_active, \n");
    source.append(" __local "); source.append(numeric_string); source.append(" *s_left, \n");
    source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n");
    source.append(" __local unsigned int *s_left_count, \n");
    source.append(" __local unsigned int *s_right_count, \n");
    source.append(" "); source.append(numeric_string); source.append(" left, \n ");
    source.append(" "); source.append(numeric_string); source.append(" mid, \n");
    source.append(" "); source.append(numeric_string); source.append(" right,\n");
    source.append(" const unsigned int left_count, \n");
    source.append(" const unsigned int mid_count, \n");
    source.append(" const unsigned int right_count, \n");
    source.append(" "); source.append(numeric_string); source.append(" precision, \n");
    source.append(" __local unsigned int *compact_second_chunk, \n");
    source.append(" __local unsigned int *s_compaction_list_exc, \n");
    source.append(" unsigned int *is_active_second) \n");
    source.append(" { \n");
    source.append(" uint glb_id = get_global_id(0); \n");
    source.append(" uint grp_id = get_group_id(0); \n");
    source.append(" uint grp_nm = get_num_groups(0); \n");
    source.append(" uint lcl_id = get_local_id(0); \n");
    source.append(" uint lcl_sz = get_local_size(0); \n");

    // check if both child intervals are valid (each contains at least one eigenvalue)
    source.append(" \n");
    source.append(" if ((left_count != mid_count) && (mid_count != right_count)) \n");
    source.append(" { \n");

    // store the left interval
    source.append(" storeInterval(addr, s_left, s_right, s_left_count, s_right_count, \n");
    source.append(" left, mid, left_count, mid_count, precision); \n");

    // mark that a second interval has been generated, only stored after
    // stream compaction of second chunk
    source.append(" *is_active_second = 1; \n");
    source.append(" s_compaction_list_exc[lcl_id] = 1; \n");
    source.append(" *compact_second_chunk = 1; \n");
    source.append(" } \n");
    source.append(" else \n");
    source.append(" { \n");

    // only one non-empty child interval

    // mark that no second child
    source.append(" *is_active_second = 0; \n");
    source.append(" s_compaction_list_exc[lcl_id] = 0; \n");

    // store the one valid child interval
    source.append(" if (left_count != mid_count) \n");
    source.append(" { \n");
    source.append(" storeInterval(addr, s_left, s_right, s_left_count, s_right_count, \n");
    source.append(" left, mid, left_count, mid_count, precision); \n");
    source.append(" } \n");
    source.append(" else \n");
    source.append(" { \n");
    source.append(" storeInterval(addr, s_left, s_right, s_left_count, s_right_count, \n");
    source.append(" mid, right, mid_count, right_count, precision); \n");
    source.append(" } \n");

    source.append(" } \n");
    source.append(" } \n");

  }
+
+
  ////////////////////////////////////////////////////////////////////////////////
  //! Store all non-empty intervals resulting from the subdivision of the interval
  //! currently processed by the thread
  ////////////////////////////////////////////////////////////////////////////////

  /** @brief Emits the OpenCL helper 'storeNonEmptyIntervalsLarge' — the 'unsigned short'
   *         variant of 'storeNonEmptyIntervals' (delegates to storeIntervalShort and uses a
   *         short compaction list; the tolerance parameter is named 'epsilon' here).
   *
   * @param source          string-like sink receiving the generated OpenCL code
   * @param numeric_string  scalar type name ("float" or "double") of interval limits
   */
  template <typename StringType>
  void generate_bisect_kernel_storeNonEmptyIntervalsLarge(StringType & source, std::string const & numeric_string)
  {
    source.append(" \n");
    source.append(" void \n");
    source.append(" storeNonEmptyIntervalsLarge(unsigned int addr, \n");
    source.append(" const unsigned int num_threads_active, \n");
    source.append(" __local "); source.append(numeric_string); source.append(" *s_left, \n");
    source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n");
    source.append(" __local unsigned short *s_left_count, \n");
    source.append(" __local unsigned short *s_right_count, \n");
    source.append(" "); source.append(numeric_string); source.append(" left, \n ");
    source.append(" "); source.append(numeric_string); source.append(" mid, \n");
    source.append(" "); source.append(numeric_string); source.append(" right,\n");
    source.append(" const unsigned int left_count, \n");
    source.append(" const unsigned int mid_count, \n");
    source.append(" const unsigned int right_count, \n");
    source.append(" "); source.append(numeric_string); source.append(" epsilon, \n");
    source.append(" __local unsigned int *compact_second_chunk, \n");
    source.append(" __local unsigned short *s_compaction_list, \n");
    source.append(" unsigned int *is_active_second) \n");
    source.append(" { \n");
    source.append(" uint glb_id = get_global_id(0); \n");
    source.append(" uint grp_id = get_group_id(0); \n");
    source.append(" uint grp_nm = get_num_groups(0); \n");
    source.append(" uint lcl_id = get_local_id(0); \n");
    source.append(" uint lcl_sz = get_local_size(0); \n");

    // check if both child intervals are valid (each contains at least one eigenvalue)
    source.append(" if ((left_count != mid_count) && (mid_count != right_count)) \n");
    source.append(" { \n");

    source.append(" storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count, \n");
    source.append(" left, mid, left_count, mid_count, epsilon); \n");

    source.append(" *is_active_second = 1; \n");
    source.append(" s_compaction_list[lcl_id] = 1; \n");
    source.append(" *compact_second_chunk = 1; \n");
    source.append(" } \n");
    source.append(" else \n");
    source.append(" { \n");

    // only one non-empty child interval

    // mark that no second child
    source.append(" *is_active_second = 0; \n");
    source.append(" s_compaction_list[lcl_id] = 0; \n");

    // store the one valid child interval
    source.append(" if (left_count != mid_count) \n");
    source.append(" { \n");
    source.append(" storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count, \n");
    source.append(" left, mid, left_count, mid_count, epsilon); \n");
    source.append(" } \n");
    source.append(" else \n");
    source.append(" { \n");
    source.append(" storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count, \n");
    source.append(" mid, right, mid_count, right_count, epsilon); \n");
    source.append(" } \n");
    source.append(" } \n");
    source.append(" } \n");
  }
+
  ////////////////////////////////////////////////////////////////////////////////
  // Create indices for compaction, that is process \a s_compaction_list_exc
  // which is 1 for intervals that generated a second child and 0 otherwise
  // and create for each of the non-zero elements the index where the new
  // interval belongs to in a compact representation of all generated second children
  //
  // s_compaction_list_exc   list containing the flags which threads generated two children
  // num_threads_compaction  number of threads to employ for compaction
  ////////////////////////////////////////////////////////////////////////////////

  /** @brief Emits the OpenCL helper 'createIndicesCompaction' (unsigned int list): an in-place
   *         work-group prefix-sum over the 0/1 flags, producing each second child's target slot.
   *
   * @param source  string-like sink receiving the generated OpenCL code
   */
  template<typename StringType>
  void generate_bisect_kernel_createIndicesCompaction(StringType & source)
  {
    source.append(" \n");
    source.append(" void \n");
    source.append(" createIndicesCompaction(__local unsigned int *s_compaction_list_exc, \n");
    source.append(" unsigned int num_threads_compaction) \n");
    source.append(" { \n");
    source.append(" uint glb_id = get_global_id(0); \n");
    source.append(" uint grp_id = get_group_id(0); \n");
    source.append(" uint grp_nm = get_num_groups(0); \n");
    source.append(" uint lcl_id = get_local_id(0); \n");
    source.append(" uint lcl_sz = get_local_size(0); \n");


    source.append(" unsigned int offset = 1; \n");
    source.append(" const unsigned int tid = lcl_id; \n");
    // if(tid == 0)
    //   printf("num_threads_compaction = %u\n", num_threads_compaction);

    // higher levels of scan tree (up-sweep / reduce phase)
    // NOTE(review): the emitted loops compare a signed 'int d' against the unsigned
    // 'num_threads_compaction'; assumes num_threads_compaction fits in int — TODO confirm.
    source.append(" for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) \n");
    source.append(" { \n");

    source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");

    source.append(" if (tid < d) \n");
    source.append(" { \n");

    source.append(" unsigned int ai = offset*(2*tid+1)-1; \n");
    source.append(" unsigned int bi = offset*(2*tid+2)-1; \n");
    source.append(" \n");
    source.append(" s_compaction_list_exc[bi] = s_compaction_list_exc[bi] \n");
    source.append(" + s_compaction_list_exc[ai]; \n");
    source.append(" } \n");

    source.append(" offset <<= 1; \n");
    source.append(" } \n");

    // traverse down tree: first down to level 2 across (down-sweep phase)
    source.append(" for (int d = 2; d < num_threads_compaction; d <<= 1) \n");
    source.append(" { \n");

    source.append(" offset >>= 1; \n");
    source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");

    source.append(" if (tid < (d-1)) \n");
    source.append(" { \n");

    source.append(" unsigned int ai = offset*(tid+1) - 1; \n");
    source.append(" unsigned int bi = ai + (offset >> 1); \n");

    source.append(" s_compaction_list_exc[bi] = s_compaction_list_exc[bi] \n");
    source.append(" + s_compaction_list_exc[ai]; \n");
    source.append(" } \n");
    source.append(" } \n");

    source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");

    source.append(" } \n");
  }
+
+
  /** @brief Emits the OpenCL helper 'createIndicesCompactionShort' — identical to
   *         'createIndicesCompaction' except the compaction list is 'unsigned short'.
   *
   * @param source  string-like sink receiving the generated OpenCL code
   */
  template<typename StringType>
  void generate_bisect_kernel_createIndicesCompactionShort(StringType & source)
  {
    source.append(" \n");
    source.append(" void \n");
    source.append(" createIndicesCompactionShort(__local unsigned short *s_compaction_list_exc, \n");
    source.append(" unsigned int num_threads_compaction) \n");
    source.append(" { \n");
    source.append(" uint glb_id = get_global_id(0); \n");
    source.append(" uint grp_id = get_group_id(0); \n");
    source.append(" uint grp_nm = get_num_groups(0); \n");
    source.append(" uint lcl_id = get_local_id(0); \n");
    source.append(" uint lcl_sz = get_local_size(0); \n");


    source.append(" unsigned int offset = 1; \n");
    source.append(" const unsigned int tid = lcl_id; \n");

    // higher levels of scan tree (up-sweep / reduce phase)
    source.append(" for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) \n");
    source.append(" { \n");

    source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");

    source.append(" if (tid < d) \n");
    source.append(" { \n");

    source.append(" unsigned int ai = offset*(2*tid+1)-1; \n");
    source.append(" unsigned int bi = offset*(2*tid+2)-1; \n");
    source.append(" \n");
    source.append(" s_compaction_list_exc[bi] = s_compaction_list_exc[bi] \n");
    source.append(" + s_compaction_list_exc[ai]; \n");
    source.append(" } \n");

    source.append(" offset <<= 1; \n");
    source.append(" } \n");

    // traverse down tree: first down to level 2 across (down-sweep phase)
    source.append(" for (int d = 2; d < num_threads_compaction; d <<= 1) \n");
    source.append(" { \n");

    source.append(" offset >>= 1; \n");
    source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");

    source.append(" if (tid < (d-1)) \n");
    source.append(" { \n");

    source.append(" unsigned int ai = offset*(tid+1) - 1; \n");
    source.append(" unsigned int bi = ai + (offset >> 1); \n");

    source.append(" s_compaction_list_exc[bi] = s_compaction_list_exc[bi] \n");
    source.append(" + s_compaction_list_exc[ai]; \n");
    source.append(" } \n");
    source.append(" } \n");

    source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");

    source.append(" } \n");
  }
+
+ ///////////////////////////////////////////////////////////////////////////////
+ // Perform stream compaction for second child intervals
+ //
+ // s_left shared memory storage for left interval limits
+ // s_right shared memory storage for right interval limits
+ // s_left_count shared memory storage for number of eigenvalues less than left interval limits
+ // s_right_count shared memory storage for number of eigenvalues less than right interval limits
+ // mid midpoint of current interval (left of new interval)
+ // right upper limit of interval
+ // mid_count eigenvalues less than \a mid
+ // s_compaction_list list containing the indices where the data has to be stored
+ // num_threads_active number of active threads / intervals
+ // is_active_second mark if thread has a second non-empty child interval
+ ///////////////////////////////////////////////////////////////////////////////
+
+
/** Emits the OpenCL device function compactIntervals().
 *
 *  The generated function copies each thread's second (upper) child interval
 *  [mid, right] to its compacted slot in shared memory, as determined by the
 *  exclusive-scan result in s_compaction_list. Bookkeeping arrays use
 *  'unsigned int' counts.
 *
 *  @param source         string-like sink collecting the OpenCL program text
 *  @param numeric_string scalar type name spliced into the kernel ("float"/"double")
 */
template<typename StringType>
void generate_bisect_kernel_compactIntervals(StringType & source, std::string const & numeric_string)
{
  std::string const & nt = numeric_string;

  // function signature
  source.append(" \n");
  source.append(" void \n");
  source.append(" compactIntervals(__local " + nt + " *s_left, \n");
  source.append(" __local " + nt + " *s_right, \n");
  source.append(" __local unsigned int *s_left_count, \n"
                " __local unsigned int *s_right_count, \n");
  source.append(" " + nt + " mid, \n");
  source.append(" " + nt + " right, \n");
  source.append(" unsigned int mid_count, unsigned int right_count, \n"
                " __local unsigned int *s_compaction_list, \n"
                " unsigned int num_threads_active, \n"
                " unsigned int is_active_second) \n"
                " { \n");
  // work-item index shorthands emitted for every generated device function
  source.append(" uint glb_id = get_global_id(0); \n"
                " uint grp_id = get_group_id(0); \n"
                " uint grp_nm = get_num_groups(0); \n"
                " uint lcl_id = get_local_id(0); \n"
                " uint lcl_sz = get_local_size(0); \n");
  source.append(" const unsigned int tid = lcl_id; \n");
  // perform compaction / copy data for all threads whose second child is alive
  source.append(" if ((tid < num_threads_active) && (1 == is_active_second)) \n"
                " { \n"
                " unsigned int addr_w = num_threads_active + s_compaction_list[tid]; \n"
                " s_left[addr_w] = mid; \n"
                " s_right[addr_w] = right; \n"
                " s_left_count[addr_w] = mid_count; \n"
                " s_right_count[addr_w] = right_count; \n"
                " } \n"
                " } \n");
}
+
+
+
+
/** Emits the OpenCL device function compactIntervalsShort().
 *
 *  Identical in structure to compactIntervals(), but the interval-count and
 *  compaction-list arrays are 'unsigned short' (variant used to reduce the
 *  shared-memory footprint).
 *
 *  @param source         string-like sink collecting the OpenCL program text
 *  @param numeric_string scalar type name spliced into the kernel ("float"/"double")
 */
template<typename StringType>
void generate_bisect_kernel_compactIntervalsShort(StringType & source, std::string const & numeric_string)
{
  std::string const lines[] =
  {
    " \n",
    " void \n",
    " compactIntervalsShort(__local " + numeric_string + " *s_left, \n",
    " __local " + numeric_string + " *s_right, \n",
    " __local unsigned short *s_left_count, \n",
    " __local unsigned short *s_right_count, \n",
    " " + numeric_string + " mid, \n",
    " " + numeric_string + " right, \n",
    " unsigned int mid_count, unsigned int right_count, \n",
    " __local unsigned short *s_compaction_list, \n",
    " unsigned int num_threads_active, \n",
    " unsigned int is_active_second) \n",
    " { \n",
    // work-item index shorthands emitted for every generated device function
    " uint glb_id = get_global_id(0); \n",
    " uint grp_id = get_group_id(0); \n",
    " uint grp_nm = get_num_groups(0); \n",
    " uint lcl_id = get_local_id(0); \n",
    " uint lcl_sz = get_local_size(0); \n",
    " const unsigned int tid = lcl_id; \n",
    // copy data for all threads whose second child interval is non-empty
    " if ((tid < num_threads_active) && (1 == is_active_second)) \n",
    " { \n",
    " unsigned int addr_w = num_threads_active + s_compaction_list[tid]; \n",
    " s_left[addr_w] = mid; \n",
    " s_right[addr_w] = right; \n",
    " s_left_count[addr_w] = mid_count; \n",
    " s_right_count[addr_w] = right_count; \n",
    " } \n",
    " } \n"
  };

  for (unsigned int i = 0; i < sizeof(lines) / sizeof(lines[0]); ++i)
    source.append(lines[i]);
}
+
+
+
/** Emits the OpenCL device function storeIntervalConverged().
 *
 *  The generated function handles an interval that has converged
 *  (left == right in the caller):
 *  - multiplicity 1: the interval is simply re-stored and no second child
 *    is produced;
 *  - multiplicity > 1: the interval is duplicated so the enclosed
 *    eigenvalues can be unfolded; the duplicate is routed through the
 *    subsequent stream-compaction step via s_compaction_list_exc and
 *    compact_second_chunk.
 *
 *  NOTE(review): the emitted function receives num_threads_active but the
 *  body shown here never references it — presumably kept for signature
 *  symmetry with the other store helpers; confirm against callers.
 *
 *  @param source         string-like sink collecting the OpenCL program text
 *  @param numeric_string scalar type name spliced into the kernel ("float"/"double")
 */
template<typename StringType>
void generate_bisect_kernel_storeIntervalConverged(StringType & source, std::string const & numeric_string)
{
  source.append(" \n");
  source.append(" void \n");
  source.append(" storeIntervalConverged( __local "); source.append(numeric_string); source.append(" *s_left, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n");
  source.append(" __local unsigned int *s_left_count, \n");
  source.append(" __local unsigned int *s_right_count, \n");
  source.append(" "); source.append(numeric_string); source.append(" *left, \n");
  source.append(" "); source.append(numeric_string); source.append(" *mid, \n");
  source.append(" "); source.append(numeric_string); source.append(" *right, \n");
  source.append(" unsigned int *left_count, \n");
  source.append(" unsigned int *mid_count, \n");
  source.append(" unsigned int *right_count, \n");
  source.append(" __local unsigned int *s_compaction_list_exc, \n");
  source.append(" __local unsigned int *compact_second_chunk, \n");
  source.append(" const unsigned int num_threads_active, \n");
  source.append(" unsigned int *is_active_second) \n");
  source.append(" { \n");
  // work-item index shorthands emitted for every generated device function
  source.append(" uint glb_id = get_global_id(0); \n");
  source.append(" uint grp_id = get_group_id(0); \n");
  source.append(" uint grp_nm = get_num_groups(0); \n");
  source.append(" uint lcl_id = get_local_id(0); \n");
  source.append(" uint lcl_sz = get_local_size(0); \n");

  source.append(" const unsigned int tid = lcl_id; \n");
  // multiplicity = number of eigenvalues enclosed by the converged interval
  source.append(" const unsigned int multiplicity = *right_count - *left_count; \n");
  // check multiplicity of eigenvalue
  source.append(" if (1 == multiplicity) \n");
  source.append(" { \n");

  // just re-store intervals, simple eigenvalue
  source.append(" s_left[tid] = *left; \n");
  source.append(" s_right[tid] = *right; \n");
  source.append(" s_left_count[tid] = *left_count; \n");
  source.append(" s_right_count[tid] = *right_count; \n");
  source.append(" \n");

  // mark that no second child / clear
  source.append(" *is_active_second = 0; \n");
  source.append(" s_compaction_list_exc[tid] = 0; \n");
  source.append(" } \n");
  source.append(" else \n");
  source.append(" { \n");

  // number of eigenvalues after the split less than mid
  source.append(" *mid_count = *left_count + (multiplicity >> 1); \n");

  // store left interval (counts split at mid_count)
  source.append(" s_left[tid] = *left; \n");
  source.append(" s_right[tid] = *right; \n");
  source.append(" s_left_count[tid] = *left_count; \n");
  source.append(" s_right_count[tid] = *mid_count; \n");
  // interval is converged, so left == right; mid coincides with both limits
  source.append(" *mid = *left; \n");

  // mark that second child interval exists (picked up by compaction)
  source.append(" *is_active_second = 1; \n");
  source.append(" s_compaction_list_exc[tid] = 1; \n");
  source.append(" *compact_second_chunk = 1; \n");
  source.append(" } \n");
  source.append(" } \n");
}
+
+
+
+
+
/** Emits the OpenCL device function storeIntervalConvergedShort().
 *
 *  Same logic as storeIntervalConverged(): a converged interval with
 *  multiplicity 1 is re-stored unchanged, while an interval enclosing
 *  several eigenvalues is duplicated and flagged for the second-chunk
 *  compaction step. The shared-memory count and compaction arrays are
 *  'unsigned short' in this variant (reduced shared-memory footprint).
 *
 *  NOTE(review): num_threads_active is emitted as a parameter but not
 *  referenced in the body shown here — presumably kept for signature
 *  symmetry; confirm against callers.
 *
 *  @param source         string-like sink collecting the OpenCL program text
 *  @param numeric_string scalar type name spliced into the kernel ("float"/"double")
 */
template<typename StringType>
void generate_bisect_kernel_storeIntervalConvergedShort(StringType & source, std::string const & numeric_string)
{
  source.append(" \n");
  source.append(" void \n");
  source.append(" storeIntervalConvergedShort(__local "); source.append(numeric_string); source.append(" *s_left, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n");
  source.append(" __local unsigned short *s_left_count, \n");
  source.append(" __local unsigned short *s_right_count, \n");
  source.append(" "); source.append(numeric_string); source.append(" *left, \n");
  source.append(" "); source.append(numeric_string); source.append(" *mid, \n");
  source.append(" "); source.append(numeric_string); source.append(" *right, \n");
  source.append(" unsigned int *left_count, \n");
  source.append(" unsigned int *mid_count, \n");
  source.append(" unsigned int *right_count, \n");
  source.append(" __local unsigned short *s_compaction_list_exc, \n");
  source.append(" __local unsigned int *compact_second_chunk, \n");
  source.append(" const unsigned int num_threads_active, \n");
  source.append(" unsigned int *is_active_second) \n");
  source.append(" { \n");
  // work-item index shorthands emitted for every generated device function
  source.append(" uint glb_id = get_global_id(0); \n");
  source.append(" uint grp_id = get_group_id(0); \n");
  source.append(" uint grp_nm = get_num_groups(0); \n");
  source.append(" uint lcl_id = get_local_id(0); \n");
  source.append(" uint lcl_sz = get_local_size(0); \n");

  source.append(" const unsigned int tid = lcl_id; \n");
  // multiplicity = number of eigenvalues enclosed by the converged interval
  source.append(" const unsigned int multiplicity = *right_count - *left_count; \n");
  // check multiplicity of eigenvalue
  source.append(" if (1 == multiplicity) \n");
  source.append(" { \n");

  // just re-store intervals, simple eigenvalue
  source.append(" s_left[tid] = *left; \n");
  source.append(" s_right[tid] = *right; \n");
  source.append(" s_left_count[tid] = *left_count; \n");
  source.append(" s_right_count[tid] = *right_count; \n");
  source.append(" \n");

  // mark that no second child / clear
  source.append(" *is_active_second = 0; \n");
  source.append(" s_compaction_list_exc[tid] = 0; \n");
  source.append(" } \n");
  source.append(" else \n");
  source.append(" { \n");

  // number of eigenvalues after the split less than mid
  source.append(" *mid_count = *left_count + (multiplicity >> 1); \n");

  // store left interval (counts split at mid_count)
  source.append(" s_left[tid] = *left; \n");
  source.append(" s_right[tid] = *right; \n");
  source.append(" s_left_count[tid] = *left_count; \n");
  source.append(" s_right_count[tid] = *mid_count; \n");
  // interval is converged, so left == right; mid coincides with both limits
  source.append(" *mid = *left; \n");

  // mark that second child interval exists (picked up by compaction)
  source.append(" *is_active_second = 1; \n");
  source.append(" s_compaction_list_exc[tid] = 1; \n");
  source.append(" *compact_second_chunk = 1; \n");
  source.append(" } \n");
  source.append(" } \n");
}
+
+ ///////////////////////////////////////////////////////////////////////////////
+ // Subdivide interval if active and not already converged
+ //
+ // tid id of thread
+ // s_left shared memory storage for left interval limits
+ // s_right shared memory storage for right interval limits
+ // s_left_count shared memory storage for number of eigenvalues less than left interval limits
+ // s_right_count shared memory storage for number of eigenvalues less than right interval limits
+ // num_threads_active number of active threads in warp
+ // left lower limit of interval
+ // right upper limit of interval
+ // left_count eigenvalues less than \a left
+ // right_count eigenvalues less than \a right
+ // all_threads_converged shared memory flag if all threads are converged
+ ///////////////////////////////////////////////////////////////////////////////
+
+
/** Emits the OpenCL device function subdivideActiveIntervalMulti().
 *
 *  The generated function loads each active thread's interval from shared
 *  memory, computes the bisection midpoint if the interval has not yet
 *  collapsed, and clears the shared all_threads_converged flag when further
 *  work remains — including converged intervals that still enclose more than
 *  one eigenvalue (those get duplicated later by storeIntervalConverged()).
 *
 *  @param source         string-like sink collecting the OpenCL program text
 *  @param numeric_string scalar type name spliced into the kernel ("float"/"double")
 */
template<typename StringType>
void generate_bisect_kernel_subdivideActiveInterval(StringType & source, std::string const & numeric_string)
{
  std::string const & nt = numeric_string;
  std::string text;

  // signature
  text += " \n";
  text += " void \n";
  text += " subdivideActiveIntervalMulti(const unsigned int tid, \n";
  text += " __local " + nt + " *s_left, \n";
  text += " __local " + nt + " *s_right, \n";
  text += " __local unsigned int *s_left_count, \n";
  text += " __local unsigned int *s_right_count, \n";
  text += " const unsigned int num_threads_active, \n";
  text += " " + nt + " *left, \n";
  text += " " + nt + " *right, \n";
  text += " unsigned int *left_count, unsigned int *right_count, \n";
  text += " " + nt + " *mid, \n";
  text += " __local unsigned int *all_threads_converged) \n";
  text += " { \n";
  // work-item index shorthands emitted for every generated device function
  text += " uint glb_id = get_global_id(0); \n";
  text += " uint grp_id = get_group_id(0); \n";
  text += " uint grp_nm = get_num_groups(0); \n";
  text += " uint lcl_id = get_local_id(0); \n";
  text += " uint lcl_sz = get_local_size(0); \n";

  // body: only active threads load and possibly subdivide their interval
  text += " if (tid < num_threads_active) \n";
  text += " { \n";
  text += " *left = s_left[tid]; \n";
  text += " *right = s_right[tid]; \n";
  text += " *left_count = s_left_count[tid]; \n";
  text += " *right_count = s_right_count[tid]; \n";
  // not yet converged: split at the midpoint
  text += " if (*left != *right) \n";
  text += " { \n";
  text += " *mid = computeMidpoint(*left, *right); \n";
  text += " *all_threads_converged = 0; \n";
  text += " } \n";
  // converged but multiple eigenvalues enclosed: still needs unfolding,
  // interval is duplicated in storeIntervalsConverged()
  text += " else if ((*right_count - *left_count) > 1) \n";
  text += " { \n";
  text += " *all_threads_converged = 0; \n";
  text += " } \n";
  text += " } \n";
  text += " } \n";

  source.append(text);
}
+
+
/** Emits the OpenCL device function subdivideActiveIntervalShort().
 *
 *  Variant of subdivideActiveIntervalMulti() with 'unsigned short' count
 *  arrays and without the multiple-eigenvalue branch: the generated function
 *  only clears all_threads_converged while the interval limits differ.
 *
 *  @param source         string-like sink collecting the OpenCL program text
 *  @param numeric_string scalar type name spliced into the kernel ("float"/"double")
 */
template<typename StringType>
void generate_bisect_kernel_subdivideActiveIntervalShort(StringType & source, std::string const & numeric_string)
{
  std::string const & nt = numeric_string;

  // signature
  source.append(" \n");
  source.append(" void \n");
  source.append(" subdivideActiveIntervalShort(const unsigned int tid, \n");
  source.append(" __local " + nt + " *s_left, \n");
  source.append(" __local " + nt + " *s_right, \n");
  source.append(" __local unsigned short *s_left_count, \n"
                " __local unsigned short *s_right_count, \n"
                " const unsigned int num_threads_active, \n");
  source.append(" " + nt + " *left, \n");
  source.append(" " + nt + " *right, \n");
  source.append(" unsigned int *left_count, unsigned int *right_count, \n");
  source.append(" " + nt + " *mid, \n");
  source.append(" __local unsigned int *all_threads_converged) \n"
                " { \n");
  // work-item index shorthands emitted for every generated device function
  source.append(" uint glb_id = get_global_id(0); \n"
                " uint grp_id = get_group_id(0); \n"
                " uint grp_nm = get_num_groups(0); \n"
                " uint lcl_id = get_local_id(0); \n"
                " uint lcl_sz = get_local_size(0); \n");
  // body: only active threads load and possibly subdivide their interval
  source.append(" if (tid < num_threads_active) \n"
                " { \n"
                " *left = s_left[tid]; \n"
                " *right = s_right[tid]; \n"
                " *left_count = s_left_count[tid]; \n"
                " *right_count = s_right_count[tid]; \n");
  // not yet converged: split at the midpoint and flag remaining work
  source.append(" if (*left != *right) \n"
                " { \n"
                " *mid = computeMidpoint(*left, *right); \n"
                " *all_threads_converged = 0; \n"
                " } \n"
                " } \n"
                " } \n");
}
+
+ // end of utilities
+ // start of kernels
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Bisection to find eigenvalues of a real, symmetric, and tridiagonal matrix
+ //
+ // g_d diagonal elements in global memory
+ // g_s superdiagonal elements in global memory (stored so that the element *(g_s - 1) can be accessed and equals 0)
+ // n size of matrix
+ // lg lower bound of input interval (e.g. Gerschgorin interval)
+ // ug upper bound of input interval (e.g. Gerschgorin interval)
+ // lg_eig_count number of eigenvalues that are smaller than \a lg
+ // ug_eig_count number of eigenvalues that are smaller than \a ug
+ // epsilon desired accuracy of eigenvalues to compute
+ ////////////////////////////////////////////////////////////////////////////////
+ ///
/** Emits the OpenCL kernel bisectKernelSmall().
 *
 *  The generated kernel computes all eigenvalues of a small symmetric
 *  tridiagonal matrix by interval bisection within a single work-group:
 *  starting from one Gerschgorin-type interval [lg, ug], intervals are
 *  repeatedly bisected, stored, and stream-compacted in shared memory until
 *  every interval has converged; converged limits are then written to
 *  g_left / g_left_count.
 *
 *  The emitted code calls helper device functions generated elsewhere in
 *  this file (subdivideActiveIntervalMulti, computeNumSmallerEigenvals,
 *  storeNonEmptyIntervals, storeIntervalConverged, createIndicesCompaction,
 *  compactIntervals, ceilPow2) and relies on the macro
 *  VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX being defined in the
 *  assembled program.
 *
 *  @param source         string-like sink collecting the OpenCL program text
 *  @param numeric_string scalar type name spliced into the kernel ("float"/"double")
 */
template <typename StringType>
void generate_bisect_kernel_bisectKernel(StringType & source, std::string const & numeric_string)
{
  source.append(" __kernel \n");
  source.append(" void \n");
  source.append(" bisectKernelSmall(__global "); source.append(numeric_string); source.append(" *g_d, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" *g_s, \n");
  source.append(" const unsigned int n, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" *g_left, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" *g_right, \n");
  source.append(" __global unsigned int *g_left_count, __global unsigned int *g_right_count, \n");
  source.append(" const "); source.append(numeric_string); source.append(" lg, \n");
  source.append(" const "); source.append(numeric_string); source.append(" ug, \n");
  source.append(" const unsigned int lg_eig_count, const unsigned int ug_eig_count, \n");
  source.append(" "); source.append(numeric_string); source.append(" epsilon \n");
  source.append(" ) \n");
  source.append(" { \n");
  // shift superdiagonal pointer so that g_s[-1] is addressable (see header comment)
  source.append(" g_s = g_s + 1; \n");
  // work-item index shorthands emitted for every generated kernel
  source.append(" uint glb_id = get_global_id(0); \n");
  source.append(" uint grp_id = get_group_id(0); \n");
  source.append(" uint grp_nm = get_num_groups(0); \n");
  source.append(" uint lcl_id = get_local_id(0); \n");
  source.append(" uint lcl_sz = get_local_size(0); \n");

  // intervals (store left and right because the subdivision tree is in general
  // not dense)
  source.append(" __local "); source.append(numeric_string); source.append(" s_left[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; \n");
  source.append(" __local "); source.append(numeric_string); source.append(" s_right[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; \n");

  // number of eigenvalues that are smaller than s_left / s_right
  // (correspondence is realized via indices)
  source.append(" __local unsigned int s_left_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; \n");
  source.append(" __local unsigned int s_right_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; \n");

  // helper for stream compaction (+1 slot for the exclusive-scan offset)
  source.append(" __local unsigned int \n");
  source.append(" s_compaction_list[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX + 1]; \n");

  // state variables for whole block
  // if 0 then compaction of second chunk of child intervals is not necessary
  // (because all intervals had exactly one non-dead child)
  source.append(" __local unsigned int compact_second_chunk; \n");
  source.append(" __local unsigned int all_threads_converged; \n");

  // number of currently active threads
  source.append(" __local unsigned int num_threads_active; \n");

  // number of threads to use for stream compaction
  source.append(" __local unsigned int num_threads_compaction; \n");

  // helper for exclusive scan (element 0 stays the scan identity 0)
  source.append(" __local unsigned int *s_compaction_list_exc = s_compaction_list + 1; \n");


  // variables for currently processed interval
  // left and right limit of active interval
  source.append(" "); source.append(numeric_string); source.append(" left = 0.0f; \n");
  source.append(" "); source.append(numeric_string); source.append(" right = 0.0f; \n");
  source.append(" unsigned int left_count = 0; \n");
  source.append(" unsigned int right_count = 0; \n");
  // midpoint of active interval
  source.append(" "); source.append(numeric_string); source.append(" mid = 0.0f; \n");
  // number of eigenvalues smaller than mid
  source.append(" unsigned int mid_count = 0; \n");
  // affected from compaction
  source.append(" unsigned int is_active_second = 0; \n");

  // zero-initialize the shared-memory bookkeeping for this thread's slot
  source.append(" s_compaction_list[lcl_id] = 0; \n");
  source.append(" s_left[lcl_id] = 0.0; \n");
  source.append(" s_right[lcl_id] = 0.0; \n");
  source.append(" s_left_count[lcl_id] = 0; \n");
  source.append(" s_right_count[lcl_id] = 0; \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");

  // set up initial configuration: one interval covering [lg, ug]
  source.append(" if (0 == lcl_id) \n");
  source.append(" { \n");
  source.append(" s_left[0] = lg; \n");
  source.append(" s_right[0] = ug; \n");
  source.append(" s_left_count[0] = lg_eig_count; \n");
  source.append(" s_right_count[0] = ug_eig_count; \n");

  source.append(" compact_second_chunk = 0; \n");
  source.append(" num_threads_active = 1; \n");

  source.append(" num_threads_compaction = 1; \n");
  source.append(" } \n");

  // for all active threads read intervals from the last level
  // the number of (worst case) active threads per level l is 2^l

  source.append(" while (true) \n");
  source.append(" { \n");

  source.append(" all_threads_converged = 1; \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");

  source.append(" is_active_second = 0; \n");
  // load interval, compute midpoint, clear convergence flag if work remains
  source.append(" subdivideActiveIntervalMulti(lcl_id, \n");
  source.append(" s_left, s_right, s_left_count, s_right_count, \n");
  source.append(" num_threads_active, \n");
  source.append(" &left, &right, &left_count, &right_count, \n");
  source.append(" &mid, &all_threads_converged); \n");
  // source.append(" output[lcl_id] = s_left; \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");

  // check if done
  source.append(" if (1 == all_threads_converged) \n");
  source.append(" { \n");
  source.append(" break; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");

  // compute number of eigenvalues smaller than mid
  // use all threads for reading the necessary matrix data from global
  // memory
  // use s_left and s_right as scratch space for diagonal and
  // superdiagonal of matrix
  source.append(" mid_count = computeNumSmallerEigenvals(g_d, g_s, n, mid, \n");
  source.append(" lcl_id, num_threads_active, \n");
  source.append(" s_left, s_right, \n");
  source.append(" (left == right)); \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");

  // store intervals
  // for all threads store the first child interval in a continuous chunk of
  // memory, and the second child interval -- if it exists -- in a second
  // chunk; it is likely that all threads reach convergence up to
  // \a epsilon at the same level; furthermore, for higher level most / all
  // threads will have only one child, storing the first child compactly will
  // (first) avoid to perform a compaction step on the first chunk, (second)
  // make it for higher levels (when all threads / intervals have
  // exactly one child) unnecessary to perform a compaction of the second
  // chunk
  source.append(" if (lcl_id < num_threads_active) \n");
  source.append(" { \n");

  source.append(" if (left != right) \n");
  source.append(" { \n");

  // store intervals (non-converged: both children may survive)
  source.append(" storeNonEmptyIntervals(lcl_id, num_threads_active, \n");
  source.append(" s_left, s_right, s_left_count, s_right_count, \n");
  source.append(" left, mid, right, \n");
  source.append(" left_count, mid_count, right_count, \n");
  source.append(" epsilon, &compact_second_chunk, \n");
  source.append(" s_compaction_list_exc, \n");
  source.append(" &is_active_second); \n");
  source.append(" } \n");
  source.append(" else \n");
  source.append(" { \n");

  // converged: re-store, duplicating if multiple eigenvalues are enclosed
  source.append(" storeIntervalConverged(s_left, s_right, s_left_count, s_right_count, \n");
  source.append(" &left, &mid, &right, \n");
  source.append(" &left_count, &mid_count, &right_count, \n");
  source.append(" s_compaction_list_exc, &compact_second_chunk, \n");
  source.append(" num_threads_active, \n");
  source.append(" &is_active_second); \n");
  source.append(" } \n");
  source.append(" } \n");

  // necessary so that compact_second_chunk is up-to-date
  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");

  // perform compaction of chunk where second children are stored
  // scan of (num_threads_active / 2) elements, thus at most
  // (num_threads_active / 4) threads are needed
  source.append(" if (compact_second_chunk > 0) \n");
  source.append(" { \n");

  source.append(" createIndicesCompaction(s_compaction_list_exc, num_threads_compaction); \n");

  source.append(" compactIntervals(s_left, s_right, s_left_count, s_right_count, \n");
  source.append(" mid, right, mid_count, right_count, \n");
  source.append(" s_compaction_list, num_threads_active, \n");
  source.append(" is_active_second); \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");

  source.append(" if (0 == lcl_id) \n");
  source.append(" { \n");

  // update number of active threads with result of reduction
  source.append(" num_threads_active += s_compaction_list[num_threads_active]; \n");

  // next compaction scan works on the next power of two >= active threads
  source.append(" num_threads_compaction = ceilPow2(num_threads_active); \n");

  source.append(" compact_second_chunk = 0; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");

  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");

  // write resulting intervals to global mem
  // for all threads write if they have been converged to an eigenvalue to
  // a separate array

  // at most n valid intervals
  source.append(" if (lcl_id < n) \n");
  source.append(" { \n");
  // intervals converged so left and right limit are identical
  source.append(" g_left[lcl_id] = s_left[lcl_id]; \n");
  // left count is sufficient to have global order
  source.append(" g_left_count[lcl_id] = s_left_count[lcl_id]; \n");
  source.append(" } \n");
  source.append(" } \n");
}
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Perform second step of bisection algorithm for large matrices for intervals that after the first step contained more than one eigenvalue
+ //
+ // g_d diagonal elements of symmetric, tridiagonal matrix
+ // g_s superdiagonal elements of symmetric, tridiagonal matrix
+ // n matrix size
+ // blocks_mult start addresses of blocks of intervals that are processed by one block of threads, each of the intervals contains more than one eigenvalue
+ // blocks_mult_sum total number of eigenvalues / singleton intervals in one block of intervals
+ // g_left left limits of intervals
+ // g_right right limits of intervals
+ // g_left_count number of eigenvalues less than left limits
+ // g_right_count number of eigenvalues less than right limits
+ // g_lambda final eigenvalue
+ // g_pos index of eigenvalue (in ascending order)
+ // precision desired precision of eigenvalues
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template <typename StringType>
+ void generate_bisect_kernel_bisectKernelLarge_MultIntervals(StringType & source, std::string const & numeric_string)
+ {
+ source.append(" __kernel \n");
+ source.append(" void \n");
+ source.append(" bisectKernelLarge_MultIntervals(__global "); source.append(numeric_string); source.append(" *g_d, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *g_s, \n");
+ source.append(" const unsigned int n, \n");
+ source.append(" __global unsigned int *blocks_mult, \n");
+ source.append(" __global unsigned int *blocks_mult_sum, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *g_left, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *g_right, \n");
+ source.append(" __global unsigned int *g_left_count, \n");
+ source.append(" __global unsigned int *g_right_count, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *g_lambda, \n");
+ source.append(" __global unsigned int *g_pos, \n");
+ source.append(" "); source.append(numeric_string); source.append(" precision \n");
+ source.append(" ) \n");
+ source.append(" { \n");
+ source.append(" g_s = g_s + 1; \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint grp_id = get_group_id(0); \n");
+ source.append(" uint grp_nm = get_num_groups(0); \n");
+ source.append(" uint lcl_id = get_local_id(0); \n");
+ source.append(" uint lcl_sz = get_local_size(0); \n");
+
+ source.append(" const unsigned int tid = lcl_id; \n");
+
+ // left and right limits of interval
+ source.append(" __local "); source.append(numeric_string); source.append(" s_left[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK]; \n");
+ source.append(" __local "); source.append(numeric_string); source.append(" s_right[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK]; \n");
+
+ // number of eigenvalues smaller than interval limits
+ source.append(" __local unsigned int s_left_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK]; \n");
+ source.append(" __local unsigned int s_right_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK]; \n");
+
+ // helper array for chunk compaction of second chunk
+ source.append(" __local unsigned int s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1]; \n");
+ // compaction list helper for exclusive scan
+ source.append(" __local unsigned int *s_compaction_list_exc = s_compaction_list + 1; \n");
+
+ // flag if all threads are converged
+ source.append(" __local unsigned int all_threads_converged; \n");
+ // number of active threads
+ source.append(" __local unsigned int num_threads_active; \n");
+ // number of threads to employ for compaction
+ source.append(" __local unsigned int num_threads_compaction; \n");
+ // flag if second chunk has to be compacted
+ source.append(" __local unsigned int compact_second_chunk; \n");
+
+ // parameters of block of intervals processed by this block of threads
+ source.append(" __local unsigned int c_block_start; \n");
+ source.append(" __local unsigned int c_block_end; \n");
+ source.append(" __local unsigned int c_block_offset_output; \n");
+
+ // midpoint of currently active interval of the thread
+ source.append(" "); source.append(numeric_string); source.append(" mid = 0.0f; \n");
+ // number of eigenvalues smaller than \a mid
+ source.append(" unsigned int mid_count = 0; \n");
+ // current interval parameter
+ source.append(" "); source.append(numeric_string); source.append(" left = 0.0f; \n");
+ source.append(" "); source.append(numeric_string); source.append(" right = 0.0f; \n");
+ source.append(" unsigned int left_count = 0; \n");
+ source.append(" unsigned int right_count = 0; \n");
+ // helper for compaction, keep track which threads have a second child
+ source.append(" unsigned int is_active_second = 0; \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
+
+ // initialize common start conditions
+ source.append(" if (0 == tid) \n");
+ source.append(" { \n");
+
+ source.append(" c_block_start = blocks_mult[grp_id]; \n");
+ source.append(" c_block_end = blocks_mult[grp_id + 1]; \n");
+ source.append(" c_block_offset_output = blocks_mult_sum[grp_id]; \n");
+ source.append(" \n");
+
+ source.append(" num_threads_active = c_block_end - c_block_start; \n");
+ source.append(" s_compaction_list[0] = 0; \n");
+ source.append(" num_threads_compaction = ceilPow2(num_threads_active); \n");
+
+ source.append(" all_threads_converged = 1; \n");
+ source.append(" compact_second_chunk = 0; \n");
+ source.append(" } \n");
+ source.append(" s_left_count [tid] = 42; \n");
+ source.append(" s_right_count[tid] = 42; \n");
+ source.append(" s_left_count [tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0; \n");
+ source.append(" s_right_count[tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0; \n");
+ source.append(" \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+ source.append(" \n");
+
+ // read data into shared memory
+ source.append(" if (tid < num_threads_active) \n");
+ source.append(" { \n");
+
+ source.append(" s_left[tid] = g_left[c_block_start + tid]; \n");
+ source.append(" s_right[tid] = g_right[c_block_start + tid]; \n");
+ source.append(" s_left_count[tid] = g_left_count[c_block_start + tid]; \n");
+ source.append(" s_right_count[tid] = g_right_count[c_block_start + tid]; \n");
+ source.append(" \n");
+ source.append(" } \n");
+ source.append(" \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+ source.append(" unsigned int iter = 0; \n");
+ // do until all threads converged
+ source.append(" while (true) \n");
+ source.append(" { \n");
+ source.append(" iter++; \n");
+ //for (int iter=0; iter < 0; iter++) {
+ source.append(" s_compaction_list[lcl_id] = 0; \n");
+ source.append(" s_compaction_list[lcl_id + lcl_sz] = 0; \n");
+ source.append(" s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0; \n");
+
+ // subdivide interval if currently active and not already converged
+ source.append(" subdivideActiveIntervalMulti(tid, s_left, s_right, \n");
+ source.append(" s_left_count, s_right_count, \n");
+ source.append(" num_threads_active, \n");
+ source.append(" &left, &right, &left_count, &right_count, \n");
+ source.append(" &mid, &all_threads_converged); \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+
+ // stop if all eigenvalues have been found
+ source.append(" if (1 == all_threads_converged) \n");
+ source.append(" { \n");
+ source.append(" \n");
+ source.append(" break; \n");
+ source.append(" } \n");
+
+ // compute number of eigenvalues smaller than mid for active and not
+ // converged intervals, use all threads for loading data from gmem and
+ // s_left and s_right as scratch space to store the data load from gmem
+ // in shared memory
+ source.append(" mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n, \n");
+ source.append(" mid, tid, num_threads_active, \n");
+ source.append(" s_left, s_right, \n");
+ source.append(" (left == right)); \n");
+ source.append(" \n");
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+
+ source.append(" if (tid < num_threads_active) \n");
+ source.append(" { \n");
+ source.append(" \n");
+ // store intervals
+ source.append(" if (left != right) \n");
+ source.append(" { \n");
+
+ source.append(" storeNonEmptyIntervals(tid, num_threads_active, \n");
+ source.append(" s_left, s_right, s_left_count, s_right_count, \n");
+ source.append(" left, mid, right, \n");
+ source.append(" left_count, mid_count, right_count, \n");
+ source.append(" precision, &compact_second_chunk, \n");
+ source.append(" s_compaction_list_exc, \n");
+ source.append(" &is_active_second); \n");
+ source.append(" \n");
+ source.append(" } \n");
+ source.append(" else \n");
+ source.append(" { \n");
+
+ source.append(" storeIntervalConverged(s_left, s_right, s_left_count, s_right_count, \n");
+ source.append(" &left, &mid, &right, \n");
+ source.append(" &left_count, &mid_count, &right_count, \n");
+ source.append(" s_compaction_list_exc, &compact_second_chunk, \n");
+ source.append(" num_threads_active, \n");
+ source.append(" &is_active_second); \n");
+ source.append(" \n");
+ source.append(" } \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+
+ // compact second chunk of intervals if any of the threads generated
+ // two child intervals
+ source.append(" if (1 == compact_second_chunk) \n");
+ source.append(" { \n");
+
+ source.append(" createIndicesCompaction(s_compaction_list_exc, num_threads_compaction); \n");
+ source.append(" compactIntervals(s_left, s_right, s_left_count, s_right_count, \n");
+ source.append(" mid, right, mid_count, right_count, \n");
+ source.append(" s_compaction_list, num_threads_active, \n");
+ source.append(" is_active_second); \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+
+ // update state variables
+ source.append(" if (0 == tid) \n");
+ source.append(" { \n");
+ source.append(" num_threads_active += s_compaction_list[num_threads_active]; \n");
+ source.append(" num_threads_compaction = ceilPow2(num_threads_active); \n");
+
+ source.append(" compact_second_chunk = 0; \n");
+ source.append(" all_threads_converged = 1; \n");
+ source.append(" } \n");
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+
+ // clear
+ source.append(" s_compaction_list_exc[lcl_id] = 0; \n");
+ source.append(" s_compaction_list_exc[lcl_id + lcl_sz] = 0; \n");
+ source.append(" \n");
+ source.append(" if (num_threads_compaction > lcl_sz) \n");
+ source.append(" { \n");
+ source.append(" break; \n");
+ source.append(" } \n");
+
+
+ source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");
+
+ source.append(" } \n"); // end until all threads converged
+
+ // write data back to global memory
+ source.append(" if (tid < num_threads_active) \n");
+ source.append(" { \n");
+
+ source.append(" unsigned int addr = c_block_offset_output + tid; \n");
+ source.append(" \n");
+ source.append(" g_lambda[addr] = s_left[tid]; \n");
+ source.append(" g_pos[addr] = s_right_count[tid]; \n");
+ source.append(" } \n");
+ source.append(" } \n");
+ }
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Determine eigenvalues for large matrices, restricted to those intervals that contained exactly one eigenvalue after the first bisection step
+ //
+ // g_d diagonal elements of symmetric, tridiagonal matrix
+ // g_s superdiagonal elements of symmetric, tridiagonal matrix
+ // n matrix size
+ // num_intervals total number of intervals containing one eigenvalue after the first step
+ // g_left left interval limits
+ // g_right right interval limits
+ // g_pos index of interval / number of intervals that are smaller than right interval limit
+ // precision desired precision of eigenvalues
+ ////////////////////////////////////////////////////////////////////////////////
+
+ template <typename StringType>
+ void generate_bisect_kernel_bisectKernelLarge_OneIntervals(StringType & source, std::string const & numeric_string)
+ {
+ source.append(" __kernel \n");
+ source.append(" void \n");
+ source.append(" bisectKernelLarge_OneIntervals(__global "); source.append(numeric_string); source.append(" *g_d, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *g_s, \n");
+ source.append(" const unsigned int n, \n");
+ source.append(" unsigned int num_intervals, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *g_left, \n");
+ source.append(" __global "); source.append(numeric_string); source.append(" *g_right, \n");
+ source.append(" __global unsigned int *g_pos, \n");
+ source.append(" "); source.append(numeric_string); source.append(" precision) \n");
+ source.append(" { \n");
+ source.append(" g_s = g_s + 1; \n");
+ source.append(" uint glb_id = get_global_id(0); \n");
+ source.append(" uint grp_id = get_group_id(0); \n");
+ source.append(" uint grp_nm = get_num_groups(0); \n");
+ source.append(" uint lcl_id = get_local_id(0); \n");
+ source.append(" uint lcl_sz = get_local_size(0); \n");
+ source.append(" const unsigned int gtid = (lcl_sz * grp_id) + lcl_id; \n");
+ source.append(" __loca
<TRUNCATED>