You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ap...@apache.org on 2016/06/08 21:40:11 UTC
[14/51] [partial] mahout git commit: (nojira) add native-viennaCL
module to codebase. closes apache/mahout#241
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/norm_inf.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/norm_inf.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/norm_inf.hpp
new file mode 100644
index 0000000..959bbd8
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/norm_inf.hpp
@@ -0,0 +1,108 @@
+#ifndef VIENNACL_LINALG_NORM_INF_HPP_
+#define VIENNACL_LINALG_NORM_INF_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file norm_inf.hpp
+ @brief Generic interface for the l^infty-norm. See viennacl/linalg/vector_operations.hpp for implementations.
+*/
+
+#include <cmath>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/tag_of.hpp"
+
+namespace viennacl
+{
+ //
+ // generic norm_inf function
+ // uses tag dispatch to identify which algorithm
+ // should be called
+ //
+ namespace linalg
+ {
+
+ #ifdef VIENNACL_WITH_UBLAS
+ // ----------------------------------------------------
+ // UBLAS
+ //
+ template< typename VectorT >
+ typename viennacl::enable_if< viennacl::is_ublas< typename viennacl::traits::tag_of< VectorT >::type >::value,
+ typename VectorT::value_type
+ >::type
+ norm_inf(VectorT const& v1)
+ {
+ return boost::numeric::ublas::norm_inf(v1);
+ }
+ #endif
+
+
+ // ----------------------------------------------------
+ // STL
+ //
+ template< typename T, typename A >
+ T norm_inf(std::vector<T, A> const & v1)
+ {
+ //std::cout << "stl .. " << std::endl;
+ T result = 0;
+ for (typename std::vector<T, A>::size_type i=0; i<v1.size(); ++i)
+ {
+ if (std::fabs(v1[i]) > result)
+ result = std::fabs(v1[i]);
+ }
+
+ return result;
+ }
+
+ // ----------------------------------------------------
+ // VIENNACL
+ //
+ template< typename ScalarType>
+ viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+ const viennacl::vector_base<ScalarType>,
+ viennacl::op_norm_inf >
+ norm_inf(viennacl::vector_base<ScalarType> const & v1)
+ {
+ //std::cout << "viennacl .. " << std::endl;
+ return viennacl::scalar_expression< const viennacl::vector_base<ScalarType>,
+ const viennacl::vector_base<ScalarType>,
+ viennacl::op_norm_inf >(v1, v1);
+ }
+
+ // with vector expression:
+ template<typename LHS, typename RHS, typename OP>
+ viennacl::scalar_expression<const viennacl::vector_expression<const LHS, const RHS, OP>,
+ const viennacl::vector_expression<const LHS, const RHS, OP>,
+ viennacl::op_norm_inf>
+ norm_inf(viennacl::vector_expression<const LHS, const RHS, OP> const & vector)
+ {
+ return viennacl::scalar_expression< const viennacl::vector_expression<const LHS, const RHS, OP>,
+ const viennacl::vector_expression<const LHS, const RHS, OP>,
+ viennacl::op_norm_inf >(vector, vector);
+ }
+
+
+ } // end namespace linalg
+} // end namespace viennacl
+#endif
+
+
+
+
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/amg_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/amg_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/amg_operations.hpp
new file mode 100644
index 0000000..7cdcf89
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/amg_operations.hpp
@@ -0,0 +1,458 @@
+#ifndef VIENNACL_LINALG_OPENCL_AMG_OPERATIONS_HPP
+#define VIENNACL_LINALG_OPENCL_AMG_OPERATIONS_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file opencl/amg_operations.hpp
+ @brief Implementations of routines for AMG in OpenCL.
+*/
+
+#include <cstdlib>
+#include <cmath>
+#include <map>
+
+#include "viennacl/linalg/detail/amg/amg_base.hpp"
+#include "viennacl/linalg/opencl/common.hpp"
+#include "viennacl/linalg/opencl/kernels/amg.hpp"
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace amg
+{
+
+
+///////////////////////////////////////////
+
+/** @brief Routine for taking all connections in the matrix as strong */
+template<typename NumericT>
+void amg_influence_trivial(compressed_matrix<NumericT> const & A,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ (void)tag;
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::amg<NumericT>::init(ctx);
+ viennacl::ocl::kernel & influence_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_influence_trivial");
+
+ viennacl::ocl::enqueue(influence_kernel(A.handle1().opencl_handle(), A.handle2().opencl_handle(),
+ cl_uint(A.size1()),
+ cl_uint(A.nnz()),
+ viennacl::traits::opencl_handle(amg_context.influence_jumper_),
+ viennacl::traits::opencl_handle(amg_context.influence_ids_),
+ viennacl::traits::opencl_handle(amg_context.influence_values_)
+ )
+ );
+}
+
+
+/** @brief Routine for extracting strongly connected points considering a user-provided threshold value */
+template<typename NumericT>
+void amg_influence_advanced(compressed_matrix<NumericT> const & A,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ (void)A; (void)amg_context; (void)tag;
+ throw std::runtime_error("amg_influence_advanced() not implemented for OpenCL yet");
+}
+
+
+/** @brief Dispatcher for influence processing */
+template<typename NumericT>
+void amg_influence(compressed_matrix<NumericT> const & A,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ // TODO: dispatch based on influence tolerance provided
+ amg_influence_trivial(A, amg_context, tag);
+}
+
+
+
+/** @brief Assign IDs to coarse points.
+*
+* TODO: Use exclusive_scan on GPU for this.
+*/
+inline void enumerate_coarse_points(viennacl::linalg::detail::amg::amg_level_context & amg_context)
+{
+ viennacl::backend::typesafe_host_array<unsigned int> point_types(amg_context.point_types_.handle(), amg_context.point_types_.size());
+ viennacl::backend::typesafe_host_array<unsigned int> coarse_ids(amg_context.coarse_id_.handle(), amg_context.coarse_id_.size());
+ viennacl::backend::memory_read(amg_context.point_types_.handle(), 0, point_types.raw_size(), point_types.get());
+ viennacl::backend::memory_read(amg_context.coarse_id_.handle(), 0, coarse_ids.raw_size(), coarse_ids.get());
+
+ unsigned int coarse_id = 0;
+ for (std::size_t i=0; i<amg_context.point_types_.size(); ++i)
+ {
+ coarse_ids.set(i, coarse_id);
+ if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
+ ++coarse_id;
+ }
+
+ amg_context.num_coarse_ = coarse_id;
+
+ viennacl::backend::memory_write(amg_context.coarse_id_.handle(), 0, coarse_ids.raw_size(), coarse_ids.get());
+}
+
+
+//////////////////////////////////////
+
+
+
+/** @brief AG (aggregation based) coarsening, single-threaded version of stage 1
+*
+* @param A Operator matrix on all levels
+* @param amg_context AMG hierarchy datastructures
+* @param tag AMG preconditioner tag
+*/
+template<typename NumericT>
+void amg_coarse_ag_stage1_mis2(compressed_matrix<NumericT> const & A,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ (void)tag;
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::amg<NumericT>::init(ctx);
+
+ viennacl::vector<unsigned int> random_weights(A.size1(), viennacl::context(viennacl::MAIN_MEMORY));
+ unsigned int *random_weights_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(random_weights.handle());
+ for (std::size_t i=0; i<random_weights.size(); ++i)
+ random_weights_ptr[i] = static_cast<unsigned int>(rand()) % static_cast<unsigned int>(A.size1());
+ random_weights.switch_memory_context(viennacl::traits::context(A));
+
+ // work vectors:
+ viennacl::vector<unsigned int> work_state(A.size1(), viennacl::traits::context(A));
+ viennacl::vector<unsigned int> work_random(A.size1(), viennacl::traits::context(A));
+ viennacl::vector<unsigned int> work_index(A.size1(), viennacl::traits::context(A));
+
+ viennacl::vector<unsigned int> work_state2(A.size1(), viennacl::traits::context(A));
+ viennacl::vector<unsigned int> work_random2(A.size1(), viennacl::traits::context(A));
+ viennacl::vector<unsigned int> work_index2(A.size1(), viennacl::traits::context(A));
+
+ unsigned int num_undecided = static_cast<unsigned int>(A.size1());
+ viennacl::vector<unsigned int> undecided_buffer(256, viennacl::traits::context(A));
+ viennacl::backend::typesafe_host_array<unsigned int> undecided_buffer_host(undecided_buffer.handle(), undecided_buffer.size());
+
+ viennacl::ocl::kernel & init_workdata_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_pmis2_init_workdata");
+ viennacl::ocl::kernel & max_neighborhood_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_pmis2_max_neighborhood");
+ viennacl::ocl::kernel & mark_mis_nodes_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_pmis2_mark_mis_nodes");
+ viennacl::ocl::kernel & reset_state_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_pmis2_reset_state");
+
+ unsigned int pmis_iters = 0;
+ while (num_undecided > 0)
+ {
+ ++pmis_iters;
+
+ //
+ // init temporary work data:
+ //
+ viennacl::ocl::enqueue(init_workdata_kernel(work_state, work_random, work_index,
+ amg_context.point_types_,
+ random_weights,
+ cl_uint(A.size1())
+ )
+ );
+
+ //
+ // Propagate maximum tuple twice
+ //
+ for (unsigned int r = 0; r < 2; ++r)
+ {
+ // max operation
+ viennacl::ocl::enqueue(max_neighborhood_kernel(work_state, work_random, work_index,
+ work_state2, work_random2, work_index2,
+ amg_context.influence_jumper_, amg_context.influence_ids_,
+ cl_uint(A.size1())
+ )
+ );
+
+ // copy work array (can be fused into a single kernel if needed. Previous kernel is in most cases sufficiently heavy)
+ work_state = work_state2;
+ work_random = work_random2;
+ work_index = work_index2;
+ }
+
+ //
+ // mark MIS and non-MIS nodes:
+ //
+ viennacl::ocl::enqueue(mark_mis_nodes_kernel(work_state, work_index,
+ amg_context.point_types_,
+ undecided_buffer,
+ cl_uint(A.size1())
+ )
+ );
+
+ // get number of undecided points on host:
+ viennacl::backend::memory_read(undecided_buffer.handle(), 0, undecided_buffer_host.raw_size(), undecided_buffer_host.get());
+ num_undecided = 0;
+ for (std::size_t i=0; i<undecided_buffer.size(); ++i)
+ num_undecided += undecided_buffer_host[i];
+
+ } //while
+
+ viennacl::ocl::enqueue(reset_state_kernel(amg_context.point_types_, cl_uint(amg_context.point_types_.size()) ) );
+}
+
+
+
+/** @brief AG (aggregation based) coarsening. Partially single-threaded version (VIENNACL_AMG_COARSE_AG)
+*
+* @param A Operator matrix
+* @param amg_context AMG hierarchy datastructures
+* @param tag AMG preconditioner tag
+*/
+template<typename NumericT>
+void amg_coarse_ag(compressed_matrix<NumericT> const & A,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::amg<NumericT>::init(ctx);
+
+ amg_influence_trivial(A, amg_context, tag);
+
+ //
+ // Stage 1: Build aggregates:
+ //
+ if (tag.get_coarsening_method() == viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION)
+ amg_coarse_ag_stage1_mis2(A, amg_context, tag);
+ else
+ throw std::runtime_error("Only MIS2 coarsening implemented. Selected coarsening not available with OpenCL backend!");
+
+ viennacl::linalg::opencl::amg::enumerate_coarse_points(amg_context);
+
+ //
+ // Stage 2: Propagate coarse aggregate indices to neighbors:
+ //
+ viennacl::ocl::kernel & propagate_coarse_indices = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_agg_propagate_coarse_indices");
+ viennacl::ocl::enqueue(propagate_coarse_indices(amg_context.point_types_,
+ amg_context.coarse_id_,
+ amg_context.influence_jumper_,
+ amg_context.influence_ids_,
+ cl_uint(A.size1())
+ )
+ );
+
+ //
+ // Stage 3: Merge remaining undecided points (merging to first aggregate found when cycling over the hierarchy
+ //
+ viennacl::ocl::kernel & merge_undecided = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_agg_merge_undecided");
+ viennacl::ocl::enqueue(merge_undecided(amg_context.point_types_,
+ amg_context.coarse_id_,
+ amg_context.influence_jumper_,
+ amg_context.influence_ids_,
+ cl_uint(A.size1())
+ )
+ );
+
+ //
+ // Stage 4: Set undecided points to fine points (coarse ID already set in Stage 3)
+ // Note: Stage 3 and Stage 4 were initially fused, but are now split in order to avoid race conditions (or a fallback to sequential execution).
+ //
+ viennacl::ocl::kernel & merge_undecided_2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_agg_merge_undecided_2");
+ viennacl::ocl::enqueue(merge_undecided_2(amg_context.point_types_, cl_uint(A.size1()) ) );
+
+}
+
+
+
+
+/** @brief Calls the right coarsening procedure
+*
+* @param A Operator matrix on all levels
+* @param amg_context AMG hierarchy datastructures
+* @param tag AMG preconditioner tag
+*/
+template<typename InternalT1>
+void amg_coarse(InternalT1 & A,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ switch (tag.get_coarsening_method())
+ {
+ case viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION: amg_coarse_ag(A, amg_context, tag); break;
+ default: throw std::runtime_error("not implemented yet");
+ }
+}
+
+
+
+
+////////////////////////////////////// Interpolation /////////////////////////////
+
+
+/** @brief AG (aggregation based) interpolation. Multi-Threaded! (VIENNACL_INTERPOL_SA)
+ *
+ * @param A Operator matrix
+ * @param P Prolongation matrix
+ * @param amg_context AMG hierarchy datastructures
+ * @param tag AMG configuration tag
+*/
+template<typename NumericT>
+void amg_interpol_ag(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & P,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::amg<NumericT>::init(ctx);
+
+ (void)tag;
+ P = compressed_matrix<NumericT>(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A));
+
+ // build matrix here
+ viennacl::ocl::kernel & interpolate_ag = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_interpol_ag");
+ viennacl::ocl::enqueue(interpolate_ag(P.handle1().opencl_handle(),
+ P.handle2().opencl_handle(),
+ P.handle().opencl_handle(),
+ amg_context.coarse_id_,
+ cl_uint(A.size1())
+ )
+ );
+
+ P.generate_row_block_information();
+}
+
+/** @brief Smoothed aggregation interpolation. (VIENNACL_INTERPOL_SA)
+ *
+ * @param A Operator matrix
+ * @param P Prolongation matrix
+ * @param amg_context AMG hierarchy datastructures
+ * @param tag AMG configuration tag
+*/
+template<typename NumericT>
+void amg_interpol_sa(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & P,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::amg<NumericT>::init(ctx);
+
+ (void)tag;
+ viennacl::compressed_matrix<NumericT> P_tentative(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A));
+
+ // form tentative operator:
+ amg_interpol_ag(A, P_tentative, amg_context, tag);
+
+ viennacl::compressed_matrix<NumericT> Jacobi(A.size1(), A.size1(), A.nnz(), viennacl::traits::context(A));
+
+ viennacl::ocl::kernel & interpol_sa = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_interpol_sa");
+ viennacl::ocl::enqueue(interpol_sa(A.handle1().opencl_handle(),
+ A.handle2().opencl_handle(),
+ A.handle().opencl_handle(),
+ cl_uint(A.size1()),
+ cl_uint(A.nnz()),
+ Jacobi.handle1().opencl_handle(),
+ Jacobi.handle2().opencl_handle(),
+ Jacobi.handle().opencl_handle(),
+ NumericT(tag.get_jacobi_weight())
+ )
+ );
+
+ P = viennacl::linalg::prod(Jacobi, P_tentative);
+
+ P.generate_row_block_information();
+}
+
+/** @brief Dispatcher for building the interpolation matrix
+ *
+ * @param A Operator matrix
+ * @param P Prolongation matrix
+ * @param amg_context AMG hierarchy datastructures
+ * @param tag AMG configuration tag
+*/
+template<typename MatrixT>
+void amg_interpol(MatrixT const & A,
+ MatrixT & P,
+ viennacl::linalg::detail::amg::amg_level_context & amg_context,
+ viennacl::linalg::amg_tag & tag)
+{
+ switch (tag.get_interpolation_method())
+ {
+ case viennacl::linalg::AMG_INTERPOLATION_METHOD_AGGREGATION: amg_interpol_ag (A, P, amg_context, tag); break;
+ case viennacl::linalg::AMG_INTERPOLATION_METHOD_SMOOTHED_AGGREGATION: amg_interpol_sa (A, P, amg_context, tag); break;
+ default: throw std::runtime_error("Not implemented yet!");
+ }
+}
+
+/** Assign sparse matrix A to dense matrix B */
+template<typename NumericT, unsigned int AlignmentV>
+void assign_to_dense(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
+ viennacl::matrix_base<NumericT> & B)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::init(ctx);
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::program_name(),
+ "assign_to_dense");
+
+ viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
+ viennacl::traits::opencl_handle(B),
+ cl_uint(viennacl::traits::start1(B)), cl_uint(viennacl::traits::start2(B)),
+ cl_uint(viennacl::traits::stride1(B)), cl_uint(viennacl::traits::stride2(B)),
+ cl_uint(viennacl::traits::size1(B)), cl_uint(viennacl::traits::size2(B)),
+ cl_uint(viennacl::traits::internal_size1(B)), cl_uint(viennacl::traits::internal_size2(B)) ));
+
+}
+
+/** @brief Jacobi Smoother (OpenCL version)
+*
+* @param iterations Number of smoother iterations
+* @param A Operator matrix for the smoothing
+* @param x The vector smoothing is applied to
+* @param x_backup (Different) Vector holding the same values as x
+* @param rhs_smooth The right hand side of the equation for the smoother
+* @param weight Damping factor. 0: No effect of smoother. 1: Undamped Jacobi iteration
+*/
+template<typename NumericT>
+void smooth_jacobi(unsigned int iterations,
+ compressed_matrix<NumericT> const & A,
+ vector<NumericT> & x,
+ vector<NumericT> & x_backup,
+ vector<NumericT> const & rhs_smooth,
+ NumericT weight)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(x).context());
+ viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::init(ctx);
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::program_name(), "jacobi");
+
+ for (unsigned int i=0; i<iterations; ++i)
+ {
+ x_backup = x;
+
+ viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
+ static_cast<NumericT>(weight),
+ viennacl::traits::opencl_handle(x_backup),
+ viennacl::traits::opencl_handle(x),
+ viennacl::traits::opencl_handle(rhs_smooth),
+ static_cast<cl_uint>(rhs_smooth.size())));
+
+ }
+}
+
+
+} //namespace amg
+} //namespace host_based
+} //namespace linalg
+} //namespace viennacl
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/bisect_kernel_calls.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/bisect_kernel_calls.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/bisect_kernel_calls.hpp
new file mode 100644
index 0000000..2fcd6fa
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/bisect_kernel_calls.hpp
@@ -0,0 +1,177 @@
+#ifndef VIENNACL_LINALG_OPENCL_BISECT_KERNEL_CALLS_HPP_
+#define VIENNACL_LINALG_OPENCL_BISECT_KERNEL_CALLS_HPP_
+
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+
+/** @file viennacl/linalg/opencl/bisect_kernel_calls.hpp
+ @brief OpenCL kernel calls for the bisection algorithm
+
+ Implementation based on the sample provided with the CUDA 6.0 SDK, for which
+ the creation of derivative works is allowed by including the following statement:
+ "This software contains source code provided by NVIDIA Corporation."
+*/
+
+// includes, project
+#include "viennacl/linalg/opencl/kernels/bisect.hpp"
+#include "viennacl/linalg/detail/bisect/structs.hpp"
+#include "viennacl/linalg/detail/bisect/config.hpp"
+#include "viennacl/linalg/detail/bisect/util.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+const std::string BISECT_KERNEL_SMALL = "bisectKernelSmall";
+const std::string BISECT_KERNEL_LARGE = "bisectKernelLarge";
+const std::string BISECT_KERNEL_LARGE_ONE_INTERVALS = "bisectKernelLarge_OneIntervals";
+const std::string BISECT_KERNEL_LARGE_MULT_INTERVALS = "bisectKernelLarge_MultIntervals";
+
+template<typename NumericT>
+void bisectSmall(const viennacl::linalg::detail::InputData<NumericT> &input,
+ viennacl::linalg::detail::ResultDataSmall<NumericT> &result,
+ const unsigned int mat_size,
+ const NumericT lg, const NumericT ug,
+ const NumericT precision)
+ {
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input.g_a).context());
+ viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::init(ctx);
+
+ viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::program_name(), BISECT_KERNEL_SMALL);
+ kernel.global_work_size(0, 1 * VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX);
+ kernel.local_work_size(0, VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX);
+
+ viennacl::ocl::enqueue(kernel(viennacl::traits::opencl_handle(input.g_a),
+ viennacl::traits::opencl_handle(input.g_b),
+ static_cast<cl_uint>(mat_size),
+ viennacl::traits::opencl_handle(result.vcl_g_left),
+ viennacl::traits::opencl_handle(result.vcl_g_right),
+ viennacl::traits::opencl_handle(result.vcl_g_left_count),
+ viennacl::traits::opencl_handle(result.vcl_g_right_count),
+ static_cast<NumericT>(lg),
+ static_cast<NumericT>(ug),
+ static_cast<cl_uint>(0),
+ static_cast<cl_uint>(mat_size),
+ static_cast<NumericT>(precision)
+ ));
+
+ }
+
+template<typename NumericT>
+void bisectLarge(const viennacl::linalg::detail::InputData<NumericT> &input,
+ viennacl::linalg::detail::ResultDataLarge<NumericT> &result,
+ const unsigned int mat_size,
+ const NumericT lg, const NumericT ug,
+ const NumericT precision)
+ {
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input.g_a).context());
+ viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::init(ctx);
+
+ viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::program_name(), BISECT_KERNEL_LARGE);
+ kernel.global_work_size(0, mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2); // Use only 128 threads for 256 < n <= 512, this
+ kernel.local_work_size(0, mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2); // is reasoned
+
+ viennacl::ocl::enqueue(kernel(viennacl::traits::opencl_handle(input.g_a),
+ viennacl::traits::opencl_handle(input.g_b),
+ static_cast<cl_uint>(mat_size),
+ static_cast<NumericT>(lg),
+ static_cast<NumericT>(ug),
+ static_cast<cl_uint>(0),
+ static_cast<cl_uint>(mat_size),
+ static_cast<NumericT>(precision),
+ viennacl::traits::opencl_handle(result.g_num_one),
+ viennacl::traits::opencl_handle(result.g_num_blocks_mult),
+ viennacl::traits::opencl_handle(result.g_left_one),
+ viennacl::traits::opencl_handle(result.g_right_one),
+ viennacl::traits::opencl_handle(result.g_pos_one),
+ viennacl::traits::opencl_handle(result.g_left_mult),
+ viennacl::traits::opencl_handle(result.g_right_mult),
+ viennacl::traits::opencl_handle(result.g_left_count_mult),
+ viennacl::traits::opencl_handle(result.g_right_count_mult),
+ viennacl::traits::opencl_handle(result.g_blocks_mult),
+ viennacl::traits::opencl_handle(result.g_blocks_mult_sum)
+ ));
+
+ }
+
+template<typename NumericT>
+void bisectLargeOneIntervals(const viennacl::linalg::detail::InputData<NumericT> &input,
+ viennacl::linalg::detail::ResultDataLarge<NumericT> &result,
+ const unsigned int mat_size,
+ const NumericT precision)
+ {
+ unsigned int num_one_intervals = result.g_num_one;
+ unsigned int num_blocks = viennacl::linalg::detail::getNumBlocksLinear(num_one_intervals,
+ mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK: VIENNACL_BISECT_MAX_THREADS_BLOCK / 2);
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input.g_a).context());
+ viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::init(ctx);
+
+ viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::program_name(), BISECT_KERNEL_LARGE_ONE_INTERVALS);
+ kernel.global_work_size(0, num_blocks * (mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2));
+ kernel.local_work_size(0, mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2);
+
+ viennacl::ocl::enqueue(kernel(viennacl::traits::opencl_handle(input.g_a),
+ viennacl::traits::opencl_handle(input.g_b),
+ static_cast<cl_uint>(mat_size),
+ static_cast<cl_uint>(num_one_intervals),
+ viennacl::traits::opencl_handle(result.g_left_one),
+ viennacl::traits::opencl_handle(result.g_right_one),
+ viennacl::traits::opencl_handle(result.g_pos_one),
+ static_cast<NumericT>(precision)
+ ));
+ }
+
+
+template<typename NumericT>
+void bisectLargeMultIntervals(const viennacl::linalg::detail::InputData<NumericT> &input,
+ viennacl::linalg::detail::ResultDataLarge<NumericT> &result,
+ const unsigned int mat_size,
+ const NumericT precision)
+ {
+ unsigned int num_blocks_mult = result.g_num_blocks_mult;
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input.g_a).context());
+ viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::init(ctx);
+
+ viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::program_name(), BISECT_KERNEL_LARGE_MULT_INTERVALS);
+ kernel.global_work_size(0, num_blocks_mult * (mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2));
+ kernel.local_work_size(0, mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2);
+
+ viennacl::ocl::enqueue(kernel(viennacl::traits::opencl_handle(input.g_a),
+ viennacl::traits::opencl_handle(input.g_b),
+ static_cast<cl_uint>(mat_size),
+ viennacl::traits::opencl_handle(result.g_blocks_mult),
+ viennacl::traits::opencl_handle(result.g_blocks_mult_sum),
+ viennacl::traits::opencl_handle(result.g_left_mult),
+ viennacl::traits::opencl_handle(result.g_right_mult),
+ viennacl::traits::opencl_handle(result.g_left_count_mult),
+ viennacl::traits::opencl_handle(result.g_right_count_mult),
+ viennacl::traits::opencl_handle(result.g_lambda_mult),
+ viennacl::traits::opencl_handle(result.g_pos_mult),
+ static_cast<NumericT>(precision)
+ ));
+ }
+} // namespace opencl
+} // namespace linalg
+} // namespace viennacl
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/common.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/common.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/common.hpp
new file mode 100644
index 0000000..d6a288b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/common.hpp
@@ -0,0 +1,102 @@
+#ifndef VIENNACL_LINALG_OPENCL_COMMON_HPP_
+#define VIENNACL_LINALG_OPENCL_COMMON_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/opencl/common.hpp
+ @brief Common implementations shared by OpenCL-based operations
+*/
+
+#include <cmath>
+
+#include "viennacl/forwards.h"
+#include "viennacl/ocl/platform.hpp"
+#include "viennacl/traits/handle.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace detail
+{
+
+
+
+inline cl_uint make_options(vcl_size_t length, bool reciprocal, bool flip_sign)
+{
+ return static_cast<cl_uint>( ((length > 1) ? (cl_uint(length) << 2) : 0) + (reciprocal ? 2 : 0) + (flip_sign ? 1 : 0) );
+}
+
+
+/** @brief Returns the OpenCL kernel string for the operation C = A * B with A sparse, B, C dense matrices. */
+inline std::string sparse_dense_matmult_kernel_name(bool B_transposed, bool B_row_major, bool C_row_major)
+{
+ if (B_transposed)
+ {
+ if (B_row_major && C_row_major)
+ return "trans_mat_mult_row_row";
+ if (B_row_major && !C_row_major)
+ return "trans_mat_mult_row_col";
+ if (!B_row_major && C_row_major)
+ return "trans_mat_mult_col_row";
+
+ return "trans_mat_mult_col_col";
+ }
+
+ if (B_row_major && C_row_major)
+ return "mat_mult_row_row";
+ if (B_row_major && !C_row_major)
+ return "mat_mult_row_col";
+ if (!B_row_major && C_row_major)
+ return "mat_mult_col_row";
+
+ return "mat_mult_col_col";
+}
+
+
+
+template<typename SomeT>
+ocl::device const & current_device(SomeT const & obj) { return traits::opencl_handle(obj).context().current_device(); }
+
+inline std::string op_to_string(op_abs) { return "abs"; }
+inline std::string op_to_string(op_acos) { return "acos"; }
+inline std::string op_to_string(op_asin) { return "asin"; }
+inline std::string op_to_string(op_atan) { return "atan"; }
+inline std::string op_to_string(op_ceil) { return "ceil"; }
+inline std::string op_to_string(op_cos) { return "cos"; }
+inline std::string op_to_string(op_cosh) { return "cosh"; }
+inline std::string op_to_string(op_exp) { return "exp"; }
+inline std::string op_to_string(op_fabs) { return "fabs"; }
+inline std::string op_to_string(op_floor) { return "floor"; }
+inline std::string op_to_string(op_log) { return "log"; }
+inline std::string op_to_string(op_log10) { return "log10"; }
+inline std::string op_to_string(op_sin) { return "sin"; }
+inline std::string op_to_string(op_sinh) { return "sinh"; }
+inline std::string op_to_string(op_sqrt) { return "sqrt"; }
+inline std::string op_to_string(op_tan) { return "tan"; }
+inline std::string op_to_string(op_tanh) { return "tanh"; }
+
+} //namespace detail
+} //namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/direct_solve.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/direct_solve.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/direct_solve.hpp
new file mode 100644
index 0000000..76874b1
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/direct_solve.hpp
@@ -0,0 +1,153 @@
+#ifndef VIENNACL_LINALG_OPENCL_DIRECT_SOLVE_HPP
+#define VIENNACL_LINALG_OPENCL_DIRECT_SOLVE_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/opencl/direct_solve.hpp
+ @brief Implementations of dense direct solvers are found here.
+*/
+
+#include "viennacl/vector.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/linalg/opencl/kernels/matrix_solve.hpp"
+#include "viennacl/linalg/opencl/kernels/matrix.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+
+namespace detail
+{
+ inline cl_uint get_option_for_solver_tag(viennacl::linalg::upper_tag) { return 0; }
+ inline cl_uint get_option_for_solver_tag(viennacl::linalg::unit_upper_tag) { return (1 << 0); }
+ inline cl_uint get_option_for_solver_tag(viennacl::linalg::lower_tag) { return (1 << 2); }
+ inline cl_uint get_option_for_solver_tag(viennacl::linalg::unit_lower_tag) { return (1 << 2) | (1 << 0); }
+
+ template<typename MatrixT1, typename MatrixT2, typename KernelT>
+ void inplace_solve_impl(MatrixT1 const & A, MatrixT2 & B, KernelT & k)
+ {
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(A),
+ cl_uint(viennacl::traits::start1(A)), cl_uint(viennacl::traits::start2(A)),
+ cl_uint(viennacl::traits::stride1(A)), cl_uint(viennacl::traits::stride2(A)),
+ cl_uint(viennacl::traits::size1(A)), cl_uint(viennacl::traits::size2(A)),
+ cl_uint(viennacl::traits::internal_size1(A)), cl_uint(viennacl::traits::internal_size2(A)),
+ viennacl::traits::opencl_handle(B),
+ cl_uint(viennacl::traits::start1(B)), cl_uint(viennacl::traits::start2(B)),
+ cl_uint(viennacl::traits::stride1(B)), cl_uint(viennacl::traits::stride2(B)),
+ cl_uint(viennacl::traits::size1(B)), cl_uint(viennacl::traits::size2(B)),
+ cl_uint(viennacl::traits::internal_size1(B)), cl_uint(viennacl::traits::internal_size2(B))
+ )
+ );
+ }
+}
+
+
+//
+// Note: By convention, all size checks are performed in the calling frontend. No need to double-check here.
+//
+
+////////////////// upper triangular solver (upper_tag) //////////////////////////////////////
+/** @brief Direct inplace solver for dense triangular systems. Matlab notation: A \ B
+*
+* @param A The system matrix
+* @param B The matrix of row vectors, where the solution is directly written to
+*/
+template<typename NumericT, typename SolverTagT>
+void inplace_solve(matrix_base<NumericT> const & A,
+ matrix_base<NumericT> & B,
+ SolverTagT)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+
+ std::string program_name;
+ if (A.row_major() && B.row_major())
+ {
+ typedef viennacl::linalg::opencl::kernels::matrix_solve<NumericT, row_major, row_major> KernelClass;
+ KernelClass::init(ctx);
+ program_name = KernelClass::program_name();
+ }
+ else if (A.row_major() && !B.row_major())
+ {
+ typedef viennacl::linalg::opencl::kernels::matrix_solve<NumericT, row_major, column_major> KernelClass;
+ KernelClass::init(ctx);
+ program_name = KernelClass::program_name();
+ }
+ else if (!A.row_major() && B.row_major())
+ {
+ typedef viennacl::linalg::opencl::kernels::matrix_solve<NumericT, column_major, row_major> KernelClass;
+ KernelClass::init(ctx);
+ program_name = KernelClass::program_name();
+ }
+ else
+ {
+ typedef viennacl::linalg::opencl::kernels::matrix_solve<NumericT, column_major, column_major> KernelClass;
+ KernelClass::init(ctx);
+ program_name = KernelClass::program_name();
+ }
+
+ std::stringstream ss;
+ ss << SolverTagT::name();
+ ss << "_solve";
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(program_name, ss.str());
+
+ k.global_work_size(0, B.size2() * k.local_work_size());
+ detail::inplace_solve_impl(A, B, k);
+}
+
+
+
+//
+// Solve on vector
+//
+
+template<typename NumericT, typename SOLVERTAG>
+void inplace_solve(matrix_base<NumericT> const & A,
+ vector_base<NumericT> & x,
+ SOLVERTAG)
+{
+ cl_uint options = detail::get_option_for_solver_tag(SOLVERTAG());
+
+ viennacl::ocl::kernel & k = detail::legacy_kernel_for_matrix(A, "triangular_substitute_inplace");
+
+ k.global_work_size(0, k.local_work_size());
+ viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(A),
+ cl_uint(viennacl::traits::start1(A)), cl_uint(viennacl::traits::start2(A)),
+ cl_uint(viennacl::traits::stride1(A)), cl_uint(viennacl::traits::stride2(A)),
+ cl_uint(viennacl::traits::size1(A)), cl_uint(viennacl::traits::size2(A)),
+ cl_uint(viennacl::traits::internal_size1(A)), cl_uint(viennacl::traits::internal_size2(A)),
+ viennacl::traits::opencl_handle(x),
+ cl_uint(viennacl::traits::start(x)),
+ cl_uint(viennacl::traits::stride(x)),
+ cl_uint(viennacl::traits::size(x)),
+ options
+ )
+ );
+}
+
+} //namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/fft_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/fft_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/fft_operations.hpp
new file mode 100644
index 0000000..a7b12b3
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/fft_operations.hpp
@@ -0,0 +1,350 @@
+#ifndef VIENNACL_LINALG_OPENCL_FFT_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_FFT_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+ ============================================================================= */
+
+/** @file viennacl/linalg/opencl/fft_operations.hpp
+ @brief Implementations of Fast Furier Transformation using OpenCL
+ */
+
+#include "viennacl/forwards.h"
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+#include "viennacl/linalg/host_based/fft_operations.hpp"
+#include "viennacl/linalg/opencl/kernels/fft.hpp"
+#include "viennacl/linalg/opencl/kernels/matrix.hpp"
+
+#include <viennacl/vector.hpp>
+#include <viennacl/matrix.hpp>
+
+#include <cmath>
+#include <stdexcept>
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace fft
+{
+
+ const vcl_size_t MAX_LOCAL_POINTS_NUM = 512;
+
+ /**
+ * @brief Get number of bits
+ */
+ inline vcl_size_t num_bits(vcl_size_t size)
+ {
+ vcl_size_t bits_datasize = 0;
+ vcl_size_t ds = 1;
+
+ while (ds < size)
+ {
+ ds = ds << 1;
+ bits_datasize++;
+ }
+
+ return bits_datasize;
+ }
+
+ /**
+ * @brief Find next power of two
+ */
+ inline vcl_size_t next_power_2(vcl_size_t n)
+ {
+ n = n - 1;
+
+ vcl_size_t power = 1;
+
+ while (power < sizeof(vcl_size_t) * 8)
+ {
+ n = n | (n >> power);
+ power *= 2;
+ }
+
+ return n + 1;
+ }
+
+} //namespce fft
+} //namespace detail
+
+namespace opencl
+{
+
+/**
+ * @brief Direct algorithm for computing Fourier transformation.
+ *
+ * Works on any sizes of data.
+ * Serial implementation has o(n^2) complexity
+ */
+template<typename NumericT>
+void direct(viennacl::ocl::handle<cl_mem> const & in,
+ viennacl::ocl::handle<cl_mem> const & out,
+ vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(in.context());
+ viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+ std::string program_string = viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, row_major>::program_name();
+ if (data_order == viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::COL_MAJOR)
+ {
+ viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, column_major>::init(ctx);
+ program_string =
+ viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, column_major>::program_name();
+ } else
+ viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, row_major>::init(ctx);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(program_string, "fft_direct");
+ viennacl::ocl::enqueue(k(in, out,
+ static_cast<cl_uint>(size),
+ static_cast<cl_uint>(stride),
+ static_cast<cl_uint>(batch_num),
+ sign)
+ );
+}
+
+/*
+ * This function performs reorder of input data. Indexes are sorted in bit-reversal order.
+ * Such reordering should be done before in-place FFT.
+ */
+template<typename NumericT>
+void reorder(viennacl::ocl::handle<cl_mem> const & in,
+ vcl_size_t size, vcl_size_t stride,
+ vcl_size_t bits_datasize, vcl_size_t batch_num,
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(in.context());
+ viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+ std::string program_string = viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, row_major>::program_name();
+ if (data_order == viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::COL_MAJOR)
+ {
+ viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, column_major>::init(ctx);
+ program_string = viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, column_major>::program_name();
+ } else
+ viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, row_major>::init(ctx);
+
+ viennacl::ocl::kernel& k = ctx.get_kernel(program_string, "fft_reorder");
+ viennacl::ocl::enqueue(k(in,
+ static_cast<cl_uint>(bits_datasize), static_cast<cl_uint>(size),
+ static_cast<cl_uint>(stride), static_cast<cl_uint>(batch_num))
+ );
+}
+
+/**
+ * @brief Radix-2 algorithm for computing Fourier transformation.
+ *
+ * Works only on power-of-two sizes of data.
+ * Serial implementation has o(n * lg n) complexity.
+ * This is a Cooley-Tukey algorithm
+ */
+template<typename NumericT>
+void radix2(viennacl::ocl::handle<cl_mem> const & in,
+ vcl_size_t size, vcl_size_t stride,
+ vcl_size_t batch_num, NumericT sign = NumericT(-1),
+ viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(in.context());
+ viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+ assert(batch_num != 0 && bool("batch_num must be larger than 0"));
+
+ std::string program_string = viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, row_major>::program_name();
+ if (data_order == viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::COL_MAJOR)
+ {
+ viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, column_major>::init(ctx);
+ program_string = viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, column_major>::program_name();
+ } else
+ viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, row_major>::init(ctx);
+
+ vcl_size_t bits_datasize = viennacl::linalg::detail::fft::num_bits(size);
+ if (size <= viennacl::linalg::detail::fft::MAX_LOCAL_POINTS_NUM)
+ {
+ viennacl::ocl::kernel & k = ctx.get_kernel(program_string, "fft_radix2_local");
+ viennacl::ocl::enqueue(k(in,
+ viennacl::ocl::local_mem((size * 4) * sizeof(NumericT)),
+ static_cast<cl_uint>(bits_datasize), static_cast<cl_uint>(size),
+ static_cast<cl_uint>(stride), static_cast<cl_uint>(batch_num), sign));
+
+ }
+ else
+ {
+ viennacl::linalg::opencl::reorder<NumericT>(in, size, stride, bits_datasize, batch_num);
+
+ for (vcl_size_t step = 0; step < bits_datasize; step++)
+ {
+ viennacl::ocl::kernel & k = ctx.get_kernel(program_string, "fft_radix2");
+ viennacl::ocl::enqueue(k(in,
+ static_cast<cl_uint>(step), static_cast<cl_uint>(bits_datasize),
+ static_cast<cl_uint>(size), static_cast<cl_uint>(stride),
+ static_cast<cl_uint>(batch_num), sign));
+ }
+ }
+}
+
+/**
+ * @brief Bluestein's algorithm for computing Fourier transformation.
+ *
+ * Currently, Works only for sizes of input data which less than 2^16.
+ * Uses a lot of additional memory, but should be fast for any size of data.
+ * Serial implementation has something about o(n * lg n) complexity
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void bluestein(viennacl::vector<NumericT, AlignmentV>& in,
+ viennacl::vector<NumericT, AlignmentV>& out, vcl_size_t /*batch_num*/)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(in).context());
+ viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+ vcl_size_t size = in.size() >> 1;
+ vcl_size_t ext_size = viennacl::linalg::detail::fft::next_power_2(2 * size - 1);
+
+ viennacl::vector<NumericT, AlignmentV> A(ext_size << 1);
+ viennacl::vector<NumericT, AlignmentV> B(ext_size << 1);
+ viennacl::vector<NumericT, AlignmentV> Z(ext_size << 1);
+
+ {
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "zero2");
+ viennacl::ocl::enqueue(k(A, B, static_cast<cl_uint>(ext_size)));
+ }
+ {
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "bluestein_pre");
+ viennacl::ocl::enqueue(k(in, A, B, static_cast<cl_uint>(size), static_cast<cl_uint>(ext_size)));
+ }
+
+ viennacl::linalg::convolve_i(A, B, Z);
+
+ {
+ viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "bluestein_post");
+ viennacl::ocl::enqueue(k(Z, out, static_cast<cl_uint>(size)));
+ }
+}
+
+/**
+ * @brief Mutiply two complex vectors and store result in output
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void multiply_complex(viennacl::vector<NumericT, AlignmentV> const & input1,
+ viennacl::vector<NumericT, AlignmentV> const & input2,
+ viennacl::vector<NumericT, AlignmentV> & output)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input1).context());
+ viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+ vcl_size_t size = input1.size() >> 1;
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "fft_mult_vec");
+ viennacl::ocl::enqueue(k(input1, input2, output, static_cast<cl_uint>(size)));
+}
+
+/**
+ * @brief Normalize vector on with his own size
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void normalize(viennacl::vector<NumericT, AlignmentV> & input)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input).context());
+ viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "fft_div_vec_scalar");
+
+ vcl_size_t size = input.size() >> 1;
+ NumericT norm_factor = static_cast<NumericT>(size);
+ viennacl::ocl::enqueue(k(input, static_cast<cl_uint>(size), norm_factor));
+}
+
+/**
+ * @brief Inplace_transpose matrix
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & input)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input).context());
+ viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+ viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "transpose_inplace");
+ viennacl::ocl::enqueue(k(input, static_cast<cl_uint>(input.internal_size1() >> 1),
+ static_cast<cl_uint>(input.internal_size2()) >> 1));
+}
+
+/**
+ * @brief Transpose matrix
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & input,
+ viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & output)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input).context());
+ viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+ viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "transpose");
+ viennacl::ocl::enqueue(k(input, output, static_cast<cl_uint>(input.internal_size1() >> 1),
+ static_cast<cl_uint>(input.internal_size2() >> 1)));
+}
+
+/**
+ * @brief Create complex vector from real vector (even elements(2*k) = real part, odd elements(2*k+1) = imaginary part)
+ */
+template<typename NumericT>
+void real_to_complex(viennacl::vector_base<NumericT> const & in,
+ viennacl::vector_base<NumericT> & out, vcl_size_t size)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(in).context());
+ viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "real_to_complex");
+ viennacl::ocl::enqueue(k(in, out, static_cast<cl_uint>(size)));
+}
+
+/**
+ * @brief Create real vector from complex vector (even elements(2*k) = real part, odd elements(2*k+1) = imaginary part)
+ */
+template<typename NumericT>
+void complex_to_real(viennacl::vector_base<NumericT> const & in,
+ viennacl::vector_base<NumericT> & out, vcl_size_t size)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(in).context());
+ viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+ viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "complex_to_real");
+ viennacl::ocl::enqueue(k(in, out, static_cast<cl_uint>(size)));
+}
+
+/**
+ * @brief Reverse vector to oposite order and save it in input vector
+ */
+template<typename NumericT>
+void reverse(viennacl::vector_base<NumericT>& in)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(in).context());
+ viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx);
+
+ vcl_size_t size = in.size();
+
+ viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "reverse_inplace");
+ viennacl::ocl::enqueue(k(in, static_cast<cl_uint>(size)));
+}
+
+} //namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+#endif /* FFT_OPERATIONS_HPP_ */
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/ilu_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/ilu_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/ilu_operations.hpp
new file mode 100644
index 0000000..248a88a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/ilu_operations.hpp
@@ -0,0 +1,260 @@
+#ifndef VIENNACL_LINALG_OPENCL_ILU_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_ILU_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/opencl/ilu_operations.hpp
+ @brief Implementations of specialized routines for the Chow-Patel parallel ILU preconditioner using OpenCL
+*/
+
+#include <cmath>
+#include <algorithm> //for std::max and std::min
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/opencl/common.hpp"
+#include "viennacl/linalg/opencl/kernels/ilu.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/stride.hpp"
+#include "viennacl/linalg/vector_operations.hpp"
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+
+/////////////////////// ICC /////////////////////
+
+template<typename NumericT>
+void extract_L(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & L)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx);
+
+ //
+ // Step 1: Count elements in L:
+ //
+ viennacl::ocl::kernel & k1 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "extract_L_1");
+
+ viennacl::ocl::enqueue(k1(A.handle1().opencl_handle(), A.handle2().opencl_handle(), cl_uint(A.size1()),
+ L.handle1().opencl_handle())
+ );
+
+ //
+ // Step 2: Exclusive scan on row_buffers:
+ //
+ viennacl::vector_base<unsigned int> wrapped_L_row_buffer(L.handle1(), A.size1() + 1, 0, 1);
+ viennacl::linalg::exclusive_scan(wrapped_L_row_buffer, wrapped_L_row_buffer);
+ L.reserve(wrapped_L_row_buffer[L.size1()], false);
+
+
+ //
+ // Step 3: Write entries
+ //
+ viennacl::ocl::kernel & k2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "extract_L_2");
+
+ viennacl::ocl::enqueue(k2(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.size1()),
+ L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle())
+ );
+
+ L.generate_row_block_information();
+
+} // extract_LU
+
+///////////////////////////////////////////////
+
+
+
+/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L and U accordingly. */
+template<typename NumericT>
+void icc_scale(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & L)
+{
+ viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A));
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx);
+
+ // fill D:
+ viennacl::ocl::kernel & k1 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "ilu_scale_kernel_1");
+ viennacl::ocl::enqueue(k1(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.size1()), D) );
+
+ // scale L:
+ viennacl::ocl::kernel & k2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "ilu_scale_kernel_2");
+ viennacl::ocl::enqueue(k2(L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(), cl_uint(A.size1()), D) );
+
+}
+
+/////////////////////////////////////
+
+
+/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ILU using OpenCL (cf. Algorithm 2 in paper) */
+template<typename NumericT>
+void icc_chow_patel_sweep(compressed_matrix<NumericT> & L,
+ vector<NumericT> const & aij_L)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(L).context());
+ viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx);
+
+ viennacl::backend::mem_handle L_backup;
+ viennacl::backend::memory_create(L_backup, L.handle().raw_size(), viennacl::traits::context(L));
+ viennacl::backend::memory_copy(L.handle(), L_backup, 0, 0, L.handle().raw_size());
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "icc_chow_patel_sweep_kernel");
+ viennacl::ocl::enqueue(k(L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(), L_backup.opencl_handle(), cl_uint(L.size1()),
+ aij_L)
+ );
+
+}
+
+
+/////////////////////// ILU /////////////////////
+
+template<typename NumericT>
+void extract_LU(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & L,
+ compressed_matrix<NumericT> & U)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx);
+
+ //
+ // Step 1: Count elements in L and U:
+ //
+ viennacl::ocl::kernel & k1 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "extract_LU_1");
+
+ viennacl::ocl::enqueue(k1(A.handle1().opencl_handle(), A.handle2().opencl_handle(), cl_uint(A.size1()),
+ L.handle1().opencl_handle(),
+ U.handle1().opencl_handle())
+ );
+
+ //
+ // Step 2: Exclusive scan on row_buffers:
+ //
+ viennacl::vector_base<unsigned int> wrapped_L_row_buffer(L.handle1(), A.size1() + 1, 0, 1);
+ viennacl::linalg::exclusive_scan(wrapped_L_row_buffer, wrapped_L_row_buffer);
+ L.reserve(wrapped_L_row_buffer[L.size1()], false);
+
+ viennacl::vector_base<unsigned int> wrapped_U_row_buffer(U.handle1(), A.size1() + 1, 0, 1);
+ viennacl::linalg::exclusive_scan(wrapped_U_row_buffer, wrapped_U_row_buffer);
+ U.reserve(wrapped_U_row_buffer[U.size1()], false);
+
+ //
+ // Step 3: Write entries
+ //
+ viennacl::ocl::kernel & k2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "extract_LU_2");
+
+ viennacl::ocl::enqueue(k2(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.size1()),
+ L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(),
+ U.handle1().opencl_handle(), U.handle2().opencl_handle(), U.handle().opencl_handle())
+ );
+
+ L.generate_row_block_information();
+ // Note: block information for U will be generated after transposition
+
+} // extract_LU
+
+///////////////////////////////////////////////
+
+
+
+/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L and U accordingly. */
+template<typename NumericT>
+void ilu_scale(compressed_matrix<NumericT> const & A,
+ compressed_matrix<NumericT> & L,
+ compressed_matrix<NumericT> & U)
+{
+ viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A));
+
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
+ viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx);
+
+ // fill D:
+ viennacl::ocl::kernel & k1 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "ilu_scale_kernel_1");
+ viennacl::ocl::enqueue(k1(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.size1()), D) );
+
+ // scale L:
+ viennacl::ocl::kernel & k2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "ilu_scale_kernel_2");
+ viennacl::ocl::enqueue(k2(L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(), cl_uint(A.size1()), D) );
+
+ // scale U:
+ viennacl::ocl::enqueue(k2(U.handle1().opencl_handle(), U.handle2().opencl_handle(), U.handle().opencl_handle(), cl_uint(A.size1()), D) );
+
+}
+
+/////////////////////////////////////
+
+
+/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ILU using OpenCL (cf. Algorithm 2 in paper) */
+template<typename NumericT>
+void ilu_chow_patel_sweep(compressed_matrix<NumericT> & L,
+ vector<NumericT> const & aij_L,
+ compressed_matrix<NumericT> & U_trans,
+ vector<NumericT> const & aij_U_trans)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(L).context());
+ viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx);
+
+ viennacl::backend::mem_handle L_backup;
+ viennacl::backend::memory_create(L_backup, L.handle().raw_size(), viennacl::traits::context(L));
+ viennacl::backend::memory_copy(L.handle(), L_backup, 0, 0, L.handle().raw_size());
+
+ viennacl::backend::mem_handle U_backup;
+ viennacl::backend::memory_create(U_backup, U_trans.handle().raw_size(), viennacl::traits::context(U_trans));
+ viennacl::backend::memory_copy(U_trans.handle(), U_backup, 0, 0, U_trans.handle().raw_size());
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "ilu_chow_patel_sweep_kernel");
+ viennacl::ocl::enqueue(k(L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(), L_backup.opencl_handle(), cl_uint(L.size1()),
+ aij_L,
+ U_trans.handle1().opencl_handle(), U_trans.handle2().opencl_handle(), U_trans.handle().opencl_handle(), U_backup.opencl_handle(),
+ aij_U_trans)
+ );
+
+}
+
+//////////////////////////////////////
+
+
+
+template<typename NumericT>
+void ilu_form_neumann_matrix(compressed_matrix<NumericT> & R,
+ vector<NumericT> & diag_R)
+{
+ viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(R).context());
+ viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx);
+
+ viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "ilu_form_neumann_matrix_kernel");
+ viennacl::ocl::enqueue(k(R.handle1().opencl_handle(), R.handle2().opencl_handle(), R.handle().opencl_handle(), cl_uint(R.size1()),
+ diag_R)
+ );
+}
+
+} //namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif