You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ap...@apache.org on 2016/06/08 21:40:13 UTC
[16/51] [partial] mahout git commit: (nojira) add native-viennaCL
module to codebase. closes apache/mahout#241
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/ilu.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/ilu.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/ilu.hpp
new file mode 100644
index 0000000..540ff82
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/ilu.hpp
@@ -0,0 +1,33 @@
+#ifndef VIENNACL_LINALG_ILU_HPP_
+#define VIENNACL_LINALG_ILU_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/ilu.hpp
+ @brief Implementations of incomplete factorization preconditioners. Convenience header file.
+*/
+
+#include "viennacl/linalg/detail/ilu/ilut.hpp"
+#include "viennacl/linalg/detail/ilu/ilu0.hpp"
+#include "viennacl/linalg/detail/ilu/block_ilu.hpp"
+#include "viennacl/linalg/detail/ilu/chow_patel_ilu.hpp"
+
+#endif
+
+
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/ilu_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/ilu_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/ilu_operations.hpp
new file mode 100644
index 0000000..febd347
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/ilu_operations.hpp
@@ -0,0 +1,334 @@
+#ifndef VIENNACL_LINALG_ILU_OPERATIONS_HPP_
+#define VIENNACL_LINALG_ILU_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the PDF manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/ilu_operations.hpp
+ @brief Implementations of specialized routines for the Chow-Patel parallel ILU preconditioner
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/range.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+#include "viennacl/linalg/host_based/ilu_operations.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+ #include "viennacl/linalg/opencl/ilu_operations.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+ #include "viennacl/linalg/cuda/ilu_operations.hpp"
+#endif
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief Extracts the lower triangular part L from A.
+ *
+ * Diagonal of L is stored explicitly in order to enable better code reuse.
+ *
+ */
+template<typename NumericT>
+void extract_L(compressed_matrix<NumericT> const & A,
+               compressed_matrix<NumericT> & L)
+{
+  // Dispatch to the backend (host, OpenCL, or CUDA) that currently owns A's data.
+  switch (viennacl::traits::handle(A).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::extract_L(A, L);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::extract_L(A, L);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::extract_L(A, L);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L accordingly.
+ *
+ * Since A should not be modified (const-correctness), updates are in L.
+ *
+ */
+template<typename NumericT>
+void icc_scale(compressed_matrix<NumericT> const & A,
+               compressed_matrix<NumericT> & L)
+{
+  // A stays const; the rescaled values are written into L only.
+  // Dispatch on the memory domain holding A's data.
+  switch (viennacl::traits::handle(A).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::icc_scale(A, L);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::icc_scale(A, L);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::icc_scale(A, L);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ICC (cf. Algorithm 3 in paper, but for L rather than U)
+ *
+ * We use a fully synchronous (Jacobi-like) variant, because asynchronous methods as described in the paper are a nightmare to debug
+ * (and particularly funny if they sometimes fail, sometimes not)
+ *
+ * @param L Factor L to be updated for the incomplete Cholesky factorization
+ * @param aij_L  Lower triangular portion of the system matrix
+ */
+template<typename NumericT>
+void icc_chow_patel_sweep(compressed_matrix<NumericT> & L,
+                          vector<NumericT> & aij_L)
+{
+  // One synchronous (Jacobi-like) Chow-Patel sweep; L is updated in place.
+  // Dispatch on the memory domain holding L's data.
+  switch (viennacl::traits::handle(L).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::icc_chow_patel_sweep(L, aij_L);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::icc_chow_patel_sweep(L, aij_L);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::icc_chow_patel_sweep(L, aij_L);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+
+
+//////////////////////// ILU ////////////////////
+
+/** @brief Extracts the lower triangular part L and the upper triangular part U from A.
+ *
+ * Diagonals of L and U are stored explicitly in order to enable better code reuse.
+ *
+ */
+template<typename NumericT>
+void extract_LU(compressed_matrix<NumericT> const & A,
+                compressed_matrix<NumericT> & L,
+                compressed_matrix<NumericT> & U)
+{
+  // Dispatch to the backend (host, OpenCL, or CUDA) that currently owns A's data.
+  switch (viennacl::traits::handle(A).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::extract_LU(A, L, U);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::extract_LU(A, L, U);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::extract_LU(A, L, U);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L and U accordingly.
+ *
+ * Since A should not be modified (const-correctness), updates are in L and U.
+ *
+ */
+template<typename NumericT>
+void ilu_scale(compressed_matrix<NumericT> const & A,
+               compressed_matrix<NumericT> & L,
+               compressed_matrix<NumericT> & U)
+{
+  // A stays const; the rescaled values are written into L and U only.
+  // Dispatch on the memory domain holding A's data.
+  switch (viennacl::traits::handle(A).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::ilu_scale(A, L, U);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::ilu_scale(A, L, U);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::ilu_scale(A, L, U);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+/** @brief Transposition B <- A^T, where the aij-vector is permuted in the same way as the value array in A when assigned to B
+ *
+ * @param A Input matrix to be transposed
+ * @param B Output matrix containing the transposed matrix
+ */
+template<typename NumericT>
+void ilu_transpose(compressed_matrix<NumericT> const & A,
+                   compressed_matrix<NumericT> & B)
+{
+  viennacl::context orig_ctx = viennacl::traits::context(A);
+  viennacl::context cpu_ctx(viennacl::MAIN_MEMORY);
+  // The (void)-casts silence 'unused variable' warnings in builds where neither
+  // OpenCL nor CUDA is enabled, since these locals are only used in the
+  // device-specific branches below.
+  (void)orig_ctx;
+  (void)cpu_ctx;
+
+  viennacl::compressed_matrix<NumericT> A_host(0, 0, 0, cpu_ctx);
+  (void)A_host;
+
+  // No device-side transpose is provided here: for OpenCL/CUDA data the matrix
+  // is copied to the host, transposed there, and B is moved back to the
+  // original memory domain afterwards.
+  switch (viennacl::traits::handle(A).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::ilu_transpose(A, B);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    A_host = A;
+    B.switch_memory_context(cpu_ctx);
+    viennacl::linalg::host_based::ilu_transpose(A_host, B);
+    B.switch_memory_context(orig_ctx);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    A_host = A;
+    B.switch_memory_context(cpu_ctx);
+    viennacl::linalg::host_based::ilu_transpose(A_host, B);
+    B.switch_memory_context(orig_ctx);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+
+
+/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ILU (cf. Algorithm 2 in paper)
+ *
+ * We use a fully synchronous (Jacobi-like) variant, because asynchronous methods as described in the paper are a nightmare to debug
+ * (and particularly funny if they sometimes fail, sometimes not)
+ *
+ * @param L Lower-triangular matrix L in LU factorization
+ * @param aij_L Lower-triangular matrix L from A
+ * @param U_trans Upper-triangular matrix U in CSC-storage, which is the same as U^trans in CSR-storage
+ * @param aij_U_trans Upper-triangular matrix from A in CSC-storage, which is the same as U^trans in CSR-storage
+ */
+template<typename NumericT>
+void ilu_chow_patel_sweep(compressed_matrix<NumericT> & L,
+                          vector<NumericT> const & aij_L,
+                          compressed_matrix<NumericT> & U_trans,
+                          vector<NumericT> const & aij_U_trans)
+{
+  // One synchronous (Jacobi-like) Chow-Patel sweep; L and U_trans are updated
+  // in place. Dispatch on the memory domain holding L's data.
+  switch (viennacl::traits::handle(L).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::ilu_chow_patel_sweep(L, aij_L, U_trans, aij_U_trans);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::ilu_chow_patel_sweep(L, aij_L, U_trans, aij_U_trans);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::ilu_chow_patel_sweep(L, aij_L, U_trans, aij_U_trans);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+/** @brief Forms the matrix used for the truncated Neumann-series approximation of the triangular solves: the diagonal entries of R are extracted into diag_R.
+ *
+ * @param R       Triangular factor providing the input values (non-const; may be updated by the backend routine)
+ * @param diag_R  Receives the diagonal entries of R
+ */
+template<typename NumericT>
+void ilu_form_neumann_matrix(compressed_matrix<NumericT> & R,
+                             vector<NumericT> & diag_R)
+{
+  // Dispatch on the memory domain holding R's data. Both R and diag_R are
+  // outputs: the backend routines populate diag_R from R's diagonal.
+  switch (viennacl::traits::handle(R).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::ilu_form_neumann_matrix(R, diag_R);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::ilu_form_neumann_matrix(R, diag_R);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::ilu_form_neumann_matrix(R, diag_R);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/inner_prod.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/inner_prod.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/inner_prod.hpp
new file mode 100644
index 0000000..b31a82a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/inner_prod.hpp
@@ -0,0 +1,186 @@
+#ifndef VIENNACL_LINALG_INNER_PROD_HPP_
+#define VIENNACL_LINALG_INNER_PROD_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/inner_prod.hpp
+ @brief Generic interface for the computation of inner products. See viennacl/linalg/vector_operations.hpp for implementations.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/tag_of.hpp"
+#include "viennacl/meta/result_of.hpp"
+
+namespace viennacl
+{
+//
+// generic inner_prod function
+// uses tag dispatch to identify which algorithm
+// should be called
+//
+namespace linalg
+{
+
+#ifdef VIENNACL_WITH_ARMADILLO
+// ----------------------------------------------------
+// Armadillo
+//
+/** @brief Forwards inner products of Armadillo dense vectors to arma's dot(). */
+template<typename NumericT>
+NumericT inner_prod(arma::Col<NumericT> const& v1, arma::Col<NumericT> const& v2)
+{
+  return dot(v1, v2);
+}
+#endif
+
+#ifdef VIENNACL_WITH_EIGEN
+// ----------------------------------------------------
+// EIGEN
+//
+/** @brief Forwards inner products of Eigen vectors to Eigen's member dot(). */
+template<typename VectorT1, typename VectorT2>
+typename viennacl::enable_if< viennacl::is_eigen< typename viennacl::traits::tag_of< VectorT1 >::type >::value,
+                              typename VectorT1::RealScalar>::type
+inner_prod(VectorT1 const & v1, VectorT2 const & v2)
+{
+  //std::cout << "eigen .. " << std::endl;
+  return v1.dot(v2);
+}
+#endif
+
+#ifdef VIENNACL_WITH_MTL4
+// ----------------------------------------------------
+// MTL4
+//
+/** @brief Forwards inner products of MTL4 vectors to mtl::dot(). */
+template<typename VectorT1, typename VectorT2>
+typename viennacl::enable_if< viennacl::is_mtl4< typename viennacl::traits::tag_of< VectorT1 >::type >::value,
+                              typename VectorT1::value_type>::type
+inner_prod(VectorT1 const & v1, VectorT2 const & v2)
+{
+  //std::cout << "mtl4 .. " << std::endl;
+  return mtl::dot(v1, v2);
+}
+#endif
+
+#ifdef VIENNACL_WITH_UBLAS
+// ----------------------------------------------------
+// UBLAS
+//
+/** @brief Forwards inner products of Boost.uBLAS vectors to ublas::inner_prod(). */
+template<typename VectorT1, typename VectorT2>
+typename viennacl::enable_if< viennacl::is_ublas< typename viennacl::traits::tag_of< VectorT1 >::type >::value,
+                              typename VectorT1::value_type>::type
+inner_prod(VectorT1 const & v1, VectorT2 const & v2)
+{
+  //std::cout << "ublas .. " << std::endl;
+  return boost::numeric::ublas::inner_prod(v1, v2);
+}
+#endif
+
+// ----------------------------------------------------
+// STL
+//
+/** @brief Computes the inner product of two STL-style random-access vectors by a plain accumulation loop. */
+template<typename VectorT1, typename VectorT2>
+typename viennacl::enable_if< viennacl::is_stl< typename viennacl::traits::tag_of< VectorT1 >::type >::value,
+                              typename VectorT1::value_type>::type
+inner_prod(VectorT1 const & v1, VectorT2 const & v2)
+{
+  assert(v1.size() == v2.size() && bool("Vector sizes mismatch"));
+  //std::cout << "stl .. " << std::endl;
+  typename VectorT1::value_type result = 0;
+  for (typename VectorT1::size_type i=0; i<v1.size(); ++i)
+    result += v1[i] * v2[i];
+
+  return result;
+}
+
+// ----------------------------------------------------
+// VIENNACL
+//
+/** @brief Inner product of two ViennaCL vectors. Returns an expression object; evaluation is deferred until the expression is assigned. */
+template<typename NumericT>
+viennacl::scalar_expression< const vector_base<NumericT>, const vector_base<NumericT>, viennacl::op_inner_prod >
+inner_prod(vector_base<NumericT> const & vector1,
+           vector_base<NumericT> const & vector2)
+{
+  //std::cout << "viennacl .. " << std::endl;
+  return viennacl::scalar_expression< const vector_base<NumericT>,
+                                      const vector_base<NumericT>,
+                                      viennacl::op_inner_prod >(vector1, vector2);
+}
+
+
+// expression on lhs:
+/** @brief Overload for a vector expression as the first argument (e.g. inner_prod(x + y, z)). */
+template< typename LHS, typename RHS, typename OP, typename NumericT>
+viennacl::scalar_expression< const viennacl::vector_expression<LHS, RHS, OP>,
+                             const vector_base<NumericT>,
+                             viennacl::op_inner_prod >
+inner_prod(viennacl::vector_expression<LHS, RHS, OP> const & vector1,
+           vector_base<NumericT> const & vector2)
+{
+  //std::cout << "viennacl .. " << std::endl;
+  return viennacl::scalar_expression< const viennacl::vector_expression<LHS, RHS, OP>,
+                                      const vector_base<NumericT>,
+                                      viennacl::op_inner_prod >(vector1, vector2);
+}
+
+// expression on rhs:
+/** @brief Overload for a vector expression as the second argument (e.g. inner_prod(x, y + z)). */
+template<typename NumericT, typename LHS, typename RHS, typename OP>
+viennacl::scalar_expression< const vector_base<NumericT>,
+                             const viennacl::vector_expression<LHS, RHS, OP>,
+                             viennacl::op_inner_prod >
+inner_prod(vector_base<NumericT> const & vector1,
+           viennacl::vector_expression<LHS, RHS, OP> const & vector2)
+{
+  //std::cout << "viennacl .. " << std::endl;
+  return viennacl::scalar_expression< const vector_base<NumericT>,
+                                      const viennacl::vector_expression<LHS, RHS, OP>,
+                                      viennacl::op_inner_prod >(vector1, vector2);
+}
+
+// expression on lhs and rhs:
+/** @brief Overload for vector expressions on both sides (e.g. inner_prod(x + y, z - w)). */
+template<typename LHS1, typename RHS1, typename OP1,
+         typename LHS2, typename RHS2, typename OP2>
+viennacl::scalar_expression< const viennacl::vector_expression<LHS1, RHS1, OP1>,
+                             const viennacl::vector_expression<LHS2, RHS2, OP2>,
+                             viennacl::op_inner_prod >
+inner_prod(viennacl::vector_expression<LHS1, RHS1, OP1> const & vector1,
+           viennacl::vector_expression<LHS2, RHS2, OP2> const & vector2)
+{
+  //std::cout << "viennacl .. " << std::endl;
+  return viennacl::scalar_expression< const viennacl::vector_expression<LHS1, RHS1, OP1>,
+                                      const viennacl::vector_expression<LHS2, RHS2, OP2>,
+                                      viennacl::op_inner_prod >(vector1, vector2);
+}
+
+
+// Multiple inner products:
+/** @brief Simultaneous inner products of x with every vector in y_tuple; result is a vector expression holding one value per tuple entry. */
+template<typename NumericT>
+viennacl::vector_expression< const vector_base<NumericT>, const vector_tuple<NumericT>, viennacl::op_inner_prod >
+inner_prod(vector_base<NumericT> const & x,
+           vector_tuple<NumericT> const & y_tuple)
+{
+  return viennacl::vector_expression< const vector_base<NumericT>,
+                                      const vector_tuple<NumericT>,
+                                      viennacl::op_inner_prod >(x, y_tuple);
+}
+
+
+} // end namespace linalg
+} // end namespace viennacl
+#endif
+
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/iterative_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/iterative_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/iterative_operations.hpp
new file mode 100644
index 0000000..78a813d
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/iterative_operations.hpp
@@ -0,0 +1,425 @@
+#ifndef VIENNACL_LINALG_ITERATIVE_OPERATIONS_HPP_
+#define VIENNACL_LINALG_ITERATIVE_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/iterative_operations.hpp
+ @brief Implementations of specialized routines for the iterative solvers.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/range.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+#include "viennacl/linalg/host_based/iterative_operations.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+ #include "viennacl/linalg/opencl/iterative_operations.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+ #include "viennacl/linalg/cuda/iterative_operations.hpp"
+#endif
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief Performs a joint vector update operation needed for an efficient pipelined CG algorithm.
+ *
+ * This routine computes for vectors 'result', 'p', 'r', 'Ap':
+ * result += alpha * p;
+ * r -= alpha * Ap;
+ * p = r + beta * p;
+ * and runs the parallel reduction stage for computing inner_prod(r,r)
+ */
+template<typename NumericT>
+void pipelined_cg_vector_update(vector_base<NumericT> & result,
+                                NumericT alpha,
+                                vector_base<NumericT> & p,
+                                vector_base<NumericT> & r,
+                                vector_base<NumericT> const & Ap,
+                                NumericT beta,
+                                vector_base<NumericT> & inner_prod_buffer)
+{
+  // Dispatch to the backend (host, OpenCL, or CUDA) owning the result vector.
+  switch (viennacl::traits::handle(result).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::pipelined_cg_vector_update(result, alpha, p, r, Ap, beta, inner_prod_buffer);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::pipelined_cg_vector_update(result, alpha, p, r, Ap, beta, inner_prod_buffer);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::pipelined_cg_vector_update(result, alpha, p, r, Ap, beta, inner_prod_buffer);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+
+/** @brief Performs the sparse matrix-vector product required by an efficient pipelined CG algorithm.
+ *
+ * This routine computes for a matrix A and vectors 'p' and 'Ap':
+ *   Ap = prod(A, p);
+ * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap)
+ */
+template<typename MatrixT, typename NumericT>
+void pipelined_cg_prod(MatrixT const & A,
+                       vector_base<NumericT> const & p,
+                       vector_base<NumericT> & Ap,
+                       vector_base<NumericT> & inner_prod_buffer)
+{
+  // Dispatch on the memory domain of p (vectors and matrix are assumed to
+  // reside in the same domain).
+  switch (viennacl::traits::handle(p).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::pipelined_cg_prod(A, p, Ap, inner_prod_buffer);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::pipelined_cg_prod(A, p, Ap, inner_prod_buffer);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::pipelined_cg_prod(A, p, Ap, inner_prod_buffer);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+////////////////////////////////////////////
+
+/** @brief Performs a joint vector update operation needed for an efficient pipelined BiCGStab algorithm.
+ *
+ * This routine computes for vectors 's', 'r', 'Ap':
+ *   s = r - alpha * Ap
+ * with alpha obtained from a reduction step on the 0th and the 3rd out of 6 chunks in inner_prod_buffer
+ * and runs the parallel reduction stage for computing inner_prod(s,s)
+ */
+template<typename NumericT>
+void pipelined_bicgstab_update_s(vector_base<NumericT> & s,
+                                 vector_base<NumericT> & r,
+                                 vector_base<NumericT> const & Ap,
+                                 vector_base<NumericT> & inner_prod_buffer,
+                                 vcl_size_t buffer_chunk_size,
+                                 vcl_size_t buffer_chunk_offset)
+{
+  // Dispatch to the backend (host, OpenCL, or CUDA) owning the vector s.
+  switch (viennacl::traits::handle(s).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::pipelined_bicgstab_update_s(s, r, Ap, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::pipelined_bicgstab_update_s(s, r, Ap, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::pipelined_bicgstab_update_s(s, r, Ap, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+/** @brief Performs a joint vector update operation needed for an efficient pipelined BiCGStab algorithm.
+ *
+ * x_{j+1} = x_j + alpha * p_j + omega * s_j
+ * r_{j+1} = s_j - omega * t_j
+ * p_{j+1} = r_{j+1} + beta * (p_j - omega * q_j)
+ * and compute first stage of r_dot_r0 = <r_{j+1}, r_o^*> for use in next iteration
+ */
+template<typename NumericT>
+void pipelined_bicgstab_vector_update(vector_base<NumericT> & result, NumericT alpha, vector_base<NumericT> & p, NumericT omega, vector_base<NumericT> const & s,
+                                      vector_base<NumericT> & residual, vector_base<NumericT> const & As,
+                                      NumericT beta, vector_base<NumericT> const & Ap,
+                                      vector_base<NumericT> const & r0star,
+                                      vector_base<NumericT> & inner_prod_buffer,
+                                      vcl_size_t buffer_chunk_size)
+{
+  // Dispatch on the memory domain of s (all vectors are assumed to reside in
+  // the same domain).
+  switch (viennacl::traits::handle(s).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::pipelined_bicgstab_vector_update(result, alpha, p, omega, s, residual, As, beta, Ap, r0star, inner_prod_buffer, buffer_chunk_size);
+    break;
+  #ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::pipelined_bicgstab_vector_update(result, alpha, p, omega, s, residual, As, beta, Ap, r0star, inner_prod_buffer, buffer_chunk_size);
+    break;
+  #endif
+  #ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::pipelined_bicgstab_vector_update(result, alpha, p, omega, s, residual, As, beta, Ap, r0star, inner_prod_buffer, buffer_chunk_size);
+    break;
+  #endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+
+/** @brief Performs the sparse matrix-vector product required by an efficient pipelined BiCGStab algorithm.
+ *
+ * This routine computes for a matrix A and vectors 'p' and 'Ap':
+ *   Ap = prod(A, p);
+ * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap)
+ */
+template<typename MatrixT, typename NumericT>
+void pipelined_bicgstab_prod(MatrixT const & A,
+                             vector_base<NumericT> const & p,
+                             vector_base<NumericT> & Ap,
+                             vector_base<NumericT> const & r0star,
+                             vector_base<NumericT> & inner_prod_buffer,
+                             vcl_size_t buffer_chunk_size,
+                             vcl_size_t buffer_chunk_offset)
+{
+  // Dispatch on the memory domain of p (vectors and matrix are assumed to
+  // reside in the same domain).
+  switch (viennacl::traits::handle(p).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::pipelined_bicgstab_prod(A, p, Ap, r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::pipelined_bicgstab_prod(A, p, Ap, r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::pipelined_bicgstab_prod(A, p, Ap, r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+////////////////////////////////////////////
+
+/** @brief Performs a vector normalization needed for an efficient pipelined GMRES algorithm.
+ *
+ * This routines computes for vectors 'r', 'v_k':
+ * Second reduction step for ||v_k||
+ * v_k /= ||v_k||
+ * First reduction step for <r, v_k>
+ */
+template <typename T>
+void pipelined_gmres_normalize_vk(vector_base<T> & v_k,
+                                  vector_base<T> const & residual,
+                                  vector_base<T> & R_buffer,
+                                  vcl_size_t offset_in_R,
+                                  vector_base<T> const & inner_prod_buffer,
+                                  vector_base<T> & r_dot_vk_buffer,
+                                  vcl_size_t buffer_chunk_size,
+                                  vcl_size_t buffer_chunk_offset)
+{
+  // Dispatch to the backend (host, OpenCL, or CUDA) owning v_k.
+  switch (viennacl::traits::handle(v_k).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::pipelined_gmres_normalize_vk(v_k, residual, R_buffer, offset_in_R, inner_prod_buffer, r_dot_vk_buffer, buffer_chunk_size, buffer_chunk_offset);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::pipelined_gmres_normalize_vk(v_k, residual, R_buffer, offset_in_R, inner_prod_buffer, r_dot_vk_buffer, buffer_chunk_size, buffer_chunk_offset);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::pipelined_gmres_normalize_vk(v_k, residual, R_buffer, offset_in_R, inner_prod_buffer, r_dot_vk_buffer, buffer_chunk_size, buffer_chunk_offset);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+
+
+/** @brief Computes the first reduction stage for multiple inner products <v_i, v_k>, i=0..k-1
+ *
+ * All vectors v_i are stored column-major in the array 'device_krylov_basis', where each vector has an actual length 'v_k_size', but might be padded to have 'v_k_internal_size'
+ */
+template <typename T>
+void pipelined_gmres_gram_schmidt_stage1(vector_base<T> const & device_krylov_basis,
+                                         vcl_size_t v_k_size,
+                                         vcl_size_t v_k_internal_size,
+                                         vcl_size_t k,
+                                         vector_base<T> & vi_in_vk_buffer,
+                                         vcl_size_t buffer_chunk_size)
+{
+  // Dispatch to the backend (host, OpenCL, or CUDA) owning the Krylov basis.
+  switch (viennacl::traits::handle(device_krylov_basis).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::pipelined_gmres_gram_schmidt_stage1(device_krylov_basis, v_k_size, v_k_internal_size, k, vi_in_vk_buffer, buffer_chunk_size);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::pipelined_gmres_gram_schmidt_stage1(device_krylov_basis, v_k_size, v_k_internal_size, k, vi_in_vk_buffer, buffer_chunk_size);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::pipelined_gmres_gram_schmidt_stage1(device_krylov_basis, v_k_size, v_k_internal_size, k, vi_in_vk_buffer, buffer_chunk_size);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+
+/** @brief Computes the second reduction stage for multiple inner products <v_i, v_k>, i=0..k-1, then updates v_k -= <v_i, v_k> v_i and computes the first reduction stage for ||v_k||
+ *
+ * All vectors v_i are stored column-major in the array 'device_krylov_basis', where each vector has an actual length 'v_k_size', but might be padded to have 'v_k_internal_size'
+ */
+template <typename T>
+void pipelined_gmres_gram_schmidt_stage2(vector_base<T> & device_krylov_basis,
+                                         vcl_size_t v_k_size,
+                                         vcl_size_t v_k_internal_size,
+                                         vcl_size_t k,
+                                         vector_base<T> const & vi_in_vk_buffer,
+                                         vector_base<T> & R_buffer,
+                                         vcl_size_t krylov_dim,
+                                         vector_base<T> & inner_prod_buffer,
+                                         vcl_size_t buffer_chunk_size)
+{
+  // Dispatch to the backend (host, OpenCL, or CUDA) owning the Krylov basis.
+  switch (viennacl::traits::handle(device_krylov_basis).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::pipelined_gmres_gram_schmidt_stage2(device_krylov_basis, v_k_size, v_k_internal_size, k, vi_in_vk_buffer, R_buffer, krylov_dim, inner_prod_buffer, buffer_chunk_size);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::pipelined_gmres_gram_schmidt_stage2(device_krylov_basis, v_k_size, v_k_internal_size, k, vi_in_vk_buffer, R_buffer, krylov_dim, inner_prod_buffer, buffer_chunk_size);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::pipelined_gmres_gram_schmidt_stage2(device_krylov_basis, v_k_size, v_k_internal_size, k, vi_in_vk_buffer, R_buffer, krylov_dim, inner_prod_buffer, buffer_chunk_size);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+
+/** @brief Computes x += eta_0 r + sum_{i=1}^{k-1} eta_i v_{i-1} */
+template <typename T>
+void pipelined_gmres_update_result(vector_base<T> & result,
+                                   vector_base<T> const & residual,
+                                   vector_base<T> const & krylov_basis,
+                                   vcl_size_t v_k_size,
+                                   vcl_size_t v_k_internal_size,
+                                   vector_base<T> const & coefficients,
+                                   vcl_size_t k)
+{
+  // Dispatch to the backend (host, OpenCL, or CUDA) owning the result vector.
+  switch (viennacl::traits::handle(result).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::pipelined_gmres_update_result(result, residual, krylov_basis, v_k_size, v_k_internal_size, coefficients, k);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::pipelined_gmres_update_result(result, residual, krylov_basis, v_k_size, v_k_internal_size, coefficients, k);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::pipelined_gmres_update_result(result, residual, krylov_basis, v_k_size, v_k_internal_size, coefficients, k);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+/** @brief Performs the sparse matrix-vector product required by an efficient pipelined GMRES algorithm.
+ *
+ * This routine computes for a matrix A and vectors 'p' and 'Ap':
+ *   Ap = prod(A, p);
+ * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap)
+ */
+template <typename MatrixType, typename T>
+void pipelined_gmres_prod(MatrixType const & A,
+                          vector_base<T> const & p,
+                          vector_base<T> & Ap,
+                          vector_base<T> & inner_prod_buffer)
+{
+  // Dispatch on the memory domain of p (vectors and matrix are assumed to
+  // reside in the same domain).
+  switch (viennacl::traits::handle(p).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::pipelined_gmres_prod(A, p, Ap, inner_prod_buffer);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::pipelined_gmres_prod(A, p, Ap, inner_prod_buffer);
+    break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::pipelined_gmres_prod(A, p, Ap, inner_prod_buffer);
+    break;
+#endif
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/jacobi_precond.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/jacobi_precond.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/jacobi_precond.hpp
new file mode 100644
index 0000000..0b16964
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/jacobi_precond.hpp
@@ -0,0 +1,141 @@
+#ifndef VIENNACL_LINALG_JACOBI_PRECOND_HPP_
+#define VIENNACL_LINALG_JACOBI_PRECOND_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/jacobi_precond.hpp
+ @brief Implementation of a simple Jacobi preconditioner
+*/
+
+#include <vector>
+#include <cmath>
+#include "viennacl/forwards.h"
+#include "viennacl/vector.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+#include "viennacl/linalg/row_scaling.hpp"
+
+#include <map>
+
+namespace viennacl
+{
+namespace linalg
+{
+
/** @brief A tag for a jacobi preconditioner.
*
* Pass an instance to the jacobi_precond constructor (or to solver setup code)
* to request diagonal (Jacobi) preconditioning. The tag carries no options.
*/
class jacobi_tag {};
+
+
+/** @brief Jacobi preconditioner class, can be supplied to solve()-routines. Generic version for non-ViennaCL matrices.
+*/
+template<typename MatrixT,
+ bool is_viennacl = detail::row_scaling_for_viennacl<MatrixT>::value >
+class jacobi_precond
+{
+ typedef typename MatrixT::value_type NumericType;
+
+ public:
+ jacobi_precond(MatrixT const & mat, jacobi_tag const &) : diag_A_(viennacl::traits::size1(mat))
+ {
+ init(mat);
+ }
+
+ void init(MatrixT const & mat)
+ {
+ diag_A_.resize(viennacl::traits::size1(mat)); //resize without preserving values
+
+ for (typename MatrixT::const_iterator1 row_it = mat.begin1();
+ row_it != mat.end1();
+ ++row_it)
+ {
+ bool diag_found = false;
+ for (typename MatrixT::const_iterator2 col_it = row_it.begin();
+ col_it != row_it.end();
+ ++col_it)
+ {
+ if (col_it.index1() == col_it.index2())
+ {
+ diag_A_[col_it.index1()] = *col_it;
+ diag_found = true;
+ }
+ }
+ if (!diag_found)
+ throw zero_on_diagonal_exception("ViennaCL: Zero in diagonal encountered while setting up Jacobi preconditioner!");
+ }
+ }
+
+
+ /** @brief Apply to res = b - Ax, i.e. jacobi applied vec (right hand side), */
+ template<typename VectorT>
+ void apply(VectorT & vec) const
+ {
+ assert(viennacl::traits::size(diag_A_) == viennacl::traits::size(vec) && bool("Size mismatch"));
+ for (vcl_size_t i=0; i<diag_A_.size(); ++i)
+ vec[i] /= diag_A_[i];
+ }
+
+ private:
+ std::vector<NumericType> diag_A_;
+};
+
+
+/** @brief Jacobi preconditioner class, can be supplied to solve()-routines.
+*
+* Specialization for compressed_matrix
+*/
+template<typename MatrixT>
+class jacobi_precond<MatrixT, true>
+{
+ typedef typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type NumericType;
+
+ public:
+ jacobi_precond(MatrixT const & mat, jacobi_tag const &) : diag_A_(mat.size1(), viennacl::traits::context(mat))
+ {
+ init(mat);
+ }
+
+
+ void init(MatrixT const & mat)
+ {
+ detail::row_info(mat, diag_A_, detail::SPARSE_ROW_DIAGONAL);
+ }
+
+
+ template<unsigned int AlignmentV>
+ void apply(viennacl::vector<NumericType, AlignmentV> & vec) const
+ {
+ assert(viennacl::traits::size(diag_A_) == viennacl::traits::size(vec) && bool("Size mismatch"));
+ vec = element_div(vec, diag_A_);
+ }
+
+ private:
+ viennacl::vector<NumericType> diag_A_;
+};
+
+}
+}
+
+
+
+
+#endif
+
+
+
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/lanczos.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/lanczos.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/lanczos.hpp
new file mode 100644
index 0000000..ffac471
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/lanczos.hpp
@@ -0,0 +1,515 @@
+#ifndef VIENNACL_LINALG_LANCZOS_HPP_
+#define VIENNACL_LINALG_LANCZOS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/lanczos.hpp
+* @brief Generic interface for the Lanczos algorithm.
+*
+* Contributed by Guenther Mader and Astrid Rupp.
+*/
+
+#include <cmath>
+#include <vector>
+#include "viennacl/vector.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/norm_2.hpp"
+#include "viennacl/io/matrix_market.hpp"
+#include "viennacl/linalg/bisect.hpp"
+#include "viennacl/tools/random.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief A tag for the lanczos algorithm.
+*/
+class lanczos_tag
+{
+public:
+
+ enum
+ {
+ partial_reorthogonalization = 0,
+ full_reorthogonalization,
+ no_reorthogonalization
+ };
+
+ /** @brief The constructor
+ *
+ * @param factor Exponent of epsilon - tolerance for batches of Reorthogonalization
+ * @param numeig Number of eigenvalues to be returned
+ * @param met Method for Lanczos-Algorithm: 0 for partial Reorthogonalization, 1 for full Reorthogonalization and 2 for Lanczos without Reorthogonalization
+ * @param krylov Maximum krylov-space size
+ */
+
+ lanczos_tag(double factor = 0.75,
+ vcl_size_t numeig = 10,
+ int met = 0,
+ vcl_size_t krylov = 100) : factor_(factor), num_eigenvalues_(numeig), method_(met), krylov_size_(krylov) {}
+
+ /** @brief Sets the number of eigenvalues */
+ void num_eigenvalues(vcl_size_t numeig){ num_eigenvalues_ = numeig; }
+
+ /** @brief Returns the number of eigenvalues */
+ vcl_size_t num_eigenvalues() const { return num_eigenvalues_; }
+
+ /** @brief Sets the exponent of epsilon. Values between 0.6 and 0.9 usually give best results. */
+ void factor(double fct) { factor_ = fct; }
+
+ /** @brief Returns the exponent */
+ double factor() const { return factor_; }
+
+ /** @brief Sets the size of the kylov space. Must be larger than number of eigenvalues to compute. */
+ void krylov_size(vcl_size_t max) { krylov_size_ = max; }
+
+ /** @brief Returns the size of the kylov space */
+ vcl_size_t krylov_size() const { return krylov_size_; }
+
+ /** @brief Sets the reorthogonalization method */
+ void method(int met){ method_ = met; }
+
+ /** @brief Returns the reorthogonalization method */
+ int method() const { return method_; }
+
+
+private:
+ double factor_;
+ vcl_size_t num_eigenvalues_;
+ int method_; // see enum defined above for possible values
+ vcl_size_t krylov_size_;
+};
+
+
+namespace detail
+{
/** @brief Inverse iteration for finding an eigenvector for an eigenvalue.
 *
 * Solves (T - eigenvalue * I) y ~ e_n for the symmetric tridiagonal matrix T given by
 * 'alphas' (diagonal) and 'betas' (off-diagonal), then normalizes y to unit length.
 * betas[0] is ignored for consistency with the Lanczos coefficient layout.
 *
 * @param alphas       Diagonal entries of the tridiagonal matrix
 * @param betas        Off-diagonal entries (betas[0] unused)
 * @param eigenvalue   The (approximate) eigenvalue to shift by; not modified
 * @param eigenvector  Output: normalized eigenvector approximation (size alphas.size())
 */
template<typename NumericT>
void inverse_iteration(std::vector<NumericT> const & alphas, std::vector<NumericT> const & betas,
                       NumericT & eigenvalue, std::vector<NumericT> & eigenvector)
{
  std::size_t const n = alphas.size();

  // Shift the diagonal and perform the implicit forward elimination of the
  // tridiagonal system, leaving the pivots of (T - eigenvalue*I):
  std::vector<NumericT> pivots(alphas);
  for (std::size_t i = 0; i < n; ++i)
    pivots[i] -= eigenvalue;
  for (std::size_t row = 1; row < n; ++row)
    pivots[row] -= betas[row] * betas[row] / pivots[row - 1];

  // starting guess: ignore last equation
  eigenvector[n - 1] = 1.0;

  for (std::size_t iter = 0; iter < 1; ++iter)
  {
    // Back substitution through the first n-1 equations:
    eigenvector[n - 1] /= pivots[n - 1];
    for (std::size_t step = 1; step < n; ++step)
    {
      std::size_t const row = n - step - 1;
      eigenvector[row] -= eigenvector[row + 1] * betas[row + 1];
      eigenvector[row] /= pivots[row];
    }

    // Normalize to unit Euclidean length:
    NumericT sq_norm = 0;
    for (std::size_t i = 0; i < n; ++i)
      sq_norm += eigenvector[i] * eigenvector[i];
    NumericT const inv_len = NumericT(1) / std::sqrt(sq_norm);
    for (std::size_t i = 0; i < n; ++i)
      eigenvector[i] *= inv_len;
  }
}
+
/**
 * @brief Implementation of the Lanczos PRO algorithm (partial reorthogonalization)
 *
 * Builds a Krylov basis Q for A column by column, tracking an estimate 'w' of the
 * orthogonality loss of the current residual against each earlier basis vector and
 * reorthogonalizing only against batches of vectors whose estimate exceeds the
 * tolerance. Eigenvalues are then obtained from the tridiagonal coefficients via
 * bisection, and eigenvectors (optionally) via inverse iteration.
 *
 * @param A                    The system matrix
 * @param r                    Random start vector (overwritten; used as the running residual)
 * @param eigenvectors_A       Dense matrix holding the eigenvectors of A (one eigenvector per column)
 * @param size                 Size of krylov-space
 * @param tag                  Lanczos_tag with several options for the algorithm
 * @param compute_eigenvectors Boolean flag. If true, eigenvectors are computed. Otherwise the routine returns after calculating eigenvalues.
 * @return Returns the eigenvalues (number of eigenvalues equals size of krylov-space)
 */
template<typename MatrixT, typename DenseMatrixT, typename NumericT>
std::vector<NumericT>
lanczosPRO (MatrixT const& A, vector_base<NumericT> & r, DenseMatrixT & eigenvectors_A, vcl_size_t size, lanczos_tag const & tag, bool compute_eigenvectors)
{
  // generation of some random numbers, used for lanczos PRO algorithm
  viennacl::tools::normal_random_numbers<NumericT> get_N;

  // l_bound/u_bound delimit the index batches needing reorthogonalization
  std::vector<vcl_size_t> l_bound(size/2), u_bound(size/2);
  vcl_size_t n = r.size();
  std::vector<NumericT> w(size), w_old(size); //w_k, w_{k-1}: orthogonality-loss estimates

  NumericT inner_rt;
  std::vector<NumericT> alphas, betas;   // tridiagonal coefficients of the Lanczos matrix
  viennacl::matrix<NumericT, viennacl::column_major> Q(n, size); //column-major matrix holding the Krylov basis vectors

  bool second_step = false;
  NumericT eps = std::numeric_limits<NumericT>::epsilon();
  NumericT squ_eps = std::sqrt(eps);                      // detection threshold for orthogonality loss
  NumericT eta = std::exp(std::log(eps) * tag.factor());  // eps^factor: batch-extension threshold

  NumericT beta = viennacl::linalg::norm_2(r);

  r /= beta;  // normalize the start vector

  // first Krylov basis vector (column 0 of Q, viewed via a strided vector proxy):
  viennacl::vector_base<NumericT> q_0(Q.handle(), Q.size1(), 0, 1);
  q_0 = r;

  viennacl::vector<NumericT> u = viennacl::linalg::prod(A, r);
  NumericT alpha = viennacl::linalg::inner_prod(u, r);
  alphas.push_back(alpha);
  w[0] = 1;
  betas.push_back(beta);

  vcl_size_t batches = 0;
  for (vcl_size_t i = 1; i < size; i++) // Main loop for setting up the Krylov space
  {
    viennacl::vector_base<NumericT> q_iminus1(Q.handle(), Q.size1(), (i-1) * Q.internal_size1(), 1);
    r = u - alpha * q_iminus1;
    beta = viennacl::linalg::norm_2(r);

    betas.push_back(beta);
    r = r / beta;

    //
    // Update recurrence relation for estimating orthogonality loss
    // (perturbed by scaled random noise via get_N(), per the PRO scheme)
    //
    w_old = w;
    w[0] = (betas[1] * w_old[1] + (alphas[0] - alpha) * w_old[0] - betas[i - 1] * w_old[0]) / beta + eps * 0.3 * get_N() * (betas[1] + beta);
    for (vcl_size_t j = 1; j < i - 1; j++)
      w[j] = (betas[j + 1] * w_old[j + 1] + (alphas[j] - alpha) * w_old[j] + betas[j] * w_old[j - 1] - betas[i - 1] * w_old[j]) / beta + eps * 0.3 * get_N() * (betas[j + 1] + beta);
    w[i-1] = 0.6 * eps * NumericT(n) * get_N() * betas[1] / beta;

    //
    // Check whether there has been a need for reorthogonalization detected in the previous iteration.
    // If so, run the reorthogonalization for each batch
    //
    if (second_step)
    {
      for (vcl_size_t j = 0; j < batches; j++)
      {
        for (vcl_size_t k = l_bound[j] + 1; k < u_bound[j] - 1; k++)
        {
          viennacl::vector_base<NumericT> q_k(Q.handle(), Q.size1(), k * Q.internal_size1(), 1);
          inner_rt = viennacl::linalg::inner_prod(r, q_k);
          r = r - inner_rt * q_k;
          w[k] = 1.5 * eps * get_N();   // reset estimate after explicit reorthogonalization
        }
      }
      NumericT temp = viennacl::linalg::norm_2(r);
      r = r / temp;
      beta = beta * temp;   // fold the renormalization into beta
      second_step = false;
    }
    batches = 0;

    //
    // Check for semiorthogonality
    //
    for (vcl_size_t j = 0; j < i; j++)
    {
      if (std::fabs(w[j]) >= squ_eps) // tentative loss of orthonormality, hence reorthonomalize
      {
        viennacl::vector_base<NumericT> q_j(Q.handle(), Q.size1(), j * Q.internal_size1(), 1);
        inner_rt = viennacl::linalg::inner_prod(r, q_j);
        r = r - inner_rt * q_j;
        w[j] = 1.5 * eps * get_N();
        vcl_size_t k = j - 1;

        // orthogonalization with respect to earlier basis vectors
        while (std::fabs(w[k]) > eta)
        {
          viennacl::vector_base<NumericT> q_k(Q.handle(), Q.size1(), k * Q.internal_size1(), 1);
          inner_rt = viennacl::linalg::inner_prod(r, q_k);
          r = r - inner_rt * q_k;
          w[k] = 1.5 * eps * get_N();
          if (k == 0) break;
          k--;
        }
        l_bound[batches] = k;

        // orthogonalization with respect to later basis vectors
        k = j + 1;
        while (k < i && std::fabs(w[k]) > eta)
        {
          viennacl::vector_base<NumericT> q_k(Q.handle(), Q.size1(), k * Q.internal_size1(), 1);
          inner_rt = viennacl::linalg::inner_prod(r, q_k);
          r = r - inner_rt * q_k;
          w[k] = 1.5 * eps * get_N();
          k++;
        }
        u_bound[batches] = k - 1;
        batches++;

        j = k-1; // go to end of current batch
      }
    }

    //
    // Normalize basis vector and reorthogonalize as needed
    //
    if (batches > 0)
    {
      NumericT temp = viennacl::linalg::norm_2(r);
      r = r / temp;
      beta = beta * temp;
      second_step = true;   // schedule batch reorthogonalization for the next iteration
    }

    // store Krylov vector in Q:
    viennacl::vector_base<NumericT> q_i(Q.handle(), Q.size1(), i * Q.internal_size1(), 1);
    q_i = r;

    //
    // determine and store alpha = <r, u> with u = A q_i - beta q_{i-1}
    //
    u = viennacl::linalg::prod(A, r);
    u += (-beta) * q_iminus1;
    alpha = viennacl::linalg::inner_prod(u, r);
    alphas.push_back(alpha);
  }

  //
  // Step 2: Compute eigenvalues of tridiagonal matrix obtained during Lanczos iterations:
  //
  std::vector<NumericT> eigenvalues = bisect(alphas, betas);

  //
  // Step 3: Compute eigenvectors via inverse iteration. Does not update eigenvalues, so only approximate by nature.
  //
  if (compute_eigenvectors)
  {
    std::vector<NumericT> eigenvector_tridiag(alphas.size());
    for (std::size_t i=0; i < tag.num_eigenvalues(); ++i)
    {
      // compute eigenvector of tridiagonal matrix via inverse:
      inverse_iteration(alphas, betas, eigenvalues[eigenvalues.size() - i - 1], eigenvector_tridiag);

      // eigenvector w of full matrix A. Given as w = Q * u, where u is the eigenvector of the tridiagonal matrix
      viennacl::vector<NumericT> eigenvector_u(eigenvector_tridiag.size());
      viennacl::copy(eigenvector_tridiag, eigenvector_u);

      // column i (row-major: stride internal_size2; column-major: contiguous) of eigenvectors_A:
      viennacl::vector_base<NumericT> eigenvector_A(eigenvectors_A.handle(),
                                                    eigenvectors_A.size1(),
                                                    eigenvectors_A.row_major() ? i : i * eigenvectors_A.internal_size1(),
                                                    eigenvectors_A.row_major() ? eigenvectors_A.internal_size2() : 1);
      eigenvector_A = viennacl::linalg::prod(project(Q,
                                                     range(0, Q.size1()),
                                                     range(0, eigenvector_u.size())),
                                             eigenvector_u);
    }
  }

  return eigenvalues;
}
+
+
/**
 * @brief Implementation of the Lanczos FRO algorithm
 *
 * Builds the Krylov basis with either full Gram-Schmidt reorthogonalization
 * (tag.method() == full_reorthogonalization) or none at all; the tridiagonal
 * coefficients then yield eigenvalues via bisection and, optionally,
 * eigenvectors via inverse iteration.
 *
 * @param A                    The system matrix
 * @param r                    Random start vector (normalized in place)
 * @param eigenvectors_A       A dense matrix in which the eigenvectors of A will be stored. Both row- and column-major matrices are supported.
 * @param krylov_dim           Size of krylov-space
 * @param tag                  The Lanczos tag holding tolerances, etc.
 * @param compute_eigenvectors Boolean flag. If true, eigenvectors are computed. Otherwise the routine returns after calculating eigenvalues.
 * @return Returns the eigenvalues (number of eigenvalues equals size of krylov-space)
 */
template< typename MatrixT, typename DenseMatrixT, typename NumericT>
std::vector<NumericT>
lanczos(MatrixT const& A, vector_base<NumericT> & r, DenseMatrixT & eigenvectors_A, vcl_size_t krylov_dim, lanczos_tag const & tag, bool compute_eigenvectors)
{
  std::vector<NumericT> alphas, betas;   // tridiagonal coefficients
  viennacl::vector<NumericT> Aq(r.size());
  viennacl::matrix<NumericT, viennacl::column_major> Q(r.size(), krylov_dim + 1); // Krylov basis (each Krylov vector is one column)

  NumericT norm_r = norm_2(r);
  NumericT beta = norm_r;
  r /= norm_r;

  // first Krylov vector:
  viennacl::vector_base<NumericT> q0(Q.handle(), Q.size1(), 0, 1);
  q0 = r;

  //
  // Step 1: Run Lanczos' method to obtain tridiagonal matrix
  //
  for (vcl_size_t i = 0; i < krylov_dim; i++)
  {
    betas.push_back(beta);
    // last available vector from Krylov basis:
    viennacl::vector_base<NumericT> q_i(Q.handle(), Q.size1(), i * Q.internal_size1(), 1);

    // Lanczos algorithm:
    // - Compute A * q:
    Aq = viennacl::linalg::prod(A, q_i);

    // - Form Aq <- Aq - <Aq, q_i> * q_i - beta * q_{i-1}, where beta is ||q_i|| before normalization in previous iteration
    NumericT alpha = viennacl::linalg::inner_prod(Aq, q_i);
    Aq -= alpha * q_i;

    if (i > 0)
    {
      viennacl::vector_base<NumericT> q_iminus1(Q.handle(), Q.size1(), (i-1) * Q.internal_size1(), 1);
      Aq -= beta * q_iminus1;

      // Extra measures for improved numerical stability?
      if (tag.method() == lanczos_tag::full_reorthogonalization)
      {
        // Gram-Schmidt (re-)orthogonalization against all previous basis vectors:
        // TODO: Reuse fast (pipelined) routines from GMRES or GEMV
        for (vcl_size_t j = 0; j < i; j++)
        {
          viennacl::vector_base<NumericT> q_j(Q.handle(), Q.size1(), j * Q.internal_size1(), 1);
          NumericT inner_rq = viennacl::linalg::inner_prod(Aq, q_j);
          Aq -= inner_rq * q_j;
        }
      }
    }

    // normalize Aq and add to Krylov basis at column i+1 in Q:
    beta = viennacl::linalg::norm_2(Aq);
    viennacl::vector_base<NumericT> q_iplus1(Q.handle(), Q.size1(), (i+1) * Q.internal_size1(), 1);
    q_iplus1 = Aq / beta;

    alphas.push_back(alpha);
  }

  //
  // Step 2: Compute eigenvalues of tridiagonal matrix obtained during Lanczos iterations:
  //
  std::vector<NumericT> eigenvalues = bisect(alphas, betas);

  //
  // Step 3: Compute eigenvectors via inverse iteration. Does not update eigenvalues, so only approximate by nature.
  //
  if (compute_eigenvectors)
  {
    std::vector<NumericT> eigenvector_tridiag(alphas.size());
    for (std::size_t i=0; i < tag.num_eigenvalues(); ++i)
    {
      // compute eigenvector of tridiagonal matrix via inverse:
      inverse_iteration(alphas, betas, eigenvalues[eigenvalues.size() - i - 1], eigenvector_tridiag);

      // eigenvector w of full matrix A. Given as w = Q * u, where u is the eigenvector of the tridiagonal matrix
      viennacl::vector<NumericT> eigenvector_u(eigenvector_tridiag.size());
      viennacl::copy(eigenvector_tridiag, eigenvector_u);

      // column i (row-major: stride internal_size2; column-major: contiguous) of eigenvectors_A:
      viennacl::vector_base<NumericT> eigenvector_A(eigenvectors_A.handle(),
                                                    eigenvectors_A.size1(),
                                                    eigenvectors_A.row_major() ? i : i * eigenvectors_A.internal_size1(),
                                                    eigenvectors_A.row_major() ? eigenvectors_A.internal_size2() : 1);
      eigenvector_A = viennacl::linalg::prod(project(Q,
                                                     range(0, Q.size1()),
                                                     range(0, eigenvector_u.size())),
                                             eigenvector_u);
    }
  }

  return eigenvalues;
}
+
+} // end namespace detail
+
+/**
+* @brief Implementation of the calculation of eigenvalues using lanczos (with and without reorthogonalization).
+*
+* Implementation of Lanczos with partial reorthogonalization is implemented separately.
+*
+* @param matrix The system matrix
+* @param eigenvectors_A A dense matrix in which the eigenvectors of A will be stored. Both row- and column-major matrices are supported.
+* @param tag Tag with several options for the lanczos algorithm
+* @param compute_eigenvectors Boolean flag. If true, eigenvectors are computed. Otherwise the routine returns after calculating eigenvalues.
+* @return Returns the n largest eigenvalues (n defined in the lanczos_tag)
+*/
+template<typename MatrixT, typename DenseMatrixT>
+std::vector< typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type >
+eig(MatrixT const & matrix, DenseMatrixT & eigenvectors_A, lanczos_tag const & tag, bool compute_eigenvectors = true)
+{
+ typedef typename viennacl::result_of::value_type<MatrixT>::type NumericType;
+ typedef typename viennacl::result_of::cpu_value_type<NumericType>::type CPU_NumericType;
+ typedef typename viennacl::result_of::vector_for_matrix<MatrixT>::type VectorT;
+
+ viennacl::tools::uniform_random_numbers<CPU_NumericType> random_gen;
+
+ std::vector<CPU_NumericType> eigenvalues;
+ vcl_size_t matrix_size = matrix.size1();
+ VectorT r(matrix_size);
+ std::vector<CPU_NumericType> s(matrix_size);
+
+ for (vcl_size_t i=0; i<s.size(); ++i)
+ s[i] = CPU_NumericType(0.5) + random_gen();
+
+ detail::copy_vec_to_vec(s,r);
+
+ vcl_size_t size_krylov = (matrix_size < tag.krylov_size()) ? matrix_size
+ : tag.krylov_size();
+
+ switch (tag.method())
+ {
+ case lanczos_tag::partial_reorthogonalization:
+ eigenvalues = detail::lanczosPRO(matrix, r, eigenvectors_A, size_krylov, tag, compute_eigenvectors);
+ break;
+ case lanczos_tag::full_reorthogonalization:
+ case lanczos_tag::no_reorthogonalization:
+ eigenvalues = detail::lanczos(matrix, r, eigenvectors_A, size_krylov, tag, compute_eigenvectors);
+ break;
+ }
+
+ std::vector<CPU_NumericType> largest_eigenvalues;
+
+ for (vcl_size_t i = 1; i<=tag.num_eigenvalues(); i++)
+ largest_eigenvalues.push_back(eigenvalues[size_krylov-i]);
+
+
+ return largest_eigenvalues;
+}
+
+
+/**
+* @brief Implementation of the calculation of eigenvalues using lanczos (with and without reorthogonalization).
+*
+* Implementation of Lanczos with partial reorthogonalization is implemented separately.
+*
+* @param matrix The system matrix
+* @param tag Tag with several options for the lanczos algorithm
+* @return Returns the n largest eigenvalues (n defined in the lanczos_tag)
+*/
+template<typename MatrixT>
+std::vector< typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type >
+eig(MatrixT const & matrix, lanczos_tag const & tag)
+{
+ typedef typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type NumericType;
+
+ viennacl::matrix<NumericType> eigenvectors(matrix.size1(), tag.num_eigenvalues());
+ return eig(matrix, eigenvectors, tag, false);
+}
+
+} // end namespace linalg
+} // end namespace viennacl
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/lu.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/lu.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/lu.hpp
new file mode 100644
index 0000000..0bdd037
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/lu.hpp
@@ -0,0 +1,227 @@
+#ifndef VIENNACL_LINALG_LU_HPP
+#define VIENNACL_LINALG_LU_HPP
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/lu.hpp
+ @brief Implementations of LU factorization for row-major and column-major dense matrices.
+*/
+
+#include <algorithm> //for std::min
+
+#include "viennacl/matrix.hpp"
+#include "viennacl/matrix_proxy.hpp"
+
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/direct_solve.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
/** @brief LU factorization of a row-major dense matrix.
*
* Blocked factorization without pivoting: each panel of up to 32 rows is copied to a
* host buffer, factorized there (kij ordering), written back, and the trailing
* submatrix is updated on the device via a triangular solve and a rank-k product.
*
* @param A The system matrix, where the LU matrices are directly written to. The implicit unit diagonal of L is not written.
*/
template<typename NumericT>
void lu_factorize(matrix<NumericT, viennacl::row_major> & A)
{
  typedef matrix<NumericT, viennacl::row_major>  MatrixType;

  vcl_size_t max_block_size = 32;
  vcl_size_t num_blocks = (A.size2() - 1) / max_block_size + 1;
  // Host buffer large enough for one full panel (max_block_size rows of internal_size2() entries):
  std::vector<NumericT> temp_buffer(A.internal_size2() * max_block_size);

  // Iterate over panels
  for (vcl_size_t panel_id = 0; panel_id < num_blocks; ++panel_id)
  {
    vcl_size_t row_start = panel_id * max_block_size;
    vcl_size_t current_block_size = std::min<vcl_size_t>(A.size1() - row_start, max_block_size);

    viennacl::range block_range(row_start, row_start + current_block_size);
    viennacl::range remainder_range(row_start + current_block_size, A.size1());

    //
    // Perform LU factorization on panel:
    //


    // Read from matrix to buffer (buffer row i holds matrix row row_start + i;
    // element (row_start + i, j) therefore sits at temp_buffer[i * internal_size2() + j]):
    viennacl::backend::memory_read(A.handle(),
                                   sizeof(NumericT) * row_start        * A.internal_size2(),
                                   sizeof(NumericT) * current_block_size * A.internal_size2(),
                                   &(temp_buffer[0]));

    // Factorize (kij-version); 'k' and 'i' are panel-local row indices, columns are global.
    // Note: 'row_start + i * internal_size2() + k' addresses column (row_start + k) of
    // buffer row i, i.e. the diagonal block of the panel.
    for (vcl_size_t k=0; k < current_block_size - 1; ++k)
    {
      for (vcl_size_t i=k+1; i < current_block_size; ++i)
      {
        temp_buffer[row_start + i * A.internal_size2() + k] /= temp_buffer[row_start + k * A.internal_size2() + k]; // write l_ik

        NumericT l_ik = temp_buffer[row_start + i * A.internal_size2() + k];

        for (vcl_size_t j = row_start + k + 1; j < A.size1(); ++j)
          temp_buffer[i * A.internal_size2() + j] -= l_ik * temp_buffer[k * A.internal_size2() + j]; // l_ik * a_kj
      }
    }

    // Write back:
    viennacl::backend::memory_write(A.handle(),
                                    sizeof(NumericT) * row_start        * A.internal_size2(),
                                    sizeof(NumericT) * current_block_size * A.internal_size2(),
                                    &(temp_buffer[0]));

    if (remainder_range.size() > 0)
    {
      //
      // Compute L_12 = [ (U_11)^{T}^{-1} A_{21}^T ]^T
      //
      viennacl::matrix_range<MatrixType> U_11(A, block_range, block_range);
      viennacl::matrix_range<MatrixType> A_21(A, remainder_range, block_range);
      viennacl::linalg::inplace_solve(trans(U_11), trans(A_21), viennacl::linalg::lower_tag());

      //
      // Update remainder of A: A_22 <- A_22 - L_21 * U_12
      //
      viennacl::matrix_range<MatrixType> L_21(A, remainder_range, block_range);
      viennacl::matrix_range<MatrixType> U_12(A, block_range, remainder_range);
      viennacl::matrix_range<MatrixType> A_22(A, remainder_range, remainder_range);

      A_22 -= viennacl::linalg::prod(L_21, U_12);
    }
  }

}
+
+
/** @brief LU factorization of a column-major dense matrix.
*
* Blocked factorization without pivoting: each panel of up to 32 columns is copied to a
* host buffer, factorized there (kji ordering), written back, and the trailing
* submatrix is updated on the device via a triangular solve and a rank-k product.
*
* @param A The system matrix, where the LU matrices are directly written to. The implicit unit diagonal of L is not written.
*/
template<typename NumericT>
void lu_factorize(matrix<NumericT, viennacl::column_major> & A)
{
  typedef matrix<NumericT, viennacl::column_major>  MatrixType;

  vcl_size_t max_block_size = 32;
  vcl_size_t num_blocks = (A.size1() - 1) / max_block_size + 1;
  // Host buffer large enough for one full panel (max_block_size columns of internal_size1() entries):
  std::vector<NumericT> temp_buffer(A.internal_size1() * max_block_size);

  // Iterate over panels
  for (vcl_size_t panel_id = 0; panel_id < num_blocks; ++panel_id)
  {
    vcl_size_t col_start = panel_id * max_block_size;
    vcl_size_t current_block_size = std::min<vcl_size_t>(A.size1() - col_start, max_block_size);

    viennacl::range block_range(col_start, col_start + current_block_size);
    viennacl::range remainder_range(col_start + current_block_size, A.size1());

    //
    // Perform LU factorization on panel:
    //


    // Read from matrix to buffer (buffer column j holds matrix column col_start + j;
    // element (i, col_start + j) therefore sits at temp_buffer[i + j * internal_size1()]):
    viennacl::backend::memory_read(A.handle(),
                                   sizeof(NumericT) * col_start        * A.internal_size1(),
                                   sizeof(NumericT) * current_block_size * A.internal_size1(),
                                   &(temp_buffer[0]));

    // Factorize (kji-version); 'k' and 'j' are panel-local column indices, rows are global.
    for (vcl_size_t k=0; k < current_block_size; ++k)
    {
      NumericT a_kk = temp_buffer[col_start + k + k * A.internal_size1()];
      for (vcl_size_t i=col_start+k+1; i < A.size1(); ++i)
        temp_buffer[i + k * A.internal_size1()] /= a_kk; // write l_ik

      for (vcl_size_t j=k+1; j < current_block_size; ++j)
      {
        NumericT a_kj = temp_buffer[col_start + k + j * A.internal_size1()];
        for (vcl_size_t i=col_start+k+1; i < A.size1(); ++i)
          temp_buffer[i + j * A.internal_size1()] -= temp_buffer[i + k * A.internal_size1()] * a_kj; // l_ik * a_kj
      }
    }

    // Write back:
    viennacl::backend::memory_write(A.handle(),
                                    sizeof(NumericT) * col_start        * A.internal_size1(),
                                    sizeof(NumericT) * current_block_size * A.internal_size1(),
                                    &(temp_buffer[0]));

    if (remainder_range.size() > 0)
    {
      //
      // Compute U_12:
      //
      viennacl::matrix_range<MatrixType> L_11(A, block_range, block_range);
      viennacl::matrix_range<MatrixType> A_12(A, block_range, remainder_range);
      viennacl::linalg::inplace_solve(L_11, A_12, viennacl::linalg::unit_lower_tag());

      //
      // Update remainder of A: A_22 <- A_22 - L_21 * U_12
      //
      viennacl::matrix_range<MatrixType> L_21(A, remainder_range, block_range);
      viennacl::matrix_range<MatrixType> U_12(A, block_range, remainder_range);
      viennacl::matrix_range<MatrixType> A_22(A, remainder_range, remainder_range);

      A_22 -= viennacl::linalg::prod(L_21, U_12);
    }

  }

}
+
+
+//
+// Convenience layer:
+//
+
+/** @brief LU substitution for the system LU = rhs.
+*
+* @param A The system matrix, where the LU matrices are directly written to. The implicit unit diagonal of L is not written.
+* @param B The matrix of load vectors, where the solution is directly written to
+*/
+template<typename NumericT, typename F1, typename F2, unsigned int AlignmentV1, unsigned int AlignmentV2>
+void lu_substitute(matrix<NumericT, F1, AlignmentV1> const & A,
+ matrix<NumericT, F2, AlignmentV2> & B)
+{
+ assert(A.size1() == A.size2() && bool("Matrix must be square"));
+ assert(A.size1() == B.size1() && bool("Matrix must be square"));
+ inplace_solve(A, B, unit_lower_tag());
+ inplace_solve(A, B, upper_tag());
+}
+
/** @brief LU substitution for the system LU x = rhs.
*
* @param A   The factorized system matrix as produced by lu_factorize() (unit-lower L and upper U stored in one matrix; the implicit unit diagonal of L is not stored).
* @param vec The load vector, where the solution is directly written to.
*/
template<typename NumericT, typename F, unsigned int MatAlignmentV, unsigned int VecAlignmentV>
void lu_substitute(matrix<NumericT, F, MatAlignmentV> const & A,
                   vector<NumericT, VecAlignmentV> & vec)
{
  assert(A.size1() == A.size2() && bool("Matrix must be square"));
  // Forward substitution with the implicit unit diagonal of L, then backward substitution with U:
  inplace_solve(A, vec, unit_lower_tag());
  inplace_solve(A, vec, upper_tag());
}
+
+}
+}
+
+#endif