You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ap...@apache.org on 2016/06/08 21:40:23 UTC

[26/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/common.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/common.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/common.hpp
new file mode 100644
index 0000000..93b0cba
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/common.hpp
@@ -0,0 +1,263 @@
+#ifndef VIENNACL_LINALG_DETAIL_ILU_COMMON_HPP_
+#define VIENNACL_LINALG_DETAIL_ILU_COMMON_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/ilu/common.hpp
+    @brief Common routines used within ILU-type preconditioners
+*/
+
+#include <vector>
+#include <cmath>
+#include <iostream>
+#include <map>
+#include <list>
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/backend/memory.hpp"
+
+#include "viennacl/linalg/host_based/common.hpp"
+#include "viennacl/linalg/misc_operations.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+
+
+//
+// Level Scheduling Setup for ILU:
+//
+
+template<typename NumericT, unsigned int AlignmentV>
+void level_scheduling_setup_impl(viennacl::compressed_matrix<NumericT, AlignmentV> const & LU,
+                                 viennacl::vector<NumericT> const & diagonal_LU,
+                                 std::list<viennacl::backend::mem_handle> & row_index_arrays,
+                                 std::list<viennacl::backend::mem_handle> & row_buffers,
+                                 std::list<viennacl::backend::mem_handle> & col_buffers,
+                                 std::list<viennacl::backend::mem_handle> & element_buffers,
+                                 std::list<vcl_size_t> & row_elimination_num_list,
+                                 bool setup_U)
+{
+  NumericT     const * diagonal_buf = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(diagonal_LU.handle());
+  NumericT     const * elements     = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(LU.handle());
+  unsigned int const * row_buffer   = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LU.handle1());
+  unsigned int const * col_buffer   = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LU.handle2());
+
+  //
+  // Step 1: Determine row elimination order for each row and build up meta information about the number of entries taking part in each elimination step:
+  //
+  std::vector<vcl_size_t> row_elimination(LU.size1());
+  std::map<vcl_size_t, std::map<vcl_size_t, vcl_size_t> > row_entries_per_elimination_step;
+
+  vcl_size_t max_elimination_runs = 0;
+  for (vcl_size_t row2 = 0; row2 < LU.size1(); ++row2)
+  {
+    vcl_size_t row = setup_U ? (LU.size1() - row2) - 1 : row2;
+
+    vcl_size_t row_begin = row_buffer[row];
+    vcl_size_t row_end   = row_buffer[row+1];
+    vcl_size_t elimination_index = 0;  //Note: first run corresponds to elimination_index = 1 (otherwise, type issues with int <-> unsigned int would arise
+    for (vcl_size_t i = row_begin; i < row_end; ++i)
+    {
+      unsigned int col = col_buffer[i];
+      if ( (!setup_U && col < row) || (setup_U && col > row) )
+      {
+        elimination_index = std::max<vcl_size_t>(elimination_index, row_elimination[col]);
+        row_entries_per_elimination_step[row_elimination[col]][row] += 1;
+      }
+    }
+    row_elimination[row] = elimination_index + 1;
+    max_elimination_runs = std::max<vcl_size_t>(max_elimination_runs, elimination_index + 1);
+  }
+
+  //std::cout << "Number of elimination runs: " << max_elimination_runs << std::endl;
+
+  //
+  // Step 2: Build row-major elimination matrix for each elimination step
+  //
+
+  //std::cout << "Elimination order: " << std::endl;
+  //for (vcl_size_t i=0; i<row_elimination.size(); ++i)
+  //  std::cout << row_elimination[i] << ", ";
+  //std::cout << std::endl;
+
+  //vcl_size_t summed_rows = 0;
+  for (vcl_size_t elimination_run = 1; elimination_run <= max_elimination_runs; ++elimination_run)
+  {
+    std::map<vcl_size_t, vcl_size_t> const & current_elimination_info = row_entries_per_elimination_step[elimination_run];
+
+    // count cols and entries handled in this elimination step
+    vcl_size_t num_tainted_cols = current_elimination_info.size();
+    vcl_size_t num_entries = 0;
+
+    for (std::map<vcl_size_t, vcl_size_t>::const_iterator it  = current_elimination_info.begin();
+                                                          it != current_elimination_info.end();
+                                                        ++it)
+      num_entries += it->second;
+
+    //std::cout << "num_entries: " << num_entries << std::endl;
+    //std::cout << "num_tainted_cols: " << num_tainted_cols << std::endl;
+
+    if (num_tainted_cols > 0)
+    {
+      row_index_arrays.push_back(viennacl::backend::mem_handle());
+      viennacl::backend::switch_memory_context<unsigned int>(row_index_arrays.back(), viennacl::traits::context(LU));
+      viennacl::backend::typesafe_host_array<unsigned int> elim_row_index_array(row_index_arrays.back(), num_tainted_cols);
+
+      row_buffers.push_back(viennacl::backend::mem_handle());
+      viennacl::backend::switch_memory_context<unsigned int>(row_buffers.back(), viennacl::traits::context(LU));
+      viennacl::backend::typesafe_host_array<unsigned int> elim_row_buffer(row_buffers.back(), num_tainted_cols + 1);
+
+      col_buffers.push_back(viennacl::backend::mem_handle());
+      viennacl::backend::switch_memory_context<unsigned int>(col_buffers.back(), viennacl::traits::context(LU));
+      viennacl::backend::typesafe_host_array<unsigned int> elim_col_buffer(col_buffers.back(), num_entries);
+
+      element_buffers.push_back(viennacl::backend::mem_handle());
+      viennacl::backend::switch_memory_context<NumericT>(element_buffers.back(), viennacl::traits::context(LU));
+      std::vector<NumericT> elim_elements_buffer(num_entries);
+
+      row_elimination_num_list.push_back(num_tainted_cols);
+
+      vcl_size_t k=0;
+      vcl_size_t nnz_index = 0;
+      elim_row_buffer.set(0, 0);
+
+      for (std::map<vcl_size_t, vcl_size_t>::const_iterator it  = current_elimination_info.begin();
+                                                              it != current_elimination_info.end();
+                                                            ++it)
+      {
+        //vcl_size_t col = setup_U ? (elimination_matrix.size() - it->first) - 1 : col2;
+        vcl_size_t row = it->first;
+        elim_row_index_array.set(k, row);
+
+        vcl_size_t row_begin = row_buffer[row];
+        vcl_size_t row_end   = row_buffer[row+1];
+        for (vcl_size_t i = row_begin; i < row_end; ++i)
+        {
+          unsigned int col = col_buffer[i];
+          if ( (!setup_U && col < row) || (setup_U && col > row) ) //entry of L/U
+          {
+            if (row_elimination[col] == elimination_run) // this entry is substituted in this run
+            {
+              elim_col_buffer.set(nnz_index, col);
+              elim_elements_buffer[nnz_index] = setup_U ? elements[i] / diagonal_buf[it->first] : elements[i];
+              ++nnz_index;
+            }
+          }
+        }
+
+        elim_row_buffer.set(++k, nnz_index);
+      }
+
+      //
+      // Wrap in memory_handles:
+      //
+      viennacl::backend::memory_create(row_index_arrays.back(), elim_row_index_array.raw_size(),                viennacl::traits::context(row_index_arrays.back()), elim_row_index_array.get());
+      viennacl::backend::memory_create(row_buffers.back(),      elim_row_buffer.raw_size(),                     viennacl::traits::context(row_buffers.back()),      elim_row_buffer.get());
+      viennacl::backend::memory_create(col_buffers.back(),      elim_col_buffer.raw_size(),                     viennacl::traits::context(col_buffers.back()),      elim_col_buffer.get());
+      viennacl::backend::memory_create(element_buffers.back(),  sizeof(NumericT) * elim_elements_buffer.size(), viennacl::traits::context(element_buffers.back()),  &(elim_elements_buffer[0]));
+    }
+
+    // Print some info:
+    //std::cout << "Eliminated columns in run " << elimination_run << ": " << num_tainted_cols << " (tainted columns: " << num_tainted_cols << ")" << std::endl;
+    //summed_rows += eliminated_rows_in_run;
+    //if (eliminated_rows_in_run == 0)
+    //  break;
+  }
+  //std::cout << "Eliminated rows: " << summed_rows << " out of " << row_elimination.size() << std::endl;
+}
+
+
+template<typename NumericT, unsigned int AlignmentV>
+void level_scheduling_setup_L(viennacl::compressed_matrix<NumericT, AlignmentV> const & LU,
+                              viennacl::vector<NumericT> const & diagonal_LU,
+                              std::list<viennacl::backend::mem_handle> & row_index_arrays,
+                              std::list<viennacl::backend::mem_handle> & row_buffers,
+                              std::list<viennacl::backend::mem_handle> & col_buffers,
+                              std::list<viennacl::backend::mem_handle> & element_buffers,
+                              std::list<vcl_size_t> & row_elimination_num_list)
+{
+  level_scheduling_setup_impl(LU, diagonal_LU, row_index_arrays, row_buffers, col_buffers, element_buffers, row_elimination_num_list, false);
+}
+
+
+//
+// Multifrontal setup of U:
+//
+
+template<typename NumericT, unsigned int AlignmentV>
+void level_scheduling_setup_U(viennacl::compressed_matrix<NumericT, AlignmentV> const & LU,
+                              viennacl::vector<NumericT> const & diagonal_LU,
+                              std::list<viennacl::backend::mem_handle> & row_index_arrays,
+                              std::list<viennacl::backend::mem_handle> & row_buffers,
+                              std::list<viennacl::backend::mem_handle> & col_buffers,
+                              std::list<viennacl::backend::mem_handle> & element_buffers,
+                              std::list<vcl_size_t> & row_elimination_num_list)
+{
+  level_scheduling_setup_impl(LU, diagonal_LU, row_index_arrays, row_buffers, col_buffers, element_buffers, row_elimination_num_list, true);
+}
+
+
+//
+// Multifrontal substitution (both L and U). Will partly be moved to single_threaded/opencl/cuda implementations
+//
+template<typename NumericT>
+void level_scheduling_substitute(viennacl::vector<NumericT> & vec,
+                                 std::list<viennacl::backend::mem_handle> const & row_index_arrays,
+                                 std::list<viennacl::backend::mem_handle> const & row_buffers,
+                                 std::list<viennacl::backend::mem_handle> const & col_buffers,
+                                 std::list<viennacl::backend::mem_handle> const & element_buffers,
+                                 std::list<vcl_size_t> const & row_elimination_num_list)
+{
+  typedef typename std::list< viennacl::backend::mem_handle >::const_iterator  ListIterator;
+  ListIterator row_index_array_it = row_index_arrays.begin();
+  ListIterator row_buffers_it = row_buffers.begin();
+  ListIterator col_buffers_it = col_buffers.begin();
+  ListIterator element_buffers_it = element_buffers.begin();
+  typename std::list< vcl_size_t>::const_iterator row_elimination_num_it = row_elimination_num_list.begin();
+  for (vcl_size_t i=0; i<row_index_arrays.size(); ++i)
+  {
+    viennacl::linalg::detail::level_scheduling_substitute(vec, *row_index_array_it, *row_buffers_it, *col_buffers_it, *element_buffers_it, *row_elimination_num_it);
+
+    ++row_index_array_it;
+    ++row_buffers_it;
+    ++col_buffers_it;
+    ++element_buffers_it;
+    ++row_elimination_num_it;
+  }
+}
+
+
+
+
+
+} // namespace detail
+} // namespace linalg
+} // namespace viennacl
+
+
+
+
+#endif
+
+
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/ilu0.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/ilu0.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/ilu0.hpp
new file mode 100644
index 0000000..1c3191a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/ilu0.hpp
@@ -0,0 +1,379 @@
+#ifndef VIENNACL_LINALG_DETAIL_ILU0_HPP_
+#define VIENNACL_LINALG_DETAIL_ILU0_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/ilu/ilu0.hpp
+  @brief Implementations of incomplete factorization preconditioners with static nonzero pattern.
+
+  Contributed by Evan Bollig.
+
+  ILU0 (Incomplete LU with zero fill-in)
+  - All preconditioner nonzeros exist at locations that were nonzero in the input matrix.
+  - The number of nonzeros in the output preconditioner are exactly the same number as the input matrix
+
+ Evan Bollig 3/30/12
+
+ Adapted from viennacl/linalg/detail/ilut.hpp
+
+ Low-level reimplementation by Karl Rupp in Nov 2012, increasing performance substantially. Also added level-scheduling.
+
+*/
+
+#include <vector>
+#include <cmath>
+#include <iostream>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/detail/ilu/common.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/backend/memory.hpp"
+
+#include "viennacl/linalg/host_based/common.hpp"
+
+#include <map>
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief A tag for incomplete LU factorization with static pattern (ILU0)
+*/
+class ilu0_tag
+{
+public:
+  ilu0_tag(bool with_level_scheduling = false) : use_level_scheduling_(with_level_scheduling) {}
+
+  bool use_level_scheduling() const { return use_level_scheduling_; }
+  void use_level_scheduling(bool b) { use_level_scheduling_ = b; }
+
+private:
+  bool use_level_scheduling_;
+};
+
+
+/** @brief Implementation of a ILU-preconditioner with static pattern. Optimized version for CSR matrices.
+  *
+  * refer to the Algorithm in Saad's book (1996 edition)
+  *
+  *  @param A       The sparse matrix matrix. The result is directly written to A.
+  */
+template<typename NumericT>
+void precondition(viennacl::compressed_matrix<NumericT> & A, ilu0_tag const & /* tag */)
+{
+  assert( (A.handle1().get_active_handle_id() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILU0") );
+  assert( (A.handle2().get_active_handle_id() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILU0") );
+  assert( (A.handle().get_active_handle_id()  == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ILU0") );
+
+  NumericT           * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
+  unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+  unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+
+  // Note: Line numbers in the following refer to the algorithm in Saad's book
+
+  for (vcl_size_t i=1; i<A.size1(); ++i)  // Line 1
+  {
+    unsigned int row_i_begin = row_buffer[i];
+    unsigned int row_i_end   = row_buffer[i+1];
+    for (unsigned int buf_index_k = row_i_begin; buf_index_k < row_i_end; ++buf_index_k) //Note: We do not assume that the column indices within a row are sorted
+    {
+      unsigned int k = col_buffer[buf_index_k];
+      if (k >= i)
+        continue; //Note: We do not assume that the column indices within a row are sorted
+
+      unsigned int row_k_begin = row_buffer[k];
+      unsigned int row_k_end   = row_buffer[k+1];
+
+      // get a_kk:
+      NumericT a_kk = 0;
+      for (unsigned int buf_index_akk = row_k_begin; buf_index_akk < row_k_end; ++buf_index_akk)
+      {
+        if (col_buffer[buf_index_akk] == k)
+        {
+          a_kk = elements[buf_index_akk];
+          break;
+        }
+      }
+
+      NumericT & a_ik = elements[buf_index_k];
+      a_ik /= a_kk;                                 //Line 3
+
+      for (unsigned int buf_index_j = row_i_begin; buf_index_j < row_i_end; ++buf_index_j) //Note: We do not assume that the column indices within a row are sorted
+      {
+        unsigned int j = col_buffer[buf_index_j];
+        if (j <= k)
+          continue;
+
+        // determine a_kj:
+        NumericT a_kj = 0;
+        for (unsigned int buf_index_akj = row_k_begin; buf_index_akj < row_k_end; ++buf_index_akj)
+        {
+          if (col_buffer[buf_index_akj] == j)
+          {
+            a_kj = elements[buf_index_akj];
+            break;
+          }
+        }
+
+        //a_ij -= a_ik * a_kj
+        elements[buf_index_j] -= a_ik * a_kj;  //Line 5
+      }
+    }
+  }
+
+}
+
+
+/** @brief ILU0 preconditioner class, can be supplied to solve()-routines
+*/
+template<typename MatrixT>
+class ilu0_precond
+{
+  typedef typename MatrixT::value_type      NumericType;
+
+public:
+  ilu0_precond(MatrixT const & mat, ilu0_tag const & tag) : tag_(tag), LU_()
+  {
+    //initialize preconditioner:
+    //std::cout << "Start CPU precond" << std::endl;
+    init(mat);
+    //std::cout << "End CPU precond" << std::endl;
+  }
+
+  template<typename VectorT>
+  void apply(VectorT & vec) const
+  {
+    unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LU_.handle1());
+    unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LU_.handle2());
+    NumericType  const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericType>(LU_.handle());
+
+    viennacl::linalg::host_based::detail::csr_inplace_solve<NumericType>(row_buffer, col_buffer, elements, vec, LU_.size2(), unit_lower_tag());
+    viennacl::linalg::host_based::detail::csr_inplace_solve<NumericType>(row_buffer, col_buffer, elements, vec, LU_.size2(), upper_tag());
+  }
+
+private:
+  void init(MatrixT const & mat)
+  {
+    viennacl::context host_context(viennacl::MAIN_MEMORY);
+    viennacl::switch_memory_context(LU_, host_context);
+
+    viennacl::copy(mat, LU_);
+    viennacl::linalg::precondition(LU_, tag_);
+  }
+
+  ilu0_tag                                   tag_;
+  viennacl::compressed_matrix<NumericType>   LU_;
+};
+
+
+/** @brief ILU0 preconditioner class, can be supplied to solve()-routines.
+*
+*  Specialization for compressed_matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+class ilu0_precond< viennacl::compressed_matrix<NumericT, AlignmentV> >
+{
+  typedef viennacl::compressed_matrix<NumericT, AlignmentV>   MatrixType;
+
+public:
+  ilu0_precond(MatrixType const & mat, ilu0_tag const & tag)
+    : tag_(tag),
+      LU_(mat.size1(), mat.size2(), viennacl::traits::context(mat))
+  {
+    //initialize preconditioner:
+    //std::cout << "Start GPU precond" << std::endl;
+    init(mat);
+    //std::cout << "End GPU precond" << std::endl;
+  }
+
+  void apply(viennacl::vector<NumericT> & vec) const
+  {
+    viennacl::context host_context(viennacl::MAIN_MEMORY);
+    if (vec.handle().get_active_handle_id() != viennacl::MAIN_MEMORY)
+    {
+      if (tag_.use_level_scheduling())
+      {
+        //std::cout << "Using multifrontal on GPU..." << std::endl;
+        detail::level_scheduling_substitute(vec,
+                                            multifrontal_L_row_index_arrays_,
+                                            multifrontal_L_row_buffers_,
+                                            multifrontal_L_col_buffers_,
+                                            multifrontal_L_element_buffers_,
+                                            multifrontal_L_row_elimination_num_list_);
+
+        vec = viennacl::linalg::element_div(vec, multifrontal_U_diagonal_);
+
+        detail::level_scheduling_substitute(vec,
+                                            multifrontal_U_row_index_arrays_,
+                                            multifrontal_U_row_buffers_,
+                                            multifrontal_U_col_buffers_,
+                                            multifrontal_U_element_buffers_,
+                                            multifrontal_U_row_elimination_num_list_);
+      }
+      else
+      {
+        viennacl::context old_context = viennacl::traits::context(vec);
+        viennacl::switch_memory_context(vec, host_context);
+        viennacl::linalg::inplace_solve(LU_, vec, unit_lower_tag());
+        viennacl::linalg::inplace_solve(LU_, vec, upper_tag());
+        viennacl::switch_memory_context(vec, old_context);
+      }
+    }
+    else //apply ILU0 directly on CPU
+    {
+      if (tag_.use_level_scheduling())
+      {
+        //std::cout << "Using multifrontal..." << std::endl;
+        detail::level_scheduling_substitute(vec,
+                                            multifrontal_L_row_index_arrays_,
+                                            multifrontal_L_row_buffers_,
+                                            multifrontal_L_col_buffers_,
+                                            multifrontal_L_element_buffers_,
+                                            multifrontal_L_row_elimination_num_list_);
+
+        vec = viennacl::linalg::element_div(vec, multifrontal_U_diagonal_);
+
+        detail::level_scheduling_substitute(vec,
+                                            multifrontal_U_row_index_arrays_,
+                                            multifrontal_U_row_buffers_,
+                                            multifrontal_U_col_buffers_,
+                                            multifrontal_U_element_buffers_,
+                                            multifrontal_U_row_elimination_num_list_);
+      }
+      else
+      {
+        viennacl::linalg::inplace_solve(LU_, vec, unit_lower_tag());
+        viennacl::linalg::inplace_solve(LU_, vec, upper_tag());
+      }
+    }
+  }
+
+  vcl_size_t levels() const { return multifrontal_L_row_index_arrays_.size(); }
+
+private:
+  void init(MatrixType const & mat)
+  {
+    viennacl::context host_context(viennacl::MAIN_MEMORY);
+    viennacl::switch_memory_context(LU_, host_context);
+    LU_ = mat;
+    viennacl::linalg::precondition(LU_, tag_);
+
+    if (!tag_.use_level_scheduling())
+      return;
+
+    // multifrontal part:
+    viennacl::switch_memory_context(multifrontal_U_diagonal_, host_context);
+    multifrontal_U_diagonal_.resize(LU_.size1(), false);
+    host_based::detail::row_info(LU_, multifrontal_U_diagonal_, viennacl::linalg::detail::SPARSE_ROW_DIAGONAL);
+
+    detail::level_scheduling_setup_L(LU_,
+                                     multifrontal_U_diagonal_, //dummy
+                                     multifrontal_L_row_index_arrays_,
+                                     multifrontal_L_row_buffers_,
+                                     multifrontal_L_col_buffers_,
+                                     multifrontal_L_element_buffers_,
+                                     multifrontal_L_row_elimination_num_list_);
+
+
+    detail::level_scheduling_setup_U(LU_,
+                                     multifrontal_U_diagonal_,
+                                     multifrontal_U_row_index_arrays_,
+                                     multifrontal_U_row_buffers_,
+                                     multifrontal_U_col_buffers_,
+                                     multifrontal_U_element_buffers_,
+                                     multifrontal_U_row_elimination_num_list_);
+
+    //
+    // Bring to device if necessary:
+    //
+
+    // L:
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_row_index_arrays_.begin();
+                                                                       it != multifrontal_L_row_index_arrays_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_row_buffers_.begin();
+                                                                       it != multifrontal_L_row_buffers_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_col_buffers_.begin();
+                                                                       it != multifrontal_L_col_buffers_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_element_buffers_.begin();
+                                                                       it != multifrontal_L_element_buffers_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<NumericT>(*it, viennacl::traits::context(mat));
+
+
+    // U:
+
+    viennacl::switch_memory_context(multifrontal_U_diagonal_, viennacl::traits::context(mat));
+
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_row_index_arrays_.begin();
+                                                                       it != multifrontal_U_row_index_arrays_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_row_buffers_.begin();
+                                                                       it != multifrontal_U_row_buffers_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_col_buffers_.begin();
+                                                                       it != multifrontal_U_col_buffers_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_element_buffers_.begin();
+                                                                       it != multifrontal_U_element_buffers_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<NumericT>(*it, viennacl::traits::context(mat));
+
+  }
+
+  ilu0_tag tag_;
+  viennacl::compressed_matrix<NumericT> LU_;
+
+  std::list<viennacl::backend::mem_handle> multifrontal_L_row_index_arrays_;
+  std::list<viennacl::backend::mem_handle> multifrontal_L_row_buffers_;
+  std::list<viennacl::backend::mem_handle> multifrontal_L_col_buffers_;
+  std::list<viennacl::backend::mem_handle> multifrontal_L_element_buffers_;
+  std::list<vcl_size_t>                    multifrontal_L_row_elimination_num_list_;
+
+  viennacl::vector<NumericT> multifrontal_U_diagonal_;
+  std::list<viennacl::backend::mem_handle> multifrontal_U_row_index_arrays_;
+  std::list<viennacl::backend::mem_handle> multifrontal_U_row_buffers_;
+  std::list<viennacl::backend::mem_handle> multifrontal_U_col_buffers_;
+  std::list<viennacl::backend::mem_handle> multifrontal_U_element_buffers_;
+  std::list<vcl_size_t>                    multifrontal_U_row_elimination_num_list_;
+
+};
+
+} // namespace linalg
+} // namespace viennacl
+
+
+#endif
+
+
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/ilut.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/ilut.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/ilut.hpp
new file mode 100644
index 0000000..11ab842
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/ilu/ilut.hpp
@@ -0,0 +1,597 @@
+#ifndef VIENNACL_LINALG_DETAIL_ILUT_HPP_
+#define VIENNACL_LINALG_DETAIL_ILUT_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/ilu/ilut.hpp
+    @brief Implementations of an incomplete factorization preconditioner with threshold (ILUT)
+*/
+
+#include <vector>
+#include <cmath>
+#include <iostream>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+
+#include "viennacl/linalg/detail/ilu/common.hpp"
+#include "viennacl/compressed_matrix.hpp"
+
+#include "viennacl/linalg/host_based/common.hpp"
+
+#include <map>
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief A tag for incomplete LU factorization with threshold (ILUT)
+*/
+class ilut_tag
+{
+  public:
+    /** @brief The constructor.
+    *
+    * @param entries_per_row        Number of nonzero entries per row in L and U. Note that L and U are stored in a single matrix, thus there are 2*entries_per_row in total.
+    * @param drop_tolerance         The drop tolerance for ILUT
+    * @param with_level_scheduling  Flag for enabling level scheduling on GPUs.
+    */
+    ilut_tag(unsigned int entries_per_row = 20,
+             double       drop_tolerance = 1e-4,
+             bool         with_level_scheduling = false)
+      : entries_per_row_(entries_per_row),
+        drop_tolerance_(drop_tolerance),
+        use_level_scheduling_(with_level_scheduling) {}
+
+    void set_drop_tolerance(double tol)
+    {
+      if (tol > 0)
+        drop_tolerance_ = tol;
+    }
+    double get_drop_tolerance() const { return drop_tolerance_; }
+
+    void set_entries_per_row(unsigned int e)
+    {
+      if (e > 0)
+        entries_per_row_ = e;
+    }
+
+    unsigned int get_entries_per_row() const { return entries_per_row_; }
+
+    bool use_level_scheduling() const { return use_level_scheduling_; }
+    void use_level_scheduling(bool b) { use_level_scheduling_ = b; }
+
+  private:
+    unsigned int entries_per_row_;
+    double       drop_tolerance_;
+    bool         use_level_scheduling_;
+};
+
+
+namespace detail
+{
+  /** @brief Helper struct for holding a sparse vector in linear memory. For internal use only.
+    *
+    * Unfortunately, the 'naive' implementation using a std::map<> is almost always too slow.
+    *
+    */
+  template<typename NumericT>
+  struct ilut_sparse_vector
+  {
+    ilut_sparse_vector(vcl_size_t alloc_size = 0) : size_(0), col_indices_(alloc_size), elements_(alloc_size) {}
+
+    void resize_if_bigger(vcl_size_t s)
+    {
+      if (s > elements_.size())
+      {
+        col_indices_.resize(s);
+        elements_.resize(s);
+      }
+      size_ = s;
+    }
+
+    vcl_size_t size_;
+    std::vector<unsigned int> col_indices_;
+    std::vector<NumericT>     elements_;
+  };
+
+  /** @brief Subtracts a scaled sparse vector u from a sparse vector w and writes the output to z: z = w - alpha * u
+    *
+    * Sparsity pattern of u and w are usually different.
+    *
+    * @return Length of new vector
+    */
+  template<typename IndexT, typename NumericT>
+  IndexT merge_subtract_sparse_rows(IndexT const * w_coords, NumericT const * w_elements, IndexT w_size,
+                                    IndexT const * u_coords, NumericT const * u_elements, IndexT u_size, NumericT alpha,
+                                    IndexT       * z_coords, NumericT       * z_elements)
+  {
+    IndexT index_w = 0;
+    IndexT index_u = 0;
+    IndexT index_z = 0;
+
+    while (1)
+    {
+      if (index_w < w_size && index_u < u_size)
+      {
+        if (w_coords[index_w] < u_coords[index_u])
+        {
+          z_coords[index_z]     = w_coords[index_w];
+          z_elements[index_z++] = w_elements[index_w++];
+        }
+        else if (w_coords[index_w] == u_coords[index_u])
+        {
+          z_coords[index_z]     = w_coords[index_w];
+          z_elements[index_z++] = w_elements[index_w++] - alpha * u_elements[index_u++];
+        }
+        else
+        {
+          z_coords[index_z]     = u_coords[index_u];
+          z_elements[index_z++] = - alpha * u_elements[index_u++];
+        }
+      }
+      else if (index_w == w_size && index_u < u_size)
+      {
+        z_coords[index_z]     = u_coords[index_u];
+        z_elements[index_z++] = - alpha * u_elements[index_u++];
+      }
+      else if (index_w < w_size && index_u == u_size)
+      {
+        z_coords[index_z]     = w_coords[index_w];
+        z_elements[index_z++] = w_elements[index_w++];
+      }
+      else
+        return index_z;
+    }
+  }
+
+  template<typename SizeT, typename NumericT>
+  void insert_with_value_sort(std::vector<std::pair<SizeT, NumericT> > & map,
+                              SizeT index, NumericT value)
+  {
+    NumericT abs_value = std::fabs(value);
+    if (abs_value > 0)
+    {
+      // find first element with smaller absolute value:
+      std::size_t first_smaller_index = 0;
+      while (first_smaller_index < map.size() && std::fabs(map[first_smaller_index].second) > abs_value)
+        ++first_smaller_index;
+
+      std::pair<SizeT, NumericT> tmp(index, value);
+      for (std::size_t j=first_smaller_index; j<map.size(); ++j)
+        std::swap(map[j], tmp);
+    }
+  }
+
+}
+
+/** @brief Implementation of a ILU-preconditioner with threshold. Optimized implementation for compressed_matrix.
+*
+* refer to Algorithm 10.6 by Saad's book (1996 edition)
+*
+*  @param A       The input matrix. Either a compressed_matrix or of type std::vector< std::map<T, U> >
+*  @param L       The output matrix for L.
+*  @param U       The output matrix for U.
+*  @param tag     An ilut_tag in order to dispatch among several other preconditioners.
+*/
+template<typename NumericT>
+void precondition(viennacl::compressed_matrix<NumericT> const & A,
+                  viennacl::compressed_matrix<NumericT>       & L,
+                  viennacl::compressed_matrix<NumericT>       & U,
+                  ilut_tag const & tag)
+{
+  assert(A.size1() == L.size1() && bool("Output matrix size mismatch") );
+  assert(A.size1() == U.size1() && bool("Output matrix size mismatch") );
+
+  L.reserve( tag.get_entries_per_row()      * A.size1());
+  U.reserve((tag.get_entries_per_row() + 1) * A.size1());
+
+  vcl_size_t avg_nnz_per_row = static_cast<vcl_size_t>(A.nnz() / A.size1());
+  detail::ilut_sparse_vector<NumericT> w1(tag.get_entries_per_row() * (avg_nnz_per_row + 10));
+  detail::ilut_sparse_vector<NumericT> w2(tag.get_entries_per_row() * (avg_nnz_per_row + 10));
+  detail::ilut_sparse_vector<NumericT> * w_in  = &w1;
+  detail::ilut_sparse_vector<NumericT> * w_out = &w2;
+  std::vector<NumericT> diagonal_U(A.size1());
+
+  NumericT     const * elements_A   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
+  unsigned int const * row_buffer_A = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+  unsigned int const * col_buffer_A = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+
+  NumericT           * elements_L   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(L.handle());
+  unsigned int       * row_buffer_L = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L.handle1()); row_buffer_L[0] = 0;
+  unsigned int       * col_buffer_L = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L.handle2());
+
+  NumericT           * elements_U   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(U.handle());
+  unsigned int       * row_buffer_U = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U.handle1()); row_buffer_U[0] = 0;
+  unsigned int       * col_buffer_U = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U.handle2());
+
+  std::vector<std::pair<unsigned int, NumericT> > sorted_entries_L(tag.get_entries_per_row());
+  std::vector<std::pair<unsigned int, NumericT> > sorted_entries_U(tag.get_entries_per_row());
+
+  for (vcl_size_t i=0; i<viennacl::traits::size1(A); ++i)  // Line 1
+  {
+    std::fill(sorted_entries_L.begin(), sorted_entries_L.end(), std::pair<unsigned int, NumericT>(0, NumericT(0)));
+    std::fill(sorted_entries_U.begin(), sorted_entries_U.end(), std::pair<unsigned int, NumericT>(0, NumericT(0)));
+
+    //line 2: set up w
+    w_in->resize_if_bigger(row_buffer_A[i+1] - row_buffer_A[i]);
+    NumericT row_norm = 0;
+    unsigned int k = 0;
+    for (unsigned int j = row_buffer_A[i]; j < row_buffer_A[i+1]; ++j, ++k)
+    {
+      w_in->col_indices_[k] = col_buffer_A[j];
+      NumericT entry = elements_A[j];
+      w_in->elements_[k] = entry;
+      row_norm += entry * entry;
+    }
+    row_norm = std::sqrt(row_norm);
+    NumericT tau_i = static_cast<NumericT>(tag.get_drop_tolerance()) * row_norm;
+
+    //line 3: Iterate over lower diagonal parts of A:
+    k = 0;
+    unsigned int current_col = (row_buffer_A[i+1] > row_buffer_A[i]) ? w_in->col_indices_[k] : static_cast<unsigned int>(i); // mind empty rows here!
+    while (current_col < i)
+    {
+      //line 4:
+      NumericT a_kk = diagonal_U[current_col];
+
+      NumericT w_k_entry = w_in->elements_[k] / a_kk;
+      w_in->elements_[k] = w_k_entry;
+
+      //lines 5,6: (dropping rule to w_k)
+      if ( std::fabs(w_k_entry) > tau_i)
+      {
+        //line 7:
+        unsigned int row_U_begin = row_buffer_U[current_col];
+        unsigned int row_U_end   = row_buffer_U[current_col + 1];
+
+        if (row_U_end > row_U_begin)
+        {
+          w_out->resize_if_bigger(w_in->size_ + (row_U_end - row_U_begin) - 1);
+          w_out->size_ = detail::merge_subtract_sparse_rows(&(w_in->col_indices_[0]), &(w_in->elements_[0]), static_cast<unsigned int>(w_in->size_),
+                                                            col_buffer_U + row_U_begin + 1, elements_U + row_U_begin + 1, (row_U_end - row_U_begin) - 1, w_k_entry,
+                                                            &(w_out->col_indices_[0]), &(w_out->elements_[0])
+                                                           );
+          ++k;
+        }
+      }
+      else // drop element
+      {
+        w_out->resize_if_bigger(w_in->size_ - 1);
+        for (unsigned int r = 0; r < k; ++r)
+        {
+          w_out->col_indices_[r] = w_in->col_indices_[r];
+          w_out->elements_[r]    = w_in->elements_[r];
+        }
+        for (unsigned int r = k+1; r < w_in->size_; ++r)
+        {
+          w_out->col_indices_[r-1] = w_in->col_indices_[r];
+          w_out->elements_[r-1]    = w_in->elements_[r];
+        }
+
+        // Note: No increment to k here, element was dropped!
+      }
+
+      // swap pointers to w1 and w2
+      std::swap(w_in, w_out);
+
+      // process next entry:
+      current_col = (k < w_in->size_) ? w_in->col_indices_[k] : static_cast<unsigned int>(i);
+    } // while()
+
+    // Line 10: Apply a dropping rule to w
+    // To do so, we write values to a temporary array
+    for (unsigned int r = 0; r < w_in->size_; ++r)
+    {
+      unsigned int col   = w_in->col_indices_[r];
+      NumericT     value = w_in->elements_[r];
+
+      if (col < i) // entry for L:
+        detail::insert_with_value_sort(sorted_entries_L, col, value);
+      else if (col == i) // do not drop diagonal element
+      {
+        diagonal_U[i] = value;
+        if (value <= 0 && value >= 0)
+        {
+          std::cerr << "ViennaCL: FATAL ERROR in ILUT(): Diagonal entry computed to zero (" << value << ") in row " << i << "!" << std::endl;
+          throw zero_on_diagonal_exception("ILUT zero diagonal!");
+        }
+      }
+      else // entry for U:
+        detail::insert_with_value_sort(sorted_entries_U, col, value);
+    }
+
+    //Lines 10-12: Apply a dropping rule to w, write the largest p values to L and U
+    unsigned int offset_L = row_buffer_L[i];
+    std::sort(sorted_entries_L.begin(), sorted_entries_L.end());
+    for (unsigned int j=0; j<tag.get_entries_per_row(); ++j)
+      if (std::fabs(sorted_entries_L[j].second) > 0)
+      {
+        col_buffer_L[offset_L] = sorted_entries_L[j].first;
+        elements_L[offset_L]   = sorted_entries_L[j].second;
+        ++offset_L;
+      }
+    row_buffer_L[i+1] = offset_L;
+
+    unsigned int offset_U = row_buffer_U[i];
+    col_buffer_U[offset_U] = static_cast<unsigned int>(i);
+    elements_U[offset_U]   = diagonal_U[i];
+    ++offset_U;
+    std::sort(sorted_entries_U.begin(), sorted_entries_U.end());
+    for (unsigned int j=0; j<tag.get_entries_per_row(); ++j)
+      if (std::fabs(sorted_entries_U[j].second) > 0)
+      {
+        col_buffer_U[offset_U] = sorted_entries_U[j].first;
+        elements_U[offset_U]   = sorted_entries_U[j].second;
+        ++offset_U;
+      }
+    row_buffer_U[i+1] = offset_U;
+
+  } //for i
+}
+
+
+/** @brief ILUT preconditioner class, can be supplied to solve()-routines
+*/
+template<typename MatrixT>
+class ilut_precond
+{
+  typedef typename MatrixT::value_type      NumericType;
+
+public:
+  ilut_precond(MatrixT const & mat, ilut_tag const & tag) : tag_(tag), L_(mat.size1(), mat.size2()), U_(mat.size1(), mat.size2())
+  {
+    //initialize preconditioner:
+    //std::cout << "Start CPU precond" << std::endl;
+    init(mat);
+    //std::cout << "End CPU precond" << std::endl;
+  }
+
+  template<typename VectorT>
+  void apply(VectorT & vec) const
+  {
+    //Note: Since vec can be a rather arbitrary vector type, we call the more generic version in the backend manually:
+    {
+      unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L_.handle1());
+      unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(L_.handle2());
+      NumericType  const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericType>(L_.handle());
+
+      viennacl::linalg::host_based::detail::csr_inplace_solve<NumericType>(row_buffer, col_buffer, elements, vec, L_.size2(), unit_lower_tag());
+    }
+    {
+      unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U_.handle1());
+      unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(U_.handle2());
+      NumericType  const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericType>(U_.handle());
+
+      viennacl::linalg::host_based::detail::csr_inplace_solve<NumericType>(row_buffer, col_buffer, elements, vec, U_.size2(), upper_tag());
+    }
+  }
+
+private:
+  void init(MatrixT const & mat)
+  {
+    viennacl::context host_context(viennacl::MAIN_MEMORY);
+    viennacl::compressed_matrix<NumericType> temp;
+    viennacl::switch_memory_context(temp, host_context);
+    viennacl::switch_memory_context(L_, host_context);
+    viennacl::switch_memory_context(U_, host_context);
+
+    viennacl::copy(mat, temp);
+
+    viennacl::linalg::precondition(temp, L_, U_, tag_);
+  }
+
+  ilut_tag tag_;
+  viennacl::compressed_matrix<NumericType> L_;
+  viennacl::compressed_matrix<NumericType> U_;
+};
+
+
+/** @brief ILUT preconditioner class, can be supplied to solve()-routines.
+*
+*  Specialization for compressed_matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+class ilut_precond< viennacl::compressed_matrix<NumericT, AlignmentV> >
+{
+typedef viennacl::compressed_matrix<NumericT, AlignmentV>   MatrixType;
+
+public:
+  ilut_precond(MatrixType const & mat, ilut_tag const & tag)
+    : tag_(tag),
+      L_(mat.size1(), mat.size2(), viennacl::traits::context(mat)),
+      U_(mat.size1(), mat.size2(), viennacl::traits::context(mat))
+  {
+    //initialize preconditioner:
+    //std::cout << "Start GPU precond" << std::endl;
+    init(mat);
+    //std::cout << "End GPU precond" << std::endl;
+  }
+
+  void apply(viennacl::vector<NumericT> & vec) const
+  {
+    if (vec.handle().get_active_handle_id() != viennacl::MAIN_MEMORY)
+    {
+      if (tag_.use_level_scheduling())
+      {
+        //std::cout << "Using multifrontal on GPU..." << std::endl;
+        detail::level_scheduling_substitute(vec,
+                                            multifrontal_L_row_index_arrays_,
+                                            multifrontal_L_row_buffers_,
+                                            multifrontal_L_col_buffers_,
+                                            multifrontal_L_element_buffers_,
+                                            multifrontal_L_row_elimination_num_list_);
+
+        vec = viennacl::linalg::element_div(vec, multifrontal_U_diagonal_);
+
+        detail::level_scheduling_substitute(vec,
+                                            multifrontal_U_row_index_arrays_,
+                                            multifrontal_U_row_buffers_,
+                                            multifrontal_U_col_buffers_,
+                                            multifrontal_U_element_buffers_,
+                                            multifrontal_U_row_elimination_num_list_);
+
+      }
+      else
+      {
+        viennacl::context host_context(viennacl::MAIN_MEMORY);
+        viennacl::context old_context = viennacl::traits::context(vec);
+        viennacl::switch_memory_context(vec, host_context);
+        viennacl::linalg::inplace_solve(L_, vec, unit_lower_tag());
+        viennacl::linalg::inplace_solve(U_, vec, upper_tag());
+        viennacl::switch_memory_context(vec, old_context);
+      }
+    }
+    else //apply ILUT directly:
+    {
+      viennacl::linalg::inplace_solve(L_, vec, unit_lower_tag());
+      viennacl::linalg::inplace_solve(U_, vec, upper_tag());
+    }
+  }
+
+private:
+  void init(MatrixType const & mat)
+  {
+    viennacl::context host_context(viennacl::MAIN_MEMORY);
+    viennacl::switch_memory_context(L_, host_context);
+    viennacl::switch_memory_context(U_, host_context);
+
+    if (viennacl::traits::context(mat).memory_type() == viennacl::MAIN_MEMORY)
+    {
+      viennacl::linalg::precondition(mat, L_, U_, tag_);
+    }
+    else //we need to copy to CPU
+    {
+      viennacl::compressed_matrix<NumericT> cpu_mat(mat.size1(), mat.size2(), viennacl::traits::context(mat));
+      viennacl::switch_memory_context(cpu_mat, host_context);
+
+      cpu_mat = mat;
+
+      viennacl::linalg::precondition(cpu_mat, L_, U_, tag_);
+    }
+
+    if (!tag_.use_level_scheduling())
+      return;
+
+    //
+    // multifrontal part:
+    //
+
+    viennacl::switch_memory_context(multifrontal_U_diagonal_, host_context);
+    multifrontal_U_diagonal_.resize(U_.size1(), false);
+    host_based::detail::row_info(U_, multifrontal_U_diagonal_, viennacl::linalg::detail::SPARSE_ROW_DIAGONAL);
+
+    detail::level_scheduling_setup_L(L_,
+                                     multifrontal_U_diagonal_, //dummy
+                                     multifrontal_L_row_index_arrays_,
+                                     multifrontal_L_row_buffers_,
+                                     multifrontal_L_col_buffers_,
+                                     multifrontal_L_element_buffers_,
+                                     multifrontal_L_row_elimination_num_list_);
+
+
+    detail::level_scheduling_setup_U(U_,
+                                     multifrontal_U_diagonal_,
+                                     multifrontal_U_row_index_arrays_,
+                                     multifrontal_U_row_buffers_,
+                                     multifrontal_U_col_buffers_,
+                                     multifrontal_U_element_buffers_,
+                                     multifrontal_U_row_elimination_num_list_);
+
+    //
+    // Bring to device if necessary:
+    //
+
+    // L:
+
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_row_index_arrays_.begin();
+                                                                       it != multifrontal_L_row_index_arrays_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_row_buffers_.begin();
+                                                                       it != multifrontal_L_row_buffers_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_col_buffers_.begin();
+                                                                       it != multifrontal_L_col_buffers_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_L_element_buffers_.begin();
+                                                                       it != multifrontal_L_element_buffers_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<NumericT>(*it, viennacl::traits::context(mat));
+
+
+    // U:
+
+    viennacl::switch_memory_context(multifrontal_U_diagonal_, viennacl::traits::context(mat));
+
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_row_index_arrays_.begin();
+                                                                       it != multifrontal_U_row_index_arrays_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_row_buffers_.begin();
+                                                                       it != multifrontal_U_row_buffers_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_col_buffers_.begin();
+                                                                       it != multifrontal_U_col_buffers_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<unsigned int>(*it, viennacl::traits::context(mat));
+
+    for (typename std::list< viennacl::backend::mem_handle >::iterator it  = multifrontal_U_element_buffers_.begin();
+                                                                       it != multifrontal_U_element_buffers_.end();
+                                                                     ++it)
+      viennacl::backend::switch_memory_context<NumericT>(*it, viennacl::traits::context(mat));
+
+
+  }
+
+  ilut_tag tag_;
+  viennacl::compressed_matrix<NumericT> L_;
+  viennacl::compressed_matrix<NumericT> U_;
+
+  std::list<viennacl::backend::mem_handle> multifrontal_L_row_index_arrays_;
+  std::list<viennacl::backend::mem_handle> multifrontal_L_row_buffers_;
+  std::list<viennacl::backend::mem_handle> multifrontal_L_col_buffers_;
+  std::list<viennacl::backend::mem_handle> multifrontal_L_element_buffers_;
+  std::list<vcl_size_t > multifrontal_L_row_elimination_num_list_;
+
+  viennacl::vector<NumericT> multifrontal_U_diagonal_;
+  std::list<viennacl::backend::mem_handle> multifrontal_U_row_index_arrays_;
+  std::list<viennacl::backend::mem_handle> multifrontal_U_row_buffers_;
+  std::list<viennacl::backend::mem_handle> multifrontal_U_col_buffers_;
+  std::list<viennacl::backend::mem_handle> multifrontal_U_element_buffers_;
+  std::list<vcl_size_t > multifrontal_U_row_elimination_num_list_;
+};
+
+} // namespace linalg
+} // namespace viennacl
+
+
+
+
+#endif
+
+
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/op_applier.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/op_applier.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/op_applier.hpp
new file mode 100644
index 0000000..0e2abb0
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/op_applier.hpp
@@ -0,0 +1,103 @@
+#ifndef VIENNACL_LINALG_DETAIL_OP_APPLIER_HPP
+#define VIENNACL_LINALG_DETAIL_OP_APPLIER_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/op_applier.hpp
+ *
+ * @brief Defines the action of certain unary and binary operators and its arguments (for host execution).
+*/
+
+#include "viennacl/forwards.h"
+#include <cmath>
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+
+/** @brief Worker class for decomposing expression templates.
+  *
+  * @tparam A    Type to which is assigned to
+  * @tparam OP   One out of {op_assign, op_inplace_add, op_inplace_sub}
+  @ @tparam T    Right hand side of the assignment
+*/
+template<typename OpT>
+struct op_applier
+{
+  typedef typename OpT::ERROR_UNKNOWN_OP_TAG_PROVIDED    error_type;
+};
+
+/** \cond */
+template<>
+struct op_applier<op_element_binary<op_prod> >
+{
+  template<typename T>
+  static void apply(T & result, T const & x, T const & y) { result = x * y; }
+};
+
+template<>
+struct op_applier<op_element_binary<op_div> >
+{
+  template<typename T>
+  static void apply(T & result, T const & x, T const & y) { result = x / y; }
+};
+
+template<>
+struct op_applier<op_element_binary<op_pow> >
+{
+  template<typename T>
+  static void apply(T & result, T const & x, T const & y) { result = std::pow(x, y); }
+};
+
+#define VIENNACL_MAKE_UNARY_OP_APPLIER(funcname)  \
+template<> \
+struct op_applier<op_element_unary<op_##funcname> > \
+{ \
+  template<typename T> \
+  static void apply(T & result, T const & x) { using namespace std; result = funcname(x); } \
+}
+
+VIENNACL_MAKE_UNARY_OP_APPLIER(abs);
+VIENNACL_MAKE_UNARY_OP_APPLIER(acos);
+VIENNACL_MAKE_UNARY_OP_APPLIER(asin);
+VIENNACL_MAKE_UNARY_OP_APPLIER(atan);
+VIENNACL_MAKE_UNARY_OP_APPLIER(ceil);
+VIENNACL_MAKE_UNARY_OP_APPLIER(cos);
+VIENNACL_MAKE_UNARY_OP_APPLIER(cosh);
+VIENNACL_MAKE_UNARY_OP_APPLIER(exp);
+VIENNACL_MAKE_UNARY_OP_APPLIER(fabs);
+VIENNACL_MAKE_UNARY_OP_APPLIER(floor);
+VIENNACL_MAKE_UNARY_OP_APPLIER(log);
+VIENNACL_MAKE_UNARY_OP_APPLIER(log10);
+VIENNACL_MAKE_UNARY_OP_APPLIER(sin);
+VIENNACL_MAKE_UNARY_OP_APPLIER(sinh);
+VIENNACL_MAKE_UNARY_OP_APPLIER(sqrt);
+VIENNACL_MAKE_UNARY_OP_APPLIER(tan);
+VIENNACL_MAKE_UNARY_OP_APPLIER(tanh);
+
+#undef VIENNACL_MAKE_UNARY_OP_APPLIER
+/** \endcond */
+
+}
+}
+}
+
+#endif // VIENNACL_LINALG_DETAIL_OP_EXECUTOR_HPP

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/op_executor.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/op_executor.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/op_executor.hpp
new file mode 100644
index 0000000..bd49b3b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/op_executor.hpp
@@ -0,0 +1,86 @@
+#ifndef VIENNACL_LINALG_DETAIL_OP_EXECUTOR_HPP
+#define VIENNACL_LINALG_DETAIL_OP_EXECUTOR_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/detail/op_executor.hpp
+ *
+ * @brief Defines the worker class for decomposing an expression tree into small chunks, which can be processed by the predefined operations in ViennaCL.
+*/
+
+#include "viennacl/forwards.h"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+
+template<typename NumericT, typename B>
+bool op_aliasing(vector_base<NumericT> const & /*lhs*/, B const & /*b*/)
+{
+  return false;
+}
+
+template<typename NumericT>
+bool op_aliasing(vector_base<NumericT> const & lhs, vector_base<NumericT> const & b)
+{
+  return lhs.handle() == b.handle();
+}
+
+template<typename NumericT, typename LhsT, typename RhsT, typename OpT>
+bool op_aliasing(vector_base<NumericT> const & lhs, vector_expression<const LhsT, const RhsT, OpT> const & rhs)
+{
+  return op_aliasing(lhs, rhs.lhs()) || op_aliasing(lhs, rhs.rhs());
+}
+
+
+template<typename NumericT, typename B>
+bool op_aliasing(matrix_base<NumericT> const & /*lhs*/, B const & /*b*/)
+{
+  return false;
+}
+
+template<typename NumericT>
+bool op_aliasing(matrix_base<NumericT> const & lhs, matrix_base<NumericT> const & b)
+{
+  return lhs.handle() == b.handle();
+}
+
+template<typename NumericT, typename LhsT, typename RhsT, typename OpT>
+bool op_aliasing(matrix_base<NumericT> const & lhs, matrix_expression<const LhsT, const RhsT, OpT> const & rhs)
+{
+  return op_aliasing(lhs, rhs.lhs()) || op_aliasing(lhs, rhs.rhs());
+}
+
+
+/** @brief Worker class for decomposing expression templates.
+  *
+  * @tparam A    Type to which is assigned to
+  * @tparam OP   One out of {op_assign, op_inplace_add, op_inplace_sub}
+  @ @tparam T    Right hand side of the assignment
+*/
+template<typename A, typename OP, typename T>
+struct op_executor {};
+
+}
+}
+}
+
+#endif // VIENNACL_LINALG_DETAIL_OP_EXECUTOR_HPP

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/block_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/block_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/block_matrix.hpp
new file mode 100644
index 0000000..12ff77b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/block_matrix.hpp
@@ -0,0 +1,86 @@
+#ifndef VIENNACL_LINALG_DETAIL_SPAI_BLOCK_MATRIX_HPP
+#define VIENNACL_LINALG_DETAIL_SPAI_BLOCK_MATRIX_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include <utility>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <algorithm>
+#include <vector>
+#include "viennacl/ocl/backend.hpp"
+#include "viennacl/tools/tools.hpp"
+
+/** @file viennacl/linalg/detail/spai/block_matrix.hpp
+    @brief Implementation of a bunch of (small) matrices on GPU. Experimental.
+
+    SPAI code contributed by Nikolay Lukash
+*/
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace spai
+{
+
+/**
+* @brief Represents contigious matrices on GPU
+*/
+
+class block_matrix
+{
+public:
+
+  ////////// non-const
+
+  /** @brief Returns a handle to the elements */
+  viennacl::ocl::handle<cl_mem>& handle(){ return elements_; }
+
+  /** @brief Returns a handle to the matrix dimensions */
+  viennacl::ocl::handle<cl_mem>& handle1() { return matrix_dimensions_; }
+
+  /** @brief Returns a handle to the start indices of matrix */
+  viennacl::ocl::handle<cl_mem>& handle2() { return start_block_inds_; }
+
+  ////////// const
+
+  /** @brief Returns a handle to the const elements */
+  const viennacl::ocl::handle<cl_mem>& handle() const { return elements_; }
+
+  /** @brief Returns a handle to the const matrix dimensions */
+  const viennacl::ocl::handle<cl_mem>& handle1() const { return matrix_dimensions_; }
+
+  /** @brief Returns a handle to the const start indices of matrix */
+  const viennacl::ocl::handle<cl_mem>& handle2() const { return start_block_inds_; }
+
+private:
+  viennacl::ocl::handle<cl_mem> elements_;
+  viennacl::ocl::handle<cl_mem> matrix_dimensions_;
+  viennacl::ocl::handle<cl_mem> start_block_inds_;
+};
+
+
+}
+}
+}
+}
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/block_vector.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/block_vector.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/block_vector.hpp
new file mode 100644
index 0000000..eee6aef
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/block_vector.hpp
@@ -0,0 +1,77 @@
+#ifndef VIENNACL_LINALG_DETAIL_SPAI_BLOCK_VECTOR_HPP
+#define VIENNACL_LINALG_DETAIL_SPAI_BLOCK_VECTOR_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include <utility>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <algorithm>
+#include <vector>
+#include "viennacl/ocl/backend.hpp"
+#include "viennacl/tools/tools.hpp"
+
+/** @file viennacl/linalg/detail/spai/block_vector.hpp
+    @brief Implementation of a bunch of vectors on GPU. Experimental.
+
+    SPAI code contributed by Nikolay Lukash
+*/
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace spai
+{
+
+/**
+* @brief Represents a contiguous vector on the GPU to represent a concatentation of small vectors
+*/
+class block_vector
+{
+public:
+
+  ///////////// non-const
+
+  /** @brief Return handle to the elements */
+  viennacl::ocl::handle<cl_mem> & handle(){ return elements_; }
+
+  /** @brief Return handle to start indices */
+  viennacl::ocl::handle<cl_mem> & handle1() { return start_block_inds_; }
+
+  ///////////// const
+
+  /** @brief Return handle to the const elements */
+  const viennacl::ocl::handle<cl_mem> & handle() const { return elements_; }
+
+  /** @brief Return handle to const start indices */
+  const viennacl::ocl::handle<cl_mem> & handle1() const { return start_block_inds_; }
+
+private:
+  viennacl::ocl::handle<cl_mem> elements_;
+  viennacl::ocl::handle<cl_mem> start_block_inds_;
+};
+
+}
+}
+}
+}
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/fspai.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/fspai.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/fspai.hpp
new file mode 100644
index 0000000..fab81d7
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/fspai.hpp
@@ -0,0 +1,402 @@
+#ifndef VIENNACL_LINALG_DETAIL_SPAI_FSPAI_HPP
+#define VIENNACL_LINALG_DETAIL_SPAI_FSPAI_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include <utility>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <algorithm>
+#include <vector>
+#include <math.h>
+#include <map>
+
+//boost includes
+#include "boost/numeric/ublas/vector.hpp"
+#include "boost/numeric/ublas/matrix.hpp"
+#include "boost/numeric/ublas/matrix_proxy.hpp"
+#include "boost/numeric/ublas/vector_proxy.hpp"
+#include "boost/numeric/ublas/storage.hpp"
+#include "boost/numeric/ublas/io.hpp"
+#include "boost/numeric/ublas/lu.hpp"
+#include "boost/numeric/ublas/triangular.hpp"
+#include "boost/numeric/ublas/matrix_expression.hpp"
+
+// ViennaCL includes
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+#include "viennacl/linalg/matrix_operations.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/linalg/cg.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/ilu.hpp"
+//#include <omp.h>
+
+/** @file viennacl/linalg/detail/spai/fspai.hpp
+    @brief Implementation of FSPAI. Experimental.
+*/
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace spai
+{
+
+/** @brief A tag for FSPAI. Experimental.
+*
+* Contains values for the algorithm.
+* Must be passed to spai_precond constructor
+*/
+class fspai_tag
+{
+public:
+  /** @brief Constructor
+   *
+   * @param residual_norm_threshold Calculate until the norm of the residual falls below this threshold
+   * @param iteration_limit maximum number of iterations
+   * @param is_static determines if static version of SPAI should be used
+   * @param is_right determines if left or right preconditioner should be used
+   */
+  fspai_tag(
+          double residual_norm_threshold = 1e-3,
+          unsigned int iteration_limit = 5,
+          bool is_static = false,
+          bool is_right = false)
+    : residual_norm_threshold_(residual_norm_threshold),
+      iteration_limit_(iteration_limit),
+      is_static_(is_static),
+      is_right_(is_right) {}
+
+  inline double getResidualNormThreshold() const { return residual_norm_threshold_; }
+  inline unsigned long getIterationLimit () const { return iteration_limit_; }
+  inline bool getIsStatic() const { return is_static_; }
+  inline bool getIsRight() const  { return is_right_; }
+  inline void setResidualNormThreshold(double residual_norm_threshold)
+  {
+    if (residual_norm_threshold > 0)
+      residual_norm_threshold_ = residual_norm_threshold;
+  }
+  inline void setIterationLimit(unsigned long iteration_limit)
+  {
+    if (iteration_limit > 0)
+      iteration_limit_ = iteration_limit;
+  }
+  inline void setIsRight(bool is_right)   { is_right_  = is_right; }
+  inline void setIsStatic(bool is_static) { is_static_ = is_static; }
+
+private:
+  double residual_norm_threshold_;
+  unsigned long iteration_limit_;
+  bool is_static_;
+  bool is_right_;
+};
+
+
+//
+// Helper: Store A in an STL container of type, exploiting symmetry
+// Reason: ublas interface does not allow to iterate over nonzeros of a particular row without starting an iterator1 from the very beginning of the matrix...
+//
+template<typename MatrixT, typename NumericT>
+void sym_sparse_matrix_to_stl(MatrixT const & A, std::vector<std::map<unsigned int, NumericT> > & STL_A)
+{
+  STL_A.resize(A.size1());
+  for (typename MatrixT::const_iterator1 row_it  = A.begin1();
+                                         row_it != A.end1();
+                                       ++row_it)
+  {
+    for (typename MatrixT::const_iterator2 col_it  = row_it.begin();
+                                           col_it != row_it.end();
+                                         ++col_it)
+    {
+      if (col_it.index1() >= col_it.index2())
+        STL_A[col_it.index1()][static_cast<unsigned int>(col_it.index2())] = *col_it;
+      else
+        break; //go to next row
+    }
+  }
+}
+
+
+//
+// Generate index sets J_k, k=0,...,N-1
+//
+template<typename MatrixT>
+void generateJ(MatrixT const & A, std::vector<std::vector<vcl_size_t> > & J)
+{
+  for (typename MatrixT::const_iterator1 row_it  = A.begin1();
+                                         row_it != A.end1();
+                                       ++row_it)
+  {
+    for (typename MatrixT::const_iterator2 col_it  = row_it.begin();
+                                           col_it != row_it.end();
+                                         ++col_it)
+    {
+      if (col_it.index1() > col_it.index2()) //Matrix is symmetric, thus only work on lower triangular part
+      {
+        J[col_it.index2()].push_back(col_it.index1());
+        J[col_it.index1()].push_back(col_it.index2());
+      }
+      else
+        break; //go to next row
+    }
+  }
+}
+
+
+//
+// Extracts the blocks A(\tilde{J}_k, \tilde{J}_k) from A
+// Sets up y_k = A(\tilde{J}_k, k) for the inplace-solution after Cholesky-factoriation
+//
+template<typename NumericT, typename MatrixT, typename VectorT>
+void fill_blocks(std::vector< std::map<unsigned int, NumericT> > & A,
+                 std::vector<MatrixT>                            & blocks,
+                 std::vector<std::vector<vcl_size_t> > const     & J,
+                 std::vector<VectorT>                            & Y)
+{
+  for (vcl_size_t k=0; k<A.size(); ++k)
+  {
+    std::vector<vcl_size_t> const & Jk = J[k];
+    VectorT & yk = Y[k];
+    MatrixT & block_k = blocks[k];
+
+    yk.resize(Jk.size());
+    block_k.resize(Jk.size(), Jk.size());
+    block_k.clear();
+
+    for (vcl_size_t i=0; i<Jk.size(); ++i)
+    {
+      vcl_size_t row_index = Jk[i];
+      std::map<unsigned int, NumericT> & A_row = A[row_index];
+
+      //fill y_k:
+      yk[i] = A_row[static_cast<unsigned int>(k)];
+
+      for (vcl_size_t j=0; j<Jk.size(); ++j)
+      {
+        vcl_size_t col_index = Jk[j];
+        if (col_index <= row_index && A_row.find(static_cast<unsigned int>(col_index)) != A_row.end()) //block is symmetric, thus store only lower triangular part
+          block_k(i, j) = A_row[static_cast<unsigned int>(col_index)];
+      }
+    }
+  }
+}
+
+
+//
+// Perform Cholesky factorization of A inplace. Cf. Schwarz: Numerische Mathematik, vol 5, p. 58
+//
+template<typename MatrixT>
+void cholesky_decompose(MatrixT & A)
+{
+  for (vcl_size_t k=0; k<A.size2(); ++k)
+  {
+    if (A(k,k) <= 0)
+    {
+      std::cout << "k: " << k << std::endl;
+      std::cout << "A(k,k): " << A(k,k) << std::endl;
+    }
+
+    assert(A(k,k) > 0 && bool("Matrix not positive definite in Cholesky factorization."));
+
+    A(k,k) = std::sqrt(A(k,k));
+
+    for (vcl_size_t i=k+1; i<A.size1(); ++i)
+    {
+      A(i,k) /= A(k,k);
+      for (vcl_size_t j=k+1; j<=i; ++j)
+        A(i,j) -= A(i,k) * A(j,k);
+    }
+  }
+}
+
+
+//
+// Compute x in Ax = b, where A is already Cholesky factored (A = L L^T)
+//
+template<typename MatrixT, typename VectorT>
+void cholesky_solve(MatrixT const & L, VectorT & b)
+{
+  // inplace forward solve L x = b
+  for (vcl_size_t i=0; i<L.size1(); ++i)
+  {
+    for (vcl_size_t j=0; j<i; ++j)
+      b[i] -= L(i,j) * b[j];
+    b[i] /= L(i,i);
+  }
+
+  // inplace backward solve L^T x = b:
+  for (vcl_size_t i=L.size1()-1;; --i)
+  {
+    for (vcl_size_t k=i+1; k<L.size1(); ++k)
+      b[i] -= L(k,i) * b[k];
+    b[i] /= L(i,i);
+
+    if (i==0) //vcl_size_t might be unsigned, therefore manual check for equality with zero here
+      break;
+  }
+}
+
+
+
+//
+// Compute the Cholesky factor L from the sparse vectors y_k
+//
+template<typename MatrixT, typename VectorT>
+void computeL(MatrixT const & A,
+              MatrixT       & L,
+              MatrixT       & L_trans,
+              std::vector<VectorT> & Y,
+              std::vector<std::vector<vcl_size_t> > & J)
+{
+  typedef typename VectorT::value_type                          NumericType;
+  typedef std::vector<std::map<unsigned int, NumericType> >     STLSparseMatrixType;
+
+  STLSparseMatrixType L_temp(A.size1());
+
+  for (vcl_size_t k=0; k<A.size1(); ++k)
+  {
+    std::vector<vcl_size_t> const & Jk = J[k];
+    VectorT const & yk = Y[k];
+
+    //compute L(k,k):
+    NumericType Lkk = A(k,k);
+    for (vcl_size_t i=0; i<Jk.size(); ++i)
+      Lkk -= A(Jk[i],k) * yk[i];
+
+    Lkk = NumericType(1) / std::sqrt(Lkk);
+    L_temp[k][static_cast<unsigned int>(k)] = Lkk;
+    L_trans(k,k) = Lkk;
+
+    //write lower diagonal entries:
+    for (vcl_size_t i=0; i<Jk.size(); ++i)
+    {
+      L_temp[Jk[i]][static_cast<unsigned int>(k)] = -Lkk * yk[i];
+      L_trans(k, Jk[i]) = -Lkk * yk[i];
+    }
+  } //for k
+
+
+  //build L from L_temp
+  for (vcl_size_t i=0; i<L_temp.size(); ++i)
+    for (typename std::map<unsigned int, NumericType>::const_iterator it = L_temp[i].begin();
+           it != L_temp[i].end();
+         ++it)
+      L(i, it->first) = it->second;
+}
+
+
+//
+// Top level FSPAI function
+//
+template<typename MatrixT>
+void computeFSPAI(MatrixT const & A,
+                  MatrixT const & PatternA,
+                  MatrixT       & L,
+                  MatrixT       & L_trans,
+                  fspai_tag)
+{
+  typedef typename MatrixT::value_type                    NumericT;
+  typedef boost::numeric::ublas::matrix<NumericT>         DenseMatrixType;
+  typedef std::vector<std::map<unsigned int, NumericT> >  SparseMatrixType;
+
+  //
+  // preprocessing: Store A in a STL container:
+  //
+  //std::cout << "Transferring to STL container:" << std::endl;
+  std::vector<std::vector<NumericT> >    y_k(A.size1());
+  SparseMatrixType   STL_A(A.size1());
+  sym_sparse_matrix_to_stl(A, STL_A);
+
+
+  //
+  // Step 1: Generate pattern indices
+  //
+  //std::cout << "computeFSPAI(): Generating pattern..." << std::endl;
+  std::vector<std::vector<vcl_size_t> > J(A.size1());
+  generateJ(PatternA, J);
+
+  //
+  // Step 2: Set up matrix blocks
+  //
+  //std::cout << "computeFSPAI(): Setting up matrix blocks..." << std::endl;
+  std::vector<DenseMatrixType>  subblocks_A(A.size1());
+  fill_blocks(STL_A, subblocks_A, J, y_k);
+  STL_A.clear(); //not needed anymore
+
+  //
+  // Step 3: Cholesky-factor blocks
+  //
+  //std::cout << "computeFSPAI(): Cholesky-factorization..." << std::endl;
+  for (vcl_size_t i=0; i<subblocks_A.size(); ++i)
+  {
+    //std::cout << "Block before: " << subblocks_A[i] << std::endl;
+    cholesky_decompose(subblocks_A[i]);
+    //std::cout << "Block after: " << subblocks_A[i] << std::endl;
+  }
+
+
+  /*vcl_size_t num_bytes = 0;
+  for (vcl_size_t i=0; i<subblocks_A.size(); ++i)
+    num_bytes += 8*subblocks_A[i].size1()*subblocks_A[i].size2();*/
+  //std::cout << "Memory for FSPAI matrix: " << num_bytes / (1024.0 * 1024.0) << " MB" << std::endl;
+
+  //
+  // Step 4: Solve for y_k
+  //
+  //std::cout << "computeFSPAI(): Cholesky-solve..." << std::endl;
+  for (vcl_size_t i=0; i<y_k.size(); ++i)
+  {
+    if (subblocks_A[i].size1() > 0) //block might be empty...
+    {
+      //y_k[i].resize(subblocks_A[i].size1());
+      //std::cout << "y_k[" << i << "]: ";
+      //for (vcl_size_t j=0; j<y_k[i].size(); ++j)
+      //  std::cout << y_k[i][j] << " ";
+      //std::cout << std::endl;
+      cholesky_solve(subblocks_A[i], y_k[i]);
+    }
+  }
+
+
+  //
+  // Step 5: Set up Cholesky factors L and L_trans
+  //
+  //std::cout << "computeFSPAI(): Computing L..." << std::endl;
+  L.resize(A.size1(), A.size2(), false);
+  L.reserve(A.nnz(), false);
+  L_trans.resize(A.size1(), A.size2(), false);
+  L_trans.reserve(A.nnz(), false);
+  computeL(A, L, L_trans, y_k, J);
+
+  //std::cout << "L: " << L << std::endl;
+}
+
+
+
+}
+}
+}
+}
+
+#endif