Posted to commits@mahout.apache.org by ap...@apache.org on 2016/06/08 21:40:14 UTC

[17/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/spgemm_vector.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/spgemm_vector.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/spgemm_vector.hpp
new file mode 100644
index 0000000..56e3c14
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/spgemm_vector.hpp
@@ -0,0 +1,705 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_SPGEMM_VECTOR_HPP_
+#define VIENNACL_LINALG_HOST_BASED_SPGEMM_VECTOR_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/spgemm_vector.hpp
+    @brief Implementations of the row-merge kernels used by sparse matrix-matrix products (spGEMM) on the CPU, single-threaded or with OpenMP.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/linalg/host_based/common.hpp"
+
+
+#ifdef VIENNACL_WITH_AVX2
+#include "immintrin.h"
+#endif
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+
+
+
+#ifdef VIENNACL_WITH_AVX2
+inline
+unsigned int row_C_scan_symbolic_vector_AVX2(int const *row_indices_B_begin, int const *row_indices_B_end,
+                                             int const *B_row_buffer, int const *B_col_buffer, int B_size2,
+                                             int *row_C_vector_output)
+{
+  __m256i avx_all_ones    = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
+  __m256i avx_all_bsize2  = _mm256_set_epi32(B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2);
+
+  __m256i avx_row_indices_offsets = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+  __m256i avx_load_mask = _mm256_sub_epi32(avx_row_indices_offsets, _mm256_set1_epi32(row_indices_B_end - row_indices_B_begin));
+  __m256i avx_load_mask2 = avx_load_mask;
+
+  __m256i avx_row_indices = _mm256_set1_epi32(0);
+          avx_row_indices = _mm256_mask_i32gather_epi32(avx_row_indices, row_indices_B_begin, avx_row_indices_offsets, avx_load_mask, 4);
+            avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+  __m256i avx_row_start   = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer,   avx_row_indices, avx_load_mask, 4);
+            avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+  __m256i avx_row_end     = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer+1, avx_row_indices, avx_load_mask, 4);
+
+          avx_load_mask   = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
+  __m256i avx_index_front = avx_all_bsize2;
+  avx_index_front         = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);
+
+  int *output_ptr = row_C_vector_output;
+
+  while (1)
+  {
+    // get minimum index in current front:
+    __m256i avx_index_min1 = avx_index_front;
+    __m256i avx_temp       = _mm256_permutevar8x32_epi32(avx_index_min1, _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4));
+    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first four elements compared against last four elements
+
+    avx_temp       = _mm256_shuffle_epi32(avx_index_min1, int(78));    // 0b01001110 = 78, using shuffle instead of permutevar here because of lower latency
+    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first two elements compared against elements three and four (same for upper half of register)
+
+    avx_temp       = _mm256_shuffle_epi32(avx_index_min1, int(177));    // 0b10110001 = 177, using shuffle instead of permutevar here because of lower latency
+    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // now all entries of avx_index_min1 hold the minimum
+
+    int min_index_in_front = ((int*)&avx_index_min1)[0];
+    // check for end of merge operation:
+    if (min_index_in_front == B_size2)
+      break;
+
+    // write current entry:
+    *output_ptr = min_index_in_front;
+    ++output_ptr;
+
+    // advance index front where equal to minimum index:
+    avx_load_mask   = _mm256_cmpeq_epi32(avx_index_front, avx_index_min1);
+    // first part: set index to B_size2 if equal to minimum index:
+    avx_temp        = _mm256_and_si256(avx_all_bsize2, avx_load_mask);
+    avx_index_front = _mm256_max_epi32(avx_index_front, avx_temp);
+    // second part: increment row_start registers where minimum found:
+    avx_temp        = _mm256_and_si256(avx_all_ones, avx_load_mask); //ones only where the minimum was found
+    avx_row_start   = _mm256_add_epi32(avx_row_start, avx_temp);
+    // third part: load new data where more entries are available:
+    avx_load_mask   = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
+    avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);
+  }
+
+  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
+}
+#endif
+
+/** @brief Merges up to IndexNum rows from B into the result buffer.
+*
+* Because the input buffer also needs to be considered, this routine actually works on an index front of length (IndexNum+1)
+**/
+template<unsigned int IndexNum>
+unsigned int row_C_scan_symbolic_vector_N(unsigned int const *row_indices_B,
+                                          unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, unsigned int B_size2,
+                                          unsigned int const *row_C_vector_input, unsigned int const *row_C_vector_input_end,
+                                          unsigned int *row_C_vector_output)
+{
+  unsigned int index_front[IndexNum+1];
+  unsigned int const *index_front_start[IndexNum+1];
+  unsigned int const *index_front_end[IndexNum+1];
+
+  // Set up pointers for loading the indices:
+  for (unsigned int i=0; i<IndexNum; ++i, ++row_indices_B)
+  {
+    index_front_start[i] = B_col_buffer + B_row_buffer[*row_indices_B];
+    index_front_end[i]   = B_col_buffer + B_row_buffer[*row_indices_B + 1];
+  }
+  index_front_start[IndexNum] = row_C_vector_input;
+  index_front_end[IndexNum]   = row_C_vector_input_end;
+
+  // load indices:
+  for (unsigned int i=0; i<=IndexNum; ++i)
+    index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;
+
+  unsigned int *output_ptr = row_C_vector_output;
+
+  while (1)
+  {
+    // get minimum index in current front:
+    unsigned int min_index_in_front = B_size2;
+    for (unsigned int i=0; i<=IndexNum; ++i)
+      min_index_in_front = std::min(min_index_in_front, index_front[i]);
+
+    if (min_index_in_front == B_size2) // we're done
+      break;
+
+    // advance index front where equal to minimum index:
+    for (unsigned int i=0; i<=IndexNum; ++i)
+    {
+      if (index_front[i] == min_index_in_front)
+      {
+        index_front_start[i] += 1;
+        index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;
+      }
+    }
+
+    // write current entry:
+    *output_ptr = min_index_in_front;
+    ++output_ptr;
+  }
+
+  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
+}
+
+struct spgemm_output_write_enabled  { static void apply(unsigned int *ptr, unsigned int value) { *ptr = value; } };
+struct spgemm_output_write_disabled { static void apply(unsigned int *   , unsigned int      ) {               } };
+
+template<typename OutputWriterT>
+unsigned int row_C_scan_symbolic_vector_1(unsigned int const *input1_begin, unsigned int const *input1_end,
+                                          unsigned int const *input2_begin, unsigned int const *input2_end,
+                                          unsigned int termination_index,
+                                          unsigned int *output_begin)
+{
+  unsigned int *output_ptr = output_begin;
+
+  unsigned int val_1 = (input1_begin < input1_end) ? *input1_begin : termination_index;
+  unsigned int val_2 = (input2_begin < input2_end) ? *input2_begin : termination_index;
+  while (1)
+  {
+    unsigned int min_index = std::min(val_1, val_2);
+
+    if (min_index == termination_index)
+      break;
+
+    if (min_index == val_1)
+    {
+      ++input1_begin;
+      val_1 = (input1_begin < input1_end) ? *input1_begin : termination_index;
+    }
+
+    if (min_index == val_2)
+    {
+      ++input2_begin;
+      val_2 = (input2_begin < input2_end) ? *input2_begin : termination_index;
+    }
+
+    // write current entry:
+    OutputWriterT::apply(output_ptr, min_index); // performs *output_ptr = min_index; only if output is enabled
+    ++output_ptr;
+  }
+
+  return static_cast<unsigned int>(output_ptr - output_begin);
+}
+
+inline
+unsigned int row_C_scan_symbolic_vector(unsigned int row_start_A, unsigned int row_end_A, unsigned int const *A_col_buffer,
+                                        unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, unsigned int B_size2,
+                                        unsigned int *row_C_vector_1, unsigned int *row_C_vector_2, unsigned int *row_C_vector_3)
+{
+  // Trivial case: row length 0:
+  if (row_start_A == row_end_A)
+    return 0;
+
+  // Trivial case: row length 1:
+  if (row_end_A - row_start_A == 1)
+  {
+    unsigned int A_col = A_col_buffer[row_start_A];
+    return B_row_buffer[A_col + 1] - B_row_buffer[A_col];
+  }
+
+  // Optimizations for row length 2:
+  unsigned int row_C_len = 0;
+  if (row_end_A - row_start_A == 2)
+  {
+    unsigned int A_col_1 = A_col_buffer[row_start_A];
+    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
+    return row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(B_col_buffer + B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
+                                                                      B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
+                                                                      B_size2,
+                                                                      row_C_vector_1);
+  }
+  else // for more than two rows we can safely merge the first rows (two, or up to eight with AVX2) into the temporary buffer:
+  {
+#ifdef VIENNACL_WITH_AVX2
+    row_C_len = row_C_scan_symbolic_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A),
+                                                (const int*)B_row_buffer, (const int*)B_col_buffer, int(B_size2),
+                                                (int*)row_C_vector_1);
+    row_start_A += 8;
+#else
+    unsigned int A_col_1 = A_col_buffer[row_start_A];
+    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
+    row_C_len =  row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
+                                                                           B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
+                                                                           B_size2,
+                                                                           row_C_vector_1);
+    row_start_A += 2;
+#endif
+  }
+
+  // all other row lengths:
+  while (row_end_A > row_start_A)
+  {
+#ifdef VIENNACL_WITH_AVX2
+    if (row_end_A - row_start_A > 2) // we deal with one or two remaining rows more efficiently below:
+    {
+      unsigned int merged_len = row_C_scan_symbolic_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A),
+                                                                (const int*)B_row_buffer, (const int*)B_col_buffer, int(B_size2),
+                                                                (int*)row_C_vector_3);
+      if (row_start_A + 8 >= row_end_A)
+        row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(row_C_vector_3, row_C_vector_3 + merged_len,
+                                                                              row_C_vector_1, row_C_vector_1 + row_C_len,
+                                                                              B_size2,
+                                                                              row_C_vector_2);
+      else
+        row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(row_C_vector_3, row_C_vector_3 + merged_len,
+                                                                               row_C_vector_1, row_C_vector_1 + row_C_len,
+                                                                               B_size2,
+                                                                               row_C_vector_2);
+      row_start_A += 8;
+    }
+    else
+#endif
+    if (row_start_A == row_end_A - 1) // last merge operation. No need to write output
+    {
+      // process last row
+      unsigned int row_index_B = A_col_buffer[row_start_A];
+      return row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(B_col_buffer + B_row_buffer[row_index_B], B_col_buffer + B_row_buffer[row_index_B + 1],
+                                                                        row_C_vector_1, row_C_vector_1 + row_C_len,
+                                                                        B_size2,
+                                                                        row_C_vector_2);
+    }
+    else if (row_start_A + 1 < row_end_A) // at least two more rows left, so merge them
+    {
+      // merge the next two rows of B:
+      unsigned int A_col_1 = A_col_buffer[row_start_A];
+      unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
+      unsigned int merged_len =  row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
+                                                                                           B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
+                                                                                           B_size2,
+                                                                                           row_C_vector_3);
+      if (row_start_A + 2 == row_end_A) // last merge does not need a write:
+        return row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(row_C_vector_3, row_C_vector_3 + merged_len,
+                                                                          row_C_vector_1, row_C_vector_1 + row_C_len,
+                                                                          B_size2,
+                                                                          row_C_vector_2);
+      else
+        row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(row_C_vector_3, row_C_vector_3 + merged_len,
+                                                                              row_C_vector_1, row_C_vector_1 + row_C_len,
+                                                                              B_size2,
+                                                                              row_C_vector_2);
+      row_start_A += 2;
+    }
+    else // fallback: merge a single remaining row
+    {
+      // process a single row:
+      unsigned int row_index_B = A_col_buffer[row_start_A];
+      row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + B_row_buffer[row_index_B], B_col_buffer + B_row_buffer[row_index_B + 1],
+                                                                            row_C_vector_1, row_C_vector_1 + row_C_len,
+                                                                            B_size2,
+                                                                            row_C_vector_2);
+      ++row_start_A;
+    }
+
+    std::swap(row_C_vector_1, row_C_vector_2);
+  }
+
+  return row_C_len;
+}
+
+//////////////////////////////
+
+/** @brief Merges up to IndexNum rows from B into the result buffer.
+*
+* Because the input buffer also needs to be considered, this routine actually works on an index front of length (IndexNum+1)
+**/
+template<unsigned int IndexNum, typename NumericT>
+unsigned int row_C_scan_numeric_vector_N(unsigned int const *row_indices_B, NumericT const *val_A,
+                                          unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, NumericT const *B_elements, unsigned int B_size2,
+                                          unsigned int const *row_C_vector_input, unsigned int const *row_C_vector_input_end, NumericT *row_C_vector_input_values,
+                                          unsigned int *row_C_vector_output, NumericT *row_C_vector_output_values)
+{
+  unsigned int index_front[IndexNum+1];
+  unsigned int const *index_front_start[IndexNum+1];
+  unsigned int const *index_front_end[IndexNum+1];
+  NumericT const * value_front_start[IndexNum+1];
+  NumericT values_A[IndexNum+1];
+
+  // Set up pointers for loading the indices:
+  for (unsigned int i=0; i<IndexNum; ++i, ++row_indices_B)
+  {
+    unsigned int row_B = *row_indices_B;
+
+    index_front_start[i] = B_col_buffer + B_row_buffer[row_B];
+    index_front_end[i]   = B_col_buffer + B_row_buffer[row_B + 1];
+    value_front_start[i] = B_elements   + B_row_buffer[row_B];
+    values_A[i]          = val_A[i];
+  }
+  index_front_start[IndexNum] = row_C_vector_input;
+  index_front_end[IndexNum]   = row_C_vector_input_end;
+  value_front_start[IndexNum] = row_C_vector_input_values;
+  values_A[IndexNum]          = NumericT(1);
+
+  // load indices:
+  for (unsigned int i=0; i<=IndexNum; ++i)
+    index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;
+
+  unsigned int *output_ptr = row_C_vector_output;
+
+  while (1)
+  {
+    // get minimum index in current front:
+    unsigned int min_index_in_front = B_size2;
+    for (unsigned int i=0; i<=IndexNum; ++i)
+      min_index_in_front = std::min(min_index_in_front, index_front[i]);
+
+    if (min_index_in_front == B_size2) // we're done
+      break;
+
+    // advance index front where equal to minimum index:
+    NumericT row_C_value = 0;
+    for (unsigned int i=0; i<=IndexNum; ++i)
+    {
+      if (index_front[i] == min_index_in_front)
+      {
+        index_front_start[i] += 1;
+        index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;
+
+        row_C_value += values_A[i] * *value_front_start[i];
+        value_front_start[i] += 1;
+      }
+    }
+
+    // write current entry:
+    *output_ptr = min_index_in_front;
+    ++output_ptr;
+    *row_C_vector_output_values = row_C_value;
+    ++row_C_vector_output_values;
+  }
+
+  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
+}
+
+
+
+#ifdef VIENNACL_WITH_AVX2
+inline
+unsigned int row_C_scan_numeric_vector_AVX2(int const *row_indices_B_begin, int const *row_indices_B_end, double const *values_A,
+                                             int const *B_row_buffer, int const *B_col_buffer, double const *B_elements,
+                                             int B_size2,
+                                             int *row_C_vector_output, double *row_C_vector_output_values)
+{
+  __m256i avx_all_ones    = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
+  __m256i avx_all_bsize2  = _mm256_set_epi32(B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2);
+
+  __m256i avx_row_indices_offsets = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+  __m256i avx_load_mask = _mm256_sub_epi32(avx_row_indices_offsets, _mm256_set1_epi32(row_indices_B_end - row_indices_B_begin));
+  __m256i avx_load_mask2 = avx_load_mask;
+
+  __m256i avx_row_indices = _mm256_set1_epi32(0);
+          avx_row_indices = _mm256_mask_i32gather_epi32(avx_row_indices, row_indices_B_begin, avx_row_indices_offsets, avx_load_mask, 4);
+
+  // load values from A:
+  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+  __m256d avx_value_A_low  = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0), //src
+                                                      values_A,                  //base ptr
+                                                      _mm256_extractf128_si256(avx_row_indices_offsets, 0),                           //indices
+                                                      _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4)), 8); // mask
+  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+  __m256d avx_value_A_high  = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0), //src
+                                                       values_A,                  //base ptr
+                                                       _mm256_extractf128_si256(avx_row_indices_offsets, 1),                           //indices
+                                                       _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)), 8); // mask
+
+
+            avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+  __m256i avx_row_start   = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer,   avx_row_indices, avx_load_mask, 4);
+            avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+  __m256i avx_row_end     = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer+1, avx_row_indices, avx_load_mask, 4);
+
+          avx_load_mask   = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
+          avx_load_mask2  = avx_load_mask;
+  __m256i avx_index_front = avx_all_bsize2;
+  avx_index_front         = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);
+
+  // load front values from B:
+  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+  __m256d avx_value_front_low  = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0), //src
+                                                          B_elements,                  //base ptr
+                                                          _mm256_extractf128_si256(avx_row_start, 0),                           //indices
+                                                          _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4)), 8); // mask
+  avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+  __m256d avx_value_front_high  = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0), //src
+                                                           B_elements,                  //base ptr
+                                                           _mm256_extractf128_si256(avx_row_start, 1),                           //indices
+                                                           _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)), 8); // mask
+
+  int *output_ptr = row_C_vector_output;
+
+  while (1)
+  {
+    // get minimum index in current front:
+    __m256i avx_index_min1 = avx_index_front;
+    __m256i avx_temp       = _mm256_permutevar8x32_epi32(avx_index_min1, _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4));
+    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first four elements compared against last four elements
+
+    avx_temp       = _mm256_shuffle_epi32(avx_index_min1, int(78));    // 0b01001110 = 78, using shuffle instead of permutevar here because of lower latency
+    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first two elements compared against elements three and four (same for upper half of register)
+
+    avx_temp       = _mm256_shuffle_epi32(avx_index_min1, int(177));    // 0b10110001 = 177, using shuffle instead of permutevar here because of lower latency
+    avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // now all entries of avx_index_min1 hold the minimum
+
+    int min_index_in_front = ((int*)&avx_index_min1)[0];
+    // check for end of merge operation:
+    if (min_index_in_front == B_size2)
+      break;
+
+    // accumulate value (can certainly be done more elegantly...)
+    double value = 0;
+    value += (min_index_in_front == ((int*)&avx_index_front)[0]) ? ((double*)&avx_value_front_low)[0] * ((double*)&avx_value_A_low)[0] : 0;
+    value += (min_index_in_front == ((int*)&avx_index_front)[1]) ? ((double*)&avx_value_front_low)[1] * ((double*)&avx_value_A_low)[1] : 0;
+    value += (min_index_in_front == ((int*)&avx_index_front)[2]) ? ((double*)&avx_value_front_low)[2] * ((double*)&avx_value_A_low)[2] : 0;
+    value += (min_index_in_front == ((int*)&avx_index_front)[3]) ? ((double*)&avx_value_front_low)[3] * ((double*)&avx_value_A_low)[3] : 0;
+    value += (min_index_in_front == ((int*)&avx_index_front)[4]) ? ((double*)&avx_value_front_high)[0] * ((double*)&avx_value_A_high)[0] : 0;
+    value += (min_index_in_front == ((int*)&avx_index_front)[5]) ? ((double*)&avx_value_front_high)[1] * ((double*)&avx_value_A_high)[1] : 0;
+    value += (min_index_in_front == ((int*)&avx_index_front)[6]) ? ((double*)&avx_value_front_high)[2] * ((double*)&avx_value_A_high)[2] : 0;
+    value += (min_index_in_front == ((int*)&avx_index_front)[7]) ? ((double*)&avx_value_front_high)[3] * ((double*)&avx_value_A_high)[3] : 0;
+    *row_C_vector_output_values = value;
+    ++row_C_vector_output_values;
+
+    // write current entry:
+    *output_ptr = min_index_in_front;
+    ++output_ptr;
+
+    // advance index front where equal to minimum index:
+    avx_load_mask   = _mm256_cmpeq_epi32(avx_index_front, avx_index_min1);
+    // first part: set index to B_size2 if equal to minimum index:
+    avx_temp        = _mm256_and_si256(avx_all_bsize2, avx_load_mask);
+    avx_index_front = _mm256_max_epi32(avx_index_front, avx_temp);
+    // second part: increment row_start registers where minimum found:
+    avx_temp        = _mm256_and_si256(avx_all_ones, avx_load_mask); //ones only where the minimum was found
+    avx_row_start   = _mm256_add_epi32(avx_row_start, avx_temp);
+    // third part: load new data where more entries are available:
+    avx_load_mask   = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
+    avx_load_mask2  = avx_load_mask;
+    avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);
+
+    // load new values where necessary:
+    avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+    avx_value_front_low = _mm256_mask_i32gather_pd(avx_value_front_low, //src
+                                            B_elements,                  //base ptr
+                                            _mm256_extractf128_si256(avx_row_start, 0),                           //indices
+                                            _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4)), 8); // mask
+
+    avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+    avx_value_front_high = _mm256_mask_i32gather_pd(avx_value_front_high, //src
+                                    B_elements,                  //base ptr
+                                    _mm256_extractf128_si256(avx_row_start, 1),                           //indices
+                                    _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)), 8); // mask
+
+    // the freshly loaded values are multiplied and accumulated at the top of the next iteration
+
+  }
+
+  return static_cast<unsigned int>(output_ptr - row_C_vector_output);
+}
+#endif
+
+
+template<typename NumericT>
+unsigned int row_C_scan_numeric_vector_1(unsigned int const *input1_index_begin, unsigned int const *input1_index_end, NumericT const *input1_values_begin, NumericT factor1,
+                                         unsigned int const *input2_index_begin, unsigned int const *input2_index_end, NumericT const *input2_values_begin, NumericT factor2,
+                                         unsigned int termination_index,
+                                         unsigned int *output_index_begin, NumericT *output_values_begin)
+{
+  unsigned int *output_ptr = output_index_begin;
+
+  unsigned int index1 = (input1_index_begin < input1_index_end) ? *input1_index_begin : termination_index;
+  unsigned int index2 = (input2_index_begin < input2_index_end) ? *input2_index_begin : termination_index;
+
+  while (1)
+  {
+    unsigned int min_index = std::min(index1, index2);
+    NumericT value = 0;
+
+    if (min_index == termination_index)
+      break;
+
+    if (min_index == index1)
+    {
+      ++input1_index_begin;
+      index1 = (input1_index_begin < input1_index_end) ? *input1_index_begin : termination_index;
+
+      value += factor1 * *input1_values_begin;
+      ++input1_values_begin;
+    }
+
+    if (min_index == index2)
+    {
+      ++input2_index_begin;
+      index2 = (input2_index_begin < input2_index_end) ? *input2_index_begin : termination_index;
+
+      value += factor2 * *input2_values_begin;
+      ++input2_values_begin;
+    }
+
+    // write current entry:
+    *output_ptr = min_index;
+    ++output_ptr;
+    *output_values_begin = value;
+    ++output_values_begin;
+  }
+
+  return static_cast<unsigned int>(output_ptr - output_index_begin);
+}
+
+template<typename NumericT>
+void row_C_scan_numeric_vector(unsigned int row_start_A, unsigned int row_end_A, unsigned int const *A_col_buffer, NumericT const *A_elements,
+                               unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, NumericT const *B_elements, unsigned int B_size2,
+                               unsigned int row_start_C, unsigned int row_end_C, unsigned int *C_col_buffer, NumericT *C_elements,
+                               unsigned int *row_C_vector_1, NumericT *row_C_vector_1_values,
+                               unsigned int *row_C_vector_2, NumericT *row_C_vector_2_values,
+                               unsigned int *row_C_vector_3, NumericT *row_C_vector_3_values)
+{
+  (void)row_end_C;
+
+  // Trivial case: row length 0:
+  if (row_start_A == row_end_A)
+    return;
+
+  // Trivial case: row length 1:
+  if (row_end_A - row_start_A == 1)
+  {
+    unsigned int A_col = A_col_buffer[row_start_A];
+    unsigned int B_end = B_row_buffer[A_col + 1];
+    NumericT A_value   = A_elements[row_start_A];
+    C_col_buffer += row_start_C;
+    C_elements += row_start_C;
+    for (unsigned int j = B_row_buffer[A_col]; j < B_end; ++j, ++C_col_buffer, ++C_elements)
+    {
+      *C_col_buffer = B_col_buffer[j];
+      *C_elements = A_value * B_elements[j];
+    }
+    return;
+  }
+
+  unsigned int row_C_len = 0;
+  if (row_end_A - row_start_A == 2) // directly merge to C:
+  {
+    unsigned int A_col_1 = A_col_buffer[row_start_A];
+    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
+
+    unsigned int B_offset_1 = B_row_buffer[A_col_1];
+    unsigned int B_offset_2 = B_row_buffer[A_col_2];
+
+    row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
+                                B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
+                                B_size2,
+                                C_col_buffer + row_start_C, C_elements + row_start_C);
+    return;
+  }
+#ifdef VIENNACL_WITH_AVX2
+  else if (row_end_A - row_start_A > 10) // safely merge eight rows into temporary buffer:
+  {
+    row_C_len = row_C_scan_numeric_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A), A_elements + row_start_A,
+                                               (const int*)B_row_buffer, (const int*)B_col_buffer, B_elements, int(B_size2),
+                                               (int*)row_C_vector_1, row_C_vector_1_values);
+    row_start_A += 8;
+  }
+#endif
+  else // safely merge two rows into temporary buffer:
+  {
+    unsigned int A_col_1 = A_col_buffer[row_start_A];
+    unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
+
+    unsigned int B_offset_1 = B_row_buffer[A_col_1];
+    unsigned int B_offset_2 = B_row_buffer[A_col_2];
+
+    row_C_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
+                                            B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
+                                            B_size2,
+                                            row_C_vector_1, row_C_vector_1_values);
+    row_start_A += 2;
+  }
+
+  // process remaining rows:
+  while (row_end_A > row_start_A)
+  {
+#ifdef VIENNACL_WITH_AVX2
+    if (row_end_A - row_start_A > 9) // code in other if-conditionals ensures that values get written to C
+    {
+      unsigned int merged_len = row_C_scan_numeric_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A), A_elements + row_start_A,
+                                                               (const int*)B_row_buffer, (const int*)B_col_buffer, B_elements, int(B_size2),
+                                                               (int*)row_C_vector_3, row_C_vector_3_values);
+      row_C_len = row_C_scan_numeric_vector_1(row_C_vector_3, row_C_vector_3 + merged_len, row_C_vector_3_values, NumericT(1.0),
+                                              row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
+                                              B_size2,
+                                              row_C_vector_2, row_C_vector_2_values);
+      row_start_A += 8;
+    }
+    else
+#endif
+    if (row_start_A + 1 == row_end_A) // last row to merge, write directly to C:
+    {
+      unsigned int A_col    = A_col_buffer[row_start_A];
+      unsigned int B_offset = B_row_buffer[A_col];
+
+      row_C_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset, B_col_buffer + B_row_buffer[A_col+1], B_elements + B_offset, A_elements[row_start_A],
+                                              row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
+                                              B_size2,
+                                              C_col_buffer + row_start_C, C_elements + row_start_C);
+      return;
+    }
+    else if (row_start_A + 2 < row_end_A) // at least three more rows left, so merge two
+    {
+      // merge the next two rows of B:
+      unsigned int A_col_1 = A_col_buffer[row_start_A];
+      unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
+
+      unsigned int B_offset_1 = B_row_buffer[A_col_1];
+      unsigned int B_offset_2 = B_row_buffer[A_col_2];
+
+      unsigned int merged_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
+                                                            B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
+                                                            B_size2,
+                                                            row_C_vector_3, row_C_vector_3_values);
+      row_C_len = row_C_scan_numeric_vector_1(row_C_vector_3, row_C_vector_3 + merged_len, row_C_vector_3_values, NumericT(1.0),
+                                              row_C_vector_1, row_C_vector_1 + row_C_len,  row_C_vector_1_values, NumericT(1.0),
+                                              B_size2,
+                                              row_C_vector_2, row_C_vector_2_values);
+      row_start_A += 2;
+    }
+    else
+    {
+      unsigned int A_col    = A_col_buffer[row_start_A];
+      unsigned int B_offset = B_row_buffer[A_col];
+
+      row_C_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset, B_col_buffer + B_row_buffer[A_col+1], B_elements + B_offset, A_elements[row_start_A],
+                                              row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
+                                              B_size2,
+                                              row_C_vector_2, row_C_vector_2_values);
+      ++row_start_A;
+    }
+
+    std::swap(row_C_vector_1,        row_C_vector_2);
+    std::swap(row_C_vector_1_values, row_C_vector_2_values);
+  }
+}
+
+
+} // namespace host_based
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/vector_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/vector_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/vector_operations.hpp
new file mode 100644
index 0000000..b4944a2
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/vector_operations.hpp
@@ -0,0 +1,1188 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_VECTOR_OPERATIONS_HPP_
+#define VIENNACL_LINALG_HOST_BASED_VECTOR_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/vector_operations.hpp
+    @brief Implementations of vector operations using plain single-threaded or OpenMP-enabled execution on the CPU
+*/
+
+#include <cmath>
+#include <algorithm>  //for std::max and std::min
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/linalg/host_based/common.hpp"
+#include "viennacl/linalg/detail/op_applier.hpp"
+#include "viennacl/traits/stride.hpp"
+
+#ifdef VIENNACL_WITH_OPENMP
+#include <omp.h>
+#endif
+
+// Minimum vector size for using OpenMP on vector operations:
+#ifndef VIENNACL_OPENMP_VECTOR_MIN_SIZE
+  #define VIENNACL_OPENMP_VECTOR_MIN_SIZE  5000
+#endif
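+// This threshold only matters when OpenMP is enabled. It can be overridden at compile
+// time before including this header; a hedged example invocation (exact flags depend
+// on the build setup):
+//   g++ -fopenmp -DVIENNACL_WITH_OPENMP -DVIENNACL_OPENMP_VECTOR_MIN_SIZE=10000 ...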
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+namespace detail
+{
+  template<typename NumericT>
+  NumericT flip_sign(NumericT val) { return -val; }
+  inline unsigned long  flip_sign(unsigned long  val) { return val; }
+  inline unsigned int   flip_sign(unsigned int   val) { return val; }
+  inline unsigned short flip_sign(unsigned short val) { return val; }
+  inline unsigned char  flip_sign(unsigned char  val) { return val; }
+}
+
+//
+// Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check them here.
+//
+template<typename DestNumericT, typename SrcNumericT>
+void convert(vector_base<DestNumericT> & dest, vector_base<SrcNumericT> const & src)
+{
+  DestNumericT      * data_dest = detail::extract_raw_pointer<DestNumericT>(dest);
+  SrcNumericT const * data_src  = detail::extract_raw_pointer<SrcNumericT>(src);
+
+  vcl_size_t start_dest = viennacl::traits::start(dest);
+  vcl_size_t inc_dest   = viennacl::traits::stride(dest);
+  vcl_size_t size_dest  = viennacl::traits::size(dest);
+
+  vcl_size_t start_src = viennacl::traits::start(src);
+  vcl_size_t inc_src   = viennacl::traits::stride(src);
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for if (size_dest > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+  for (long i = 0; i < static_cast<long>(size_dest); ++i)
+    data_dest[static_cast<vcl_size_t>(i)*inc_dest+start_dest] = static_cast<DestNumericT>(data_src[static_cast<vcl_size_t>(i)*inc_src+start_src]);
+}
+
+template<typename NumericT, typename ScalarT1>
+void av(vector_base<NumericT> & vec1,
+        vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha)
+{
+  typedef NumericT        value_type;
+
+  value_type       * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+  value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
+
+  value_type data_alpha = alpha;
+  if (flip_sign_alpha)
+    data_alpha = detail::flip_sign(data_alpha);
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+
+  vcl_size_t start2 = viennacl::traits::start(vec2);
+  vcl_size_t inc2   = viennacl::traits::stride(vec2);
+
+  if (reciprocal_alpha)
+  {
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+    for (long i = 0; i < static_cast<long>(size1); ++i)
+      data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha;
+  }
+  else
+  {
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+    for (long i = 0; i < static_cast<long>(size1); ++i)
+      data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha;
+  }
+}
+
+
+template<typename NumericT, typename ScalarT1, typename ScalarT2>
+void avbv(vector_base<NumericT> & vec1,
+          vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t /* len_alpha */, bool reciprocal_alpha, bool flip_sign_alpha,
+          vector_base<NumericT> const & vec3, ScalarT2 const & beta,  vcl_size_t /* len_beta */,  bool reciprocal_beta,  bool flip_sign_beta)
+{
+  typedef NumericT      value_type;
+
+  value_type       * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+  value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
+  value_type const * data_vec3 = detail::extract_raw_pointer<value_type>(vec3);
+
+  value_type data_alpha = alpha;
+  if (flip_sign_alpha)
+    data_alpha = detail::flip_sign(data_alpha);
+
+  value_type data_beta = beta;
+  if (flip_sign_beta)
+    data_beta = detail::flip_sign(data_beta);
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+
+  vcl_size_t start2 = viennacl::traits::start(vec2);
+  vcl_size_t inc2   = viennacl::traits::stride(vec2);
+
+  vcl_size_t start3 = viennacl::traits::start(vec3);
+  vcl_size_t inc3   = viennacl::traits::stride(vec3);
+
+  if (reciprocal_alpha)
+  {
+    if (reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i = 0; i < static_cast<long>(size1); ++i)
+        data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] / data_beta;
+    }
+    else
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i = 0; i < static_cast<long>(size1); ++i)
+        data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] * data_beta;
+    }
+  }
+  else
+  {
+    if (reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i = 0; i < static_cast<long>(size1); ++i)
+        data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] / data_beta;
+    }
+    else
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i = 0; i < static_cast<long>(size1); ++i)
+        data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] * data_beta;
+    }
+  }
+}
+
+
+template<typename NumericT, typename ScalarT1, typename ScalarT2>
+void avbv_v(vector_base<NumericT> & vec1,
+            vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha,
+            vector_base<NumericT> const & vec3, ScalarT2 const & beta,  vcl_size_t /*len_beta*/,  bool reciprocal_beta,  bool flip_sign_beta)
+{
+  typedef NumericT        value_type;
+
+  value_type       * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+  value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
+  value_type const * data_vec3 = detail::extract_raw_pointer<value_type>(vec3);
+
+  value_type data_alpha = alpha;
+  if (flip_sign_alpha)
+    data_alpha = detail::flip_sign(data_alpha);
+
+  value_type data_beta = beta;
+  if (flip_sign_beta)
+    data_beta = detail::flip_sign(data_beta);
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+
+  vcl_size_t start2 = viennacl::traits::start(vec2);
+  vcl_size_t inc2   = viennacl::traits::stride(vec2);
+
+  vcl_size_t start3 = viennacl::traits::start(vec3);
+  vcl_size_t inc3   = viennacl::traits::stride(vec3);
+
+  if (reciprocal_alpha)
+  {
+    if (reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i = 0; i < static_cast<long>(size1); ++i)
+        data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] += data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] / data_beta;
+    }
+    else
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i = 0; i < static_cast<long>(size1); ++i)
+        data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] += data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] * data_beta;
+    }
+  }
+  else
+  {
+    if (reciprocal_beta)
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i = 0; i < static_cast<long>(size1); ++i)
+        data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] += data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] / data_beta;
+    }
+    else
+    {
+#ifdef VIENNACL_WITH_OPENMP
+      #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+      for (long i = 0; i < static_cast<long>(size1); ++i)
+        data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] += data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] * data_beta;
+    }
+  }
+}
+
+
+
+
+/** @brief Assign a constant value to a vector (-range/-slice)
+*
+* @param vec1   The vector to which the value should be assigned
+* @param alpha  The value to be assigned
+* @param up_to_internal_size  Specifies whether alpha should also be written to padded memory (mostly used for clearing the whole buffer).
+*/
+template<typename NumericT>
+void vector_assign(vector_base<NumericT> & vec1, const NumericT & alpha, bool up_to_internal_size = false)
+{
+  typedef NumericT       value_type;
+
+  value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+  vcl_size_t loop_bound  = up_to_internal_size ? vec1.internal_size() : size1;  //Note: Do NOT use traits::internal_size() here, because vector proxies don't require padding.
+
+  value_type data_alpha = static_cast<value_type>(alpha);
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for if (loop_bound > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+  for (long i = 0; i < static_cast<long>(loop_bound); ++i)
+    data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_alpha;
+}
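+// Hedged usage sketch (assumes a host-resident viennacl::vector<double>, which derives
+// from vector_base<double>; the frontend type is not defined in this file):
+//   viennacl::vector<double> v(1000);
+//   viennacl::linalg::host_based::vector_assign(v, 1.0);        // v[i] = 1 for all i
+//   viennacl::linalg::host_based::vector_assign(v, 0.0, true);  // also clears the padded tail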
+
+
+/** @brief Swaps the contents of two vectors; the data is copied element by element.
+*
+* @param vec1   The first vector (or -range, or -slice)
+* @param vec2   The second vector (or -range, or -slice)
+*/
+template<typename NumericT>
+void vector_swap(vector_base<NumericT> & vec1, vector_base<NumericT> & vec2)
+{
+  typedef NumericT      value_type;
+
+  value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+  value_type * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+
+  vcl_size_t start2 = viennacl::traits::start(vec2);
+  vcl_size_t inc2   = viennacl::traits::stride(vec2);
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+  for (long i = 0; i < static_cast<long>(size1); ++i)
+  {
+    value_type temp = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2];
+    data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] = data_vec1[static_cast<vcl_size_t>(i)*inc1+start1];
+    data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = temp;
+  }
+}
+
+
+///////////////////////// Elementwise operations /////////////
+
+/** @brief Implementation of the element-wise operation v1 = v2 .* v3 and v1 = v2 ./ v3    (using MATLAB syntax)
+*
+* @param vec1   The result vector (or -range, or -slice)
+* @param proxy  The proxy object holding v2, v3 and the operation
+*/
+template<typename NumericT, typename OpT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_binary<OpT> > const & proxy)
+{
+  typedef NumericT                                           value_type;
+  typedef viennacl::linalg::detail::op_applier<op_element_binary<OpT> >    OpFunctor;
+
+  value_type       * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+  value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(proxy.lhs());
+  value_type const * data_vec3 = detail::extract_raw_pointer<value_type>(proxy.rhs());
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+
+  vcl_size_t start2 = viennacl::traits::start(proxy.lhs());
+  vcl_size_t inc2   = viennacl::traits::stride(proxy.lhs());
+
+  vcl_size_t start3 = viennacl::traits::start(proxy.rhs());
+  vcl_size_t inc3   = viennacl::traits::stride(proxy.rhs());
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+  for (long i = 0; i < static_cast<long>(size1); ++i)
+    OpFunctor::apply(data_vec1[static_cast<vcl_size_t>(i)*inc1+start1], data_vec2[static_cast<vcl_size_t>(i)*inc2+start2], data_vec3[static_cast<vcl_size_t>(i)*inc3+start3]);
+}
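+// Typical use goes through the ViennaCL frontend rather than calling this backend
+// directly; a hedged sketch (frontend names assumed from the ViennaCL API):
+//   viennacl::vector<double> v1(n), v2(n), v3(n);
+//   v1 = viennacl::linalg::element_prod(v2, v3);  // dispatches to element_op() above on the CPU backend
+//   v1 = viennacl::linalg::element_div(v2, v3);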
+
+/** @brief Implementation of unary element-wise operations v1 = OP(v2), e.g. element-wise sqrt or exp
+*
+* @param vec1   The result vector (or -range, or -slice)
+* @param proxy  The proxy object holding v2 and the unary operation
+*/
+template<typename NumericT, typename OpT>
+void element_op(vector_base<NumericT> & vec1,
+                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<OpT> > const & proxy)
+{
+  typedef NumericT      value_type;
+  typedef viennacl::linalg::detail::op_applier<op_element_unary<OpT> >    OpFunctor;
+
+  value_type       * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+  value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(proxy.lhs());
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+
+  vcl_size_t start2 = viennacl::traits::start(proxy.lhs());
+  vcl_size_t inc2   = viennacl::traits::stride(proxy.lhs());
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+  for (long i = 0; i < static_cast<long>(size1); ++i)
+    OpFunctor::apply(data_vec1[static_cast<vcl_size_t>(i)*inc1+start1], data_vec2[static_cast<vcl_size_t>(i)*inc2+start2]);
+}
+
+
+///////////////////////// Norms and inner product ///////////////////
+
+
+//implementation of inner product:
+
+namespace detail
+{
+
+// The following circumvents problems that arise when trying to use a variable of template-parameter type for an OpenMP reduction.
+// Such behavior is not covered by the OpenMP standard, hence we manually apply some preprocessor magic to resolve the problem.
+// See https://github.com/viennacl/viennacl-dev/issues/112 for a detailed explanation and discussion.
+
+#define VIENNACL_INNER_PROD_IMPL_1(RESULTSCALART, TEMPSCALART) \
+  inline RESULTSCALART inner_prod_impl(RESULTSCALART const * data_vec1, vcl_size_t start1, vcl_size_t inc1, vcl_size_t size1, \
+                                       RESULTSCALART const * data_vec2, vcl_size_t start2, vcl_size_t inc2) { \
+    TEMPSCALART temp = 0;
+
+#define VIENNACL_INNER_PROD_IMPL_2(RESULTSCALART) \
+    for (long i = 0; i < static_cast<long>(size1); ++i) \
+      temp += data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] * data_vec2[static_cast<vcl_size_t>(i)*inc2+start2]; \
+    return static_cast<RESULTSCALART>(temp); \
+  }
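+// For illustration, the float instantiation below effectively expands to (with OpenMP enabled):
+//
+//   inline float inner_prod_impl(float const * data_vec1, vcl_size_t start1, vcl_size_t inc1, vcl_size_t size1,
+//                                float const * data_vec2, vcl_size_t start2, vcl_size_t inc2) {
+//     float temp = 0;
+//     #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+//     for (long i = 0; i < static_cast<long>(size1); ++i)
+//       temp += data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] * data_vec2[static_cast<vcl_size_t>(i)*inc2+start2];
+//     return static_cast<float>(temp);
+//   }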
+
+// char
+VIENNACL_INNER_PROD_IMPL_1(char, int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(char)
+
+VIENNACL_INNER_PROD_IMPL_1(unsigned char, int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(unsigned char)
+
+
+// short
+VIENNACL_INNER_PROD_IMPL_1(short, int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(short)
+
+VIENNACL_INNER_PROD_IMPL_1(unsigned short, int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(unsigned short)
+
+
+// int
+VIENNACL_INNER_PROD_IMPL_1(int, int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(int)
+
+VIENNACL_INNER_PROD_IMPL_1(unsigned int, unsigned int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(unsigned int)
+
+
+// long
+VIENNACL_INNER_PROD_IMPL_1(long, long)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(long)
+
+VIENNACL_INNER_PROD_IMPL_1(unsigned long, unsigned long)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(unsigned long)
+
+
+// float
+VIENNACL_INNER_PROD_IMPL_1(float, float)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(float)
+
+// double
+VIENNACL_INNER_PROD_IMPL_1(double, double)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(double)
+
+#undef VIENNACL_INNER_PROD_IMPL_1
+#undef VIENNACL_INNER_PROD_IMPL_2
+}
+
+/** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
+*
+* @param vec1 The first vector
+* @param vec2 The second vector
+* @param result The result scalar
+*/
+template<typename NumericT, typename ScalarT>
+void inner_prod_impl(vector_base<NumericT> const & vec1,
+                     vector_base<NumericT> const & vec2,
+                     ScalarT & result)
+{
+  typedef NumericT      value_type;
+
+  value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+  value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+
+  vcl_size_t start2 = viennacl::traits::start(vec2);
+  vcl_size_t inc2   = viennacl::traits::stride(vec2);
+
+  result = detail::inner_prod_impl(data_vec1, start1, inc1, size1,
+                                   data_vec2, start2, inc2);  //Note: Assignment to result might be expensive, thus a temporary is introduced here
+}
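+
+// Editorial sketch (not part of the original ViennaCL sources): the raw-pointer kernel
+// detail::inner_prod_impl can also be driven directly on a plain array; the function name
+// 'example_inner_prod_raw' is hypothetical and only illustrates the (start, stride, size)
+// convention used throughout this file.
+inline double example_inner_prod_raw()
+{
+  double x[4] = {1.0, 2.0, 3.0, 4.0};
+  double y[4] = {4.0, 3.0, 2.0, 1.0};
+  // contiguous data: start = 0, stride = 1, four entries -> 1*4 + 2*3 + 3*2 + 4*1 = 20
+  return detail::inner_prod_impl(x, 0, 1, 4, y, 0, 1);
+}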
+
+template<typename NumericT>
+void inner_prod_impl(vector_base<NumericT> const & x,
+                     vector_tuple<NumericT> const & vec_tuple,
+                     vector_base<NumericT> & result)
+{
+  typedef NumericT        value_type;
+
+  value_type const * data_x = detail::extract_raw_pointer<value_type>(x);
+
+  vcl_size_t start_x = viennacl::traits::start(x);
+  vcl_size_t inc_x   = viennacl::traits::stride(x);
+  vcl_size_t size_x  = viennacl::traits::size(x);
+
+  std::vector<value_type> temp(vec_tuple.const_size());
+  std::vector<value_type const *> data_y(vec_tuple.const_size());
+  std::vector<vcl_size_t> start_y(vec_tuple.const_size());
+  std::vector<vcl_size_t> stride_y(vec_tuple.const_size());
+
+  for (vcl_size_t j=0; j<vec_tuple.const_size(); ++j)
+  {
+    data_y[j] = detail::extract_raw_pointer<value_type>(vec_tuple.const_at(j));
+    start_y[j] = viennacl::traits::start(vec_tuple.const_at(j));
+    stride_y[j] = viennacl::traits::stride(vec_tuple.const_at(j));
+  }
+
+  // Note: No OpenMP here, because OpenMP cannot perform a reduction over the array 'temp'. The memory-bandwidth savings of the fused loop are still expected to justify this approach.
+  for (vcl_size_t i = 0; i < size_x; ++i)
+  {
+    value_type entry_x = data_x[i*inc_x+start_x];
+    for (vcl_size_t j=0; j < vec_tuple.const_size(); ++j)
+      temp[j] += entry_x * data_y[j][i*stride_y[j]+start_y[j]];
+  }
+
+  for (vcl_size_t j=0; j < vec_tuple.const_size(); ++j)
+    result[j] = temp[j];  //Note: Assignment to result might be expensive, thus 'temp' is used for accumulation
+}
+
+
+namespace detail
+{
+
+#define VIENNACL_NORM_1_IMPL_1(RESULTSCALART, TEMPSCALART) \
+  inline RESULTSCALART norm_1_impl(RESULTSCALART const * data_vec1, vcl_size_t start1, vcl_size_t inc1, vcl_size_t size1) { \
+    TEMPSCALART temp = 0;
+
+#define VIENNACL_NORM_1_IMPL_2(RESULTSCALART, TEMPSCALART) \
+    for (long i = 0; i < static_cast<long>(size1); ++i) \
+      temp += static_cast<TEMPSCALART>(std::fabs(static_cast<double>(data_vec1[static_cast<vcl_size_t>(i)*inc1+start1]))); \
+    return static_cast<RESULTSCALART>(temp); \
+  }
+
+// char
+VIENNACL_NORM_1_IMPL_1(char, int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(char, int)
+
+VIENNACL_NORM_1_IMPL_1(unsigned char, int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(unsigned char, int)
+
+// short
+VIENNACL_NORM_1_IMPL_1(short, int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(short, int)
+
+VIENNACL_NORM_1_IMPL_1(unsigned short, int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(unsigned short, int)
+
+
+// int
+VIENNACL_NORM_1_IMPL_1(int, int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(int, int)
+
+VIENNACL_NORM_1_IMPL_1(unsigned int, unsigned int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(unsigned int, unsigned int)
+
+
+// long
+VIENNACL_NORM_1_IMPL_1(long, long)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(long, long)
+
+VIENNACL_NORM_1_IMPL_1(unsigned long, unsigned long)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(unsigned long, unsigned long)
+
+
+// float
+VIENNACL_NORM_1_IMPL_1(float, float)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(float, float)
+
+// double
+VIENNACL_NORM_1_IMPL_1(double, double)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(double, double)
+
+#undef VIENNACL_NORM_1_IMPL_1
+#undef VIENNACL_NORM_1_IMPL_2
+
+}
+
+/** @brief Computes the l^1-norm of a vector
+*
+* @param vec1 The vector
+* @param result The result scalar
+*/
+template<typename NumericT, typename ScalarT>
+void norm_1_impl(vector_base<NumericT> const & vec1,
+                 ScalarT & result)
+{
+  typedef NumericT        value_type;
+
+  value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+
+  result = detail::norm_1_impl(data_vec1, start1, inc1, size1);  //Note: Assignment to result might be expensive, thus using a temporary for accumulation
+}
+
+
+
+namespace detail
+{
+
+#define VIENNACL_NORM_2_IMPL_1(RESULTSCALART, TEMPSCALART) \
+  inline RESULTSCALART norm_2_impl(RESULTSCALART const * data_vec1, vcl_size_t start1, vcl_size_t inc1, vcl_size_t size1) { \
+    TEMPSCALART temp = 0;
+
+#define VIENNACL_NORM_2_IMPL_2(RESULTSCALART, TEMPSCALART) \
+    for (long i = 0; i < static_cast<long>(size1); ++i) { \
+      RESULTSCALART data = data_vec1[static_cast<vcl_size_t>(i)*inc1+start1]; \
+      temp += static_cast<TEMPSCALART>(data * data); \
+    } \
+    return static_cast<RESULTSCALART>(temp); \
+  }
+
+// char
+VIENNACL_NORM_2_IMPL_1(char, int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(char, int)
+
+VIENNACL_NORM_2_IMPL_1(unsigned char, int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(unsigned char, int)
+
+
+// short
+VIENNACL_NORM_2_IMPL_1(short, int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(short, int)
+
+VIENNACL_NORM_2_IMPL_1(unsigned short, int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(unsigned short, int)
+
+
+// int
+VIENNACL_NORM_2_IMPL_1(int, int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(int, int)
+
+VIENNACL_NORM_2_IMPL_1(unsigned int, unsigned int)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(unsigned int, unsigned int)
+
+
+// long
+VIENNACL_NORM_2_IMPL_1(long, long)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(long, long)
+
+VIENNACL_NORM_2_IMPL_1(unsigned long, unsigned long)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(unsigned long, unsigned long)
+
+
+// float
+VIENNACL_NORM_2_IMPL_1(float, float)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(float, float)
+
+// double
+VIENNACL_NORM_2_IMPL_1(double, double)
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(double, double)
+
+#undef VIENNACL_NORM_2_IMPL_1
+#undef VIENNACL_NORM_2_IMPL_2
+
+}
+
+
+/** @brief Computes the l^2-norm of a vector - implementation
+*
+* @param vec1 The vector
+* @param result The result scalar
+*/
+template<typename NumericT, typename ScalarT>
+void norm_2_impl(vector_base<NumericT> const & vec1,
+                 ScalarT & result)
+{
+  typedef NumericT       value_type;
+
+  value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+
+  result = std::sqrt(detail::norm_2_impl(data_vec1, start1, inc1, size1));  //Note: Assignment to result might be expensive, thus the accumulation happens inside detail::norm_2_impl
+}
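+
+// Editorial sketch (not part of the original ViennaCL sources): detail::norm_2_impl returns
+// the accumulated *squared* sum; the square root is applied only in the wrapper above.
+// The helper name 'example_norm_2_squared' is hypothetical.
+inline double example_norm_2_squared()
+{
+  double v[2] = {3.0, 4.0};
+  return detail::norm_2_impl(v, 0, 1, 2);  // yields 3*3 + 4*4 = 25; std::sqrt(25) = 5 is the norm
+}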
+
+/** @brief Computes the supremum-norm of a vector
+*
+* @param vec1 The vector
+* @param result The result scalar
+*/
+template<typename NumericT, typename ScalarT>
+void norm_inf_impl(vector_base<NumericT> const & vec1,
+                   ScalarT & result)
+{
+  typedef NumericT       value_type;
+
+  value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+
+  vcl_size_t thread_count=1;
+
+  #ifdef VIENNACL_WITH_OPENMP
+  if(size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+      thread_count = omp_get_max_threads();
+  #endif
+
+  std::vector<value_type> temp(thread_count);
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+  {
+    vcl_size_t id = 0;
+#ifdef VIENNACL_WITH_OPENMP
+    id = omp_get_thread_num();
+#endif
+
+    vcl_size_t begin = (size1 * id) / thread_count;
+    vcl_size_t end   = (size1 * (id + 1)) / thread_count;
+    temp[id]         = 0;
+
+    for (vcl_size_t i = begin; i < end; ++i)
+      temp[id] = std::max<value_type>(temp[id], static_cast<value_type>(std::fabs(static_cast<double>(data_vec1[i*inc1+start1]))));  //casting to double in order to avoid problems if T is an integer type
+  }
+  for (vcl_size_t i = 1; i < thread_count; ++i)
+    temp[0] = std::max<value_type>( temp[0], temp[i]);
+  result  = temp[0];
+}
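+
+// Editorial note (worked example, not part of the original ViennaCL sources): the manual
+// partitioning used above (and in index_norm_inf, max_impl and min_impl below) assigns
+// thread 'id' the index range [size1*id/thread_count, size1*(id+1)/thread_count).
+// For size1 = 10 and thread_count = 4 this yields the segments
+//   thread 0: [0, 2),  thread 1: [2, 5),  thread 2: [5, 7),  thread 3: [7, 10),
+// i.e. every entry is covered exactly once even when size1 is not divisible by thread_count.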
+
+//This function should return a CPU scalar, otherwise statements like
+// vcl_rhs[index_norm_inf(vcl_rhs)]
+// are ambiguous
+/** @brief Computes the index of the first entry that is equal to the supremum-norm in modulus.
+*
+* @param vec1 The vector
+* @return The result index. Note that the result must be a CPU scalar (vcl_size_t), since GPU scalars are floating point types.
+*/
+template<typename NumericT>
+vcl_size_t index_norm_inf(vector_base<NumericT> const & vec1)
+{
+  typedef NumericT      value_type;
+
+  value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+  vcl_size_t thread_count=1;
+
+#ifdef VIENNACL_WITH_OPENMP
+  if(size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+      thread_count = omp_get_max_threads();
+#endif
+
+  std::vector<value_type> temp(thread_count);
+  std::vector<vcl_size_t> index(thread_count);
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+  {
+    vcl_size_t id = 0;
+#ifdef VIENNACL_WITH_OPENMP
+    id = omp_get_thread_num();
+#endif
+    vcl_size_t begin = (size1 * id) / thread_count;
+    vcl_size_t end   = (size1 * (id + 1)) / thread_count;
+    index[id]        = start1;
+    temp[id]         = 0;
+    value_type data;
+
+    for (vcl_size_t i = begin; i < end; ++i)
+    {
+      data = static_cast<value_type>(std::fabs(static_cast<double>(data_vec1[i*inc1+start1])));  //casting to double in order to avoid problems if T is an integer type
+      if (data > temp[id])
+      {
+        index[id] = i;
+        temp[id]  = data;
+      }
+    }
+  }
+  for (vcl_size_t i = 1; i < thread_count; ++i)
+  {
+    if (temp[i] > temp[0])
+    {
+      index[0] = index[i];
+      temp[0] = temp[i];
+    }
+  }
+  return index[0];
+}
+
+/** @brief Computes the maximum of a vector
+*
+* @param vec1 The vector
+* @param result The result scalar
+*/
+template<typename NumericT, typename ScalarT>
+void max_impl(vector_base<NumericT> const & vec1,
+              ScalarT & result)
+{
+  typedef NumericT       value_type;
+
+  value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+
+  vcl_size_t thread_count=1;
+
+#ifdef VIENNACL_WITH_OPENMP
+  if(size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+      thread_count = omp_get_max_threads();
+#endif
+
+  std::vector<value_type> temp(thread_count);
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+  {
+    vcl_size_t id = 0;
+#ifdef VIENNACL_WITH_OPENMP
+    id = omp_get_thread_num();
+#endif
+    vcl_size_t begin = (size1 * id) / thread_count;
+    vcl_size_t end   = (size1 * (id + 1)) / thread_count;
+    temp[id]         = data_vec1[start1];
+
+    for (vcl_size_t i = begin; i < end; ++i)
+    {
+      value_type v = data_vec1[i*inc1+start1];  //Note: evaluating the strided access inside std::max might be expensive, thus 'v' is used as a temporary
+      temp[id] = std::max<value_type>(temp[id],v);
+    }
+  }
+  for (vcl_size_t i = 1; i < thread_count; ++i)
+    temp[0] = std::max<value_type>( temp[0], temp[i]);
+  result  = temp[0];//Note: Assignment to result might be expensive, thus 'temp' is used for accumulation
+}
+
+/** @brief Computes the minimum of a vector
+*
+* @param vec1 The vector
+* @param result The result scalar
+*/
+template<typename NumericT, typename ScalarT>
+void min_impl(vector_base<NumericT> const & vec1,
+              ScalarT & result)
+{
+  typedef NumericT       value_type;
+
+  value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+
+  vcl_size_t thread_count=1;
+
+#ifdef VIENNACL_WITH_OPENMP
+  if(size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+      thread_count = omp_get_max_threads();
+#endif
+
+  std::vector<value_type> temp(thread_count);
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+  {
+    vcl_size_t id = 0;
+#ifdef VIENNACL_WITH_OPENMP
+    id = omp_get_thread_num();
+#endif
+    vcl_size_t begin = (size1 * id) / thread_count;
+    vcl_size_t end   = (size1 * (id + 1)) / thread_count;
+    temp[id]         = data_vec1[start1];
+
+    for (vcl_size_t i = begin; i < end; ++i)
+    {
+      value_type v = data_vec1[i*inc1+start1];  //Note: evaluating the strided access inside std::min might be expensive, thus 'v' is used as a temporary
+      temp[id] = std::min<value_type>(temp[id],v);
+    }
+  }
+  for (vcl_size_t i = 1; i < thread_count; ++i)
+    temp[0] = std::min<value_type>( temp[0], temp[i]);
+  result  = temp[0];//Note: Assignment to result might be expensive, thus 'temp' is used for accumulation
+}
+
+/** @brief Computes the sum of all elements from the vector
+*
+* @param vec1 The vector
+* @param result The result scalar
+*/
+template<typename NumericT, typename ScalarT>
+void sum_impl(vector_base<NumericT> const & vec1,
+              ScalarT & result)
+{
+  typedef NumericT       value_type;
+
+  value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+
+  value_type temp = 0;
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for reduction(+:temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+  for (long i = 0; i < static_cast<long>(size1); ++i)
+    temp += data_vec1[static_cast<vcl_size_t>(i)*inc1+start1];
+
+  result = temp;  //Note: Assignment to result might be expensive, thus 'temp' is used for accumulation
+}
+
+/** @brief Computes a plane rotation of two vectors.
+*
+* Computes (x,y) <- (alpha * x + beta * y, -beta * x + alpha * y)
+*
+* @param vec1   The first vector
+* @param vec2   The second vector
+* @param alpha  The first transformation coefficient
+* @param beta   The second transformation coefficient
+*/
+template<typename NumericT>
+void plane_rotation(vector_base<NumericT> & vec1,
+                    vector_base<NumericT> & vec2,
+                    NumericT alpha, NumericT beta)
+{
+  typedef NumericT  value_type;
+
+  value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+  value_type * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
+
+  vcl_size_t start1 = viennacl::traits::start(vec1);
+  vcl_size_t inc1   = viennacl::traits::stride(vec1);
+  vcl_size_t size1  = viennacl::traits::size(vec1);
+
+  vcl_size_t start2 = viennacl::traits::start(vec2);
+  vcl_size_t inc2   = viennacl::traits::stride(vec2);
+
+  value_type data_alpha = alpha;
+  value_type data_beta  = beta;
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+  for (long i = 0; i < static_cast<long>(size1); ++i)
+  {
+    value_type temp1 = data_vec1[static_cast<vcl_size_t>(i)*inc1+start1];
+    value_type temp2 = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2];
+
+    data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_alpha * temp1 + data_beta * temp2;
+    data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] = data_alpha * temp2 - data_beta * temp1;
+  }
+}
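+
+// Editorial note (worked example, not part of the original ViennaCL sources): for
+// alpha = 0.6, beta = 0.8 (so alpha^2 + beta^2 = 1, i.e. a proper Givens rotation),
+// x = (1, 0) and y = (0, 1), the loop above produces
+//   x <- alpha*x + beta*y = ( 0.6, 0.8)
+//   y <- alpha*y - beta*x = (-0.8, 0.6)
+// where the original entries of x and y are used on the right-hand side (hence the
+// temporaries temp1/temp2 in the loop body).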
+
+namespace detail
+{
+  /** @brief Implementation of inclusive_scan and exclusive_scan for the host (OpenMP) backend. */
+  template<typename NumericT>
+  void vector_scan_impl(vector_base<NumericT> const & vec1,
+                        vector_base<NumericT>       & vec2,
+                        bool is_inclusive)
+  {
+    NumericT const * data_vec1 = detail::extract_raw_pointer<NumericT>(vec1);
+    NumericT       * data_vec2 = detail::extract_raw_pointer<NumericT>(vec2);
+
+    vcl_size_t start1 = viennacl::traits::start(vec1);
+    vcl_size_t inc1   = viennacl::traits::stride(vec1);
+    vcl_size_t size1  = viennacl::traits::size(vec1);
+    if (size1 < 1)
+      return;
+
+    vcl_size_t start2 = viennacl::traits::start(vec2);
+    vcl_size_t inc2   = viennacl::traits::stride(vec2);
+
+#ifdef VIENNACL_WITH_OPENMP
+    if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+    {
+      std::vector<NumericT> thread_results(omp_get_max_threads());
+
+      // inclusive scan each thread segment:
+      #pragma omp parallel
+      {
+        vcl_size_t work_per_thread = (size1 - 1) / thread_results.size() + 1;
+        vcl_size_t thread_start = work_per_thread * omp_get_thread_num();
+        vcl_size_t thread_stop  = std::min<vcl_size_t>(thread_start + work_per_thread, size1);
+
+        NumericT thread_sum = 0;
+        for(vcl_size_t i = thread_start; i < thread_stop; i++)
+          thread_sum += data_vec1[i * inc1 + start1];
+
+        thread_results[omp_get_thread_num()] = thread_sum;
+      }
+
+      // exclusive-scan of thread results:
+      NumericT current_offset = 0;
+      for (vcl_size_t i=0; i<thread_results.size(); ++i)
+      {
+        NumericT tmp = thread_results[i];
+        thread_results[i] = current_offset;
+        current_offset += tmp;
+      }
+
+      // exclusive/inclusive scan of each segment with correct offset:
+      #pragma omp parallel
+      {
+        vcl_size_t work_per_thread = (size1 - 1) / thread_results.size() + 1;
+        vcl_size_t thread_start = work_per_thread * omp_get_thread_num();
+        vcl_size_t thread_stop  = std::min<vcl_size_t>(thread_start + work_per_thread, size1);
+
+        NumericT thread_sum = thread_results[omp_get_thread_num()];
+        if (is_inclusive)
+        {
+          for(vcl_size_t i = thread_start; i < thread_stop; i++)
+          {
+            thread_sum += data_vec1[i * inc1 + start1];
+            data_vec2[i * inc2 + start2] = thread_sum;
+          }
+        }
+        else
+        {
+          for(vcl_size_t i = thread_start; i < thread_stop; i++)
+          {
+            NumericT tmp = data_vec1[i * inc1 + start1];
+            data_vec2[i * inc2 + start2] = thread_sum;
+            thread_sum += tmp;
+          }
+        }
+      }
+    } else
+#endif
+    {
+      NumericT sum = 0;
+      if (is_inclusive)
+      {
+        for(vcl_size_t i = 0; i < size1; i++)
+        {
+          sum += data_vec1[i * inc1 + start1];
+          data_vec2[i * inc2 + start2] = sum;
+        }
+      }
+      else
+      {
+        for(vcl_size_t i = 0; i < size1; i++)
+        {
+          NumericT tmp = data_vec1[i * inc1 + start1];
+          data_vec2[i * inc2 + start2] = sum;
+          sum += tmp;
+        }
+      }
+    }
+
+  }
+}
+
+/** @brief This function implements an inclusive scan on the host using OpenMP.
+*
+* Given an element vector (x_0, x_1, ..., x_{n-1}),
+* this routine computes (x_0, x_0 + x_1, ..., x_0 + x_1 + ... + x_{n-1})
+*
+* @param vec1       The input vector (not modified by the routine).
+* @param vec2       The output vector. Either identical to vec1 or non-overlapping.
+*/
+template<typename NumericT>
+void inclusive_scan(vector_base<NumericT> const & vec1,
+                    vector_base<NumericT>       & vec2)
+{
+  detail::vector_scan_impl(vec1, vec2, true);
+}
+
+/** @brief This function implements an exclusive scan on the host using OpenMP.
+*
+* Given an element vector (x_0, x_1, ..., x_{n-1}),
+* this routine computes (0, x_0, x_0 + x_1, ..., x_0 + x_1 + ... + x_{n-2})
+*
+* @param vec1       The input vector (not modified by the routine).
+* @param vec2       The output vector. Either identical to vec1 or non-overlapping.
+*/
+template<typename NumericT>
+void exclusive_scan(vector_base<NumericT> const & vec1,
+                    vector_base<NumericT>       & vec2)
+{
+  detail::vector_scan_impl(vec1, vec2, false);
+}
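+
+// Editorial sketch (not part of the original ViennaCL sources): for an input vector
+// holding (1, 2, 3, 4), the two scans above produce
+//   inclusive_scan: (1, 3, 6, 10)
+//   exclusive_scan: (0, 1, 3, 6)
+// A typical invocation, assuming both vectors already reside in main memory:
+//
+//   viennacl::vector<double> v1(4), v2(4);
+//   // ... fill v1 ...
+//   viennacl::linalg::host_based::inclusive_scan(v1, v2);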
+
+
+} //namespace host_based
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/ichol.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/ichol.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/ichol.hpp
new file mode 100644
index 0000000..1038b2b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/ichol.hpp
@@ -0,0 +1,228 @@
+#ifndef VIENNACL_LINALG_ICHOL_HPP_
+#define VIENNACL_LINALG_ICHOL_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/ichol.hpp
+  @brief Implementations of incomplete Cholesky factorization preconditioners with static nonzero pattern.
+*/
+
+#include <vector>
+#include <cmath>
+#include <iostream>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/compressed_matrix.hpp"
+
+#include "viennacl/linalg/host_based/common.hpp"
+
+#include <map>
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief A tag for incomplete Cholesky factorization with static pattern (ICHOL0)
+*/
+class ichol0_tag {};
+
+
+/** @brief Implementation of an incomplete Cholesky preconditioner with static pattern (ICHOL0). Optimized version for CSR matrices.
+  *
+  *  Refer to Chih-Jen Lin and Jorge J. Moré, Incomplete Cholesky Factorizations with Limited Memory, SIAM J. Sci. Comput., 21(1), 24–45,
+  *  for one of many descriptions of incomplete Cholesky factorizations.
+  *
+  *  @param A       The input matrix in CSR format
+  *  // param tag     An ichol0_tag in order to dispatch among several other preconditioners.
+  */
+template<typename NumericT>
+void precondition(viennacl::compressed_matrix<NumericT> & A, ichol0_tag const & /* tag */)
+{
+  assert( (viennacl::traits::context(A).memory_type() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ICHOL0") );
+
+  NumericT           * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
+  unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+  unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+
+  //std::cout << A.size1() << std::endl;
+  for (vcl_size_t i=0; i<A.size1(); ++i)
+  {
+    unsigned int row_i_begin = row_buffer[i];
+    unsigned int row_i_end   = row_buffer[i+1];
+
+    // get a_ii:
+    NumericT a_ii = 0;
+    for (unsigned int buf_index_aii = row_i_begin; buf_index_aii < row_i_end; ++buf_index_aii)
+    {
+      if (col_buffer[buf_index_aii] == i)
+      {
+        a_ii = std::sqrt(elements[buf_index_aii]);
+        elements[buf_index_aii] = a_ii;
+        break;
+      }
+    }
+
+    // Now scale column/row i, i.e. A(k, i) /= A(i, i)
+    for (unsigned int buf_index_aii = row_i_begin; buf_index_aii < row_i_end; ++buf_index_aii)
+    {
+      if (col_buffer[buf_index_aii] > i)
+        elements[buf_index_aii] /= a_ii;
+    }
+
+    // Now compute A(k, j) -= A(k, i) * A(j, i) for all nonzero k, j in column i:
+    for (unsigned int buf_index_j = row_i_begin; buf_index_j < row_i_end; ++buf_index_j)
+    {
+      unsigned int j = col_buffer[buf_index_j];
+      if (j <= i)
+        continue;
+
+      NumericT a_ji = elements[buf_index_j];
+
+      for (unsigned int buf_index_k = row_i_begin; buf_index_k < row_i_end; ++buf_index_k)
+      {
+        unsigned int k = col_buffer[buf_index_k];
+        if (k < j)
+          continue;
+
+        NumericT a_ki = elements[buf_index_k];
+
+        //Now check whether A(k, j) is in nonzero pattern:
+        unsigned int row_j_begin = row_buffer[j];
+        unsigned int row_j_end   = row_buffer[j+1];
+        for (unsigned int buf_index_kj = row_j_begin; buf_index_kj < row_j_end; ++buf_index_kj)
+        {
+          if (col_buffer[buf_index_kj] == k)
+          {
+            elements[buf_index_kj] -= a_ki * a_ji;
+            break;
+          }
+        }
+      }
+    }
+
+  }
+
+}
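+
+// Editorial sketch (not part of the original ViennaCL sources): the routine above factors
+// A in place, so a typical direct use copies the matrix to main memory first, e.g.
+//
+//   viennacl::compressed_matrix<double> A(n, n, viennacl::context(viennacl::MAIN_MEMORY));  // 'n' is a placeholder dimension
+//   // ... fill A (symmetric positive definite, CSR) ...
+//   viennacl::linalg::precondition(A, viennacl::linalg::ichol0_tag());
+//
+// Afterwards the factor L is held in the upper triangular part of A, in the column-oriented
+// sense described in the note inside ichol0_precond::apply() below.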
+
+
+/** @brief Incomplete Cholesky preconditioner class with static pattern (ICHOL0), can be supplied to solve()-routines
+*/
+template<typename MatrixT>
+class ichol0_precond
+{
+  typedef typename MatrixT::value_type      NumericType;
+
+public:
+  ichol0_precond(MatrixT const & mat, ichol0_tag const & tag) : tag_(tag), LLT(mat.size1(), mat.size2(), viennacl::context(viennacl::MAIN_MEMORY))
+  {
+      //initialize preconditioner:
+      //std::cout << "Start CPU precond" << std::endl;
+      init(mat);
+      //std::cout << "End CPU precond" << std::endl;
+  }
+
+  template<typename VectorT>
+  void apply(VectorT & vec) const
+  {
+    unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LLT.handle1());
+    unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LLT.handle2());
+    NumericType  const * elements   = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericType>(LLT.handle());
+
+    // Note: L is stored in a column-oriented fashion, i.e. transposed w.r.t. the row-oriented layout. Thus, the factorization A = L L^T holds L in the upper triangular part of A.
+    viennacl::linalg::host_based::detail::csr_trans_inplace_solve<NumericType>(row_buffer, col_buffer, elements, vec, LLT.size2(), lower_tag());
+    viennacl::linalg::host_based::detail::csr_inplace_solve<NumericType>(row_buffer, col_buffer, elements, vec, LLT.size2(), upper_tag());
+  }
+
+private:
+  void init(MatrixT const & mat)
+  {
+    viennacl::context host_ctx(viennacl::MAIN_MEMORY);
+    viennacl::switch_memory_context(LLT, host_ctx);
+
+    viennacl::copy(mat, LLT);
+    viennacl::linalg::precondition(LLT, tag_);
+  }
+
+  ichol0_tag const & tag_;
+  viennacl::compressed_matrix<NumericType> LLT;
+};
+
+
+/** @brief ICHOL0 preconditioner class, can be supplied to solve()-routines.
+*
+*  Specialization for compressed_matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+class ichol0_precond< compressed_matrix<NumericT, AlignmentV> >
+{
+  typedef compressed_matrix<NumericT, AlignmentV>   MatrixType;
+
+public:
+  ichol0_precond(MatrixType const & mat, ichol0_tag const & tag) : tag_(tag), LLT(mat.size1(), mat.size2(), viennacl::traits::context(mat))
+  {
+    //initialize preconditioner:
+    //std::cout << "Start GPU precond" << std::endl;
+    init(mat);
+    //std::cout << "End GPU precond" << std::endl;
+  }
+
+  void apply(vector<NumericT> & vec) const
+  {
+    if (viennacl::traits::context(vec).memory_type() != viennacl::MAIN_MEMORY)
+    {
+      viennacl::context host_ctx(viennacl::MAIN_MEMORY);
+      viennacl::context old_ctx = viennacl::traits::context(vec);
+
+      viennacl::switch_memory_context(vec, host_ctx);
+      viennacl::linalg::inplace_solve(trans(LLT), vec, lower_tag());
+      viennacl::linalg::inplace_solve(      LLT , vec, upper_tag());
+      viennacl::switch_memory_context(vec, old_ctx);
+    }
+    else //apply ICHOL0 directly:
+    {
+      // Note: L is stored in a column-oriented fashion, i.e. transposed w.r.t. the row-oriented layout. Thus, the factorization A = L L^T holds L in the upper triangular part of A.
+      viennacl::linalg::inplace_solve(trans(LLT), vec, lower_tag());
+      viennacl::linalg::inplace_solve(      LLT , vec, upper_tag());
+    }
+  }
+
+private:
+  void init(MatrixType const & mat)
+  {
+    viennacl::context host_ctx(viennacl::MAIN_MEMORY);
+    viennacl::switch_memory_context(LLT, host_ctx);
+    LLT = mat;
+
+    viennacl::linalg::precondition(LLT, tag_);
+  }
+
+  ichol0_tag const & tag_;
+  viennacl::compressed_matrix<NumericT> LLT;
+};
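+
+// Editorial sketch (not part of the original ViennaCL sources; follows the usual ViennaCL
+// preconditioner interface): the class above is meant to be handed to an iterative solver,
+// for instance a preconditioned conjugate gradient solve:
+//
+//   viennacl::compressed_matrix<double> A;   // symmetric positive definite system matrix
+//   viennacl::vector<double> rhs;            // right-hand side
+//   // ... set up A and rhs ...
+//   viennacl::linalg::ichol0_precond< viennacl::compressed_matrix<double> > precond(A, viennacl::linalg::ichol0_tag());
+//   viennacl::vector<double> x = viennacl::linalg::solve(A, rhs, viennacl::linalg::cg_tag(), precond);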
+
+}
+}
+
+
+
+
+#endif
+
+
+