You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ap...@apache.org on 2016/06/08 21:40:14 UTC
[17/51] [partial] mahout git commit: (nojira) add native-viennaCL
module to codebase. closes apache/mahout#241
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/spgemm_vector.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/spgemm_vector.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/spgemm_vector.hpp
new file mode 100644
index 0000000..56e3c14
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/spgemm_vector.hpp
@@ -0,0 +1,705 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_SPGEMM_VECTOR_HPP_
+#define VIENNACL_LINALG_HOST_BASED_SPGEMM_VECTOR_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/sparse_matrix_operations.hpp
+ @brief Implementations of operations using sparse matrices on the CPU using a single thread or OpenMP.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/linalg/host_based/common.hpp"
+
+
+#ifdef VIENNACL_WITH_AVX2
+#include "immintrin.h"
+#endif
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+
+
+
+#ifdef VIENNACL_WITH_AVX2
+inline
+unsigned int row_C_scan_symbolic_vector_AVX2(int const *row_indices_B_begin, int const *row_indices_B_end,
+ int const *B_row_buffer, int const *B_col_buffer, int B_size2,
+ int *row_C_vector_output)
+{
+ __m256i avx_all_ones = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
+ __m256i avx_all_bsize2 = _mm256_set_epi32(B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2);
+
+ __m256i avx_row_indices_offsets = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+ __m256i avx_load_mask = _mm256_sub_epi32(avx_row_indices_offsets, _mm256_set1_epi32(row_indices_B_end - row_indices_B_begin));
+ __m256i avx_load_mask2 = avx_load_mask;
+
+ __m256i avx_row_indices = _mm256_set1_epi32(0);
+ avx_row_indices = _mm256_mask_i32gather_epi32(avx_row_indices, row_indices_B_begin, avx_row_indices_offsets, avx_load_mask, 4);
+ avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+ __m256i avx_row_start = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer, avx_row_indices, avx_load_mask, 4);
+ avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+ __m256i avx_row_end = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer+1, avx_row_indices, avx_load_mask, 4);
+
+ avx_load_mask = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
+ __m256i avx_index_front = avx_all_bsize2;
+ avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);
+
+ int *output_ptr = row_C_vector_output;
+
+ while (1)
+ {
+ // get minimum index in current front:
+ __m256i avx_index_min1 = avx_index_front;
+ __m256i avx_temp = _mm256_permutevar8x32_epi32(avx_index_min1, _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4));
+ avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first four elements compared against last four elements
+
+ avx_temp = _mm256_shuffle_epi32(avx_index_min1, int(78)); // 0b01001110 = 78, using shuffle instead of permutevar here because of lower latency
+ avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first two elements compared against elements three and four (same for upper half of register)
+
+ avx_temp = _mm256_shuffle_epi32(avx_index_min1, int(177)); // 0b10110001 = 177, using shuffle instead of permutevar here because of lower latency
+ avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // now all entries of avx_index_min1 hold the minimum
+
+ int min_index_in_front = ((int*)&avx_index_min1)[0];
+ // check for end of merge operation:
+ if (min_index_in_front == B_size2)
+ break;
+
+ // write current entry:
+ *output_ptr = min_index_in_front;
+ ++output_ptr;
+
+ // advance index front where equal to minimum index:
+ avx_load_mask = _mm256_cmpeq_epi32(avx_index_front, avx_index_min1);
+ // first part: set index to B_size2 if equal to minimum index:
+ avx_temp = _mm256_and_si256(avx_all_bsize2, avx_load_mask);
+ avx_index_front = _mm256_max_epi32(avx_index_front, avx_temp);
+ // second part: increment row_start registers where minimum found:
+ avx_temp = _mm256_and_si256(avx_all_ones, avx_load_mask); //ones only where the minimum was found
+ avx_row_start = _mm256_add_epi32(avx_row_start, avx_temp);
+ // third part part: load new data where more entries available:
+ avx_load_mask = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
+ avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);
+ }
+
+ return static_cast<unsigned int>(output_ptr - row_C_vector_output);
+}
+#endif
+
+/** @brief Merges up to IndexNum rows from B into the result buffer.
+*
+* Because the input buffer also needs to be considered, this routine actually works on an index front of length (IndexNum+1)
+**/
+template<unsigned int IndexNum>
+unsigned int row_C_scan_symbolic_vector_N(unsigned int const *row_indices_B,
+ unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, unsigned int B_size2,
+ unsigned int const *row_C_vector_input, unsigned int const *row_C_vector_input_end,
+ unsigned int *row_C_vector_output)
+{
+ unsigned int index_front[IndexNum+1];
+ unsigned int const *index_front_start[IndexNum+1];
+ unsigned int const *index_front_end[IndexNum+1];
+
+ // Set up pointers for loading the indices:
+ for (unsigned int i=0; i<IndexNum; ++i, ++row_indices_B)
+ {
+ index_front_start[i] = B_col_buffer + B_row_buffer[*row_indices_B];
+ index_front_end[i] = B_col_buffer + B_row_buffer[*row_indices_B + 1];
+ }
+ index_front_start[IndexNum] = row_C_vector_input;
+ index_front_end[IndexNum] = row_C_vector_input_end;
+
+ // load indices:
+ for (unsigned int i=0; i<=IndexNum; ++i)
+ index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;
+
+ unsigned int *output_ptr = row_C_vector_output;
+
+ while (1)
+ {
+ // get minimum index in current front:
+ unsigned int min_index_in_front = B_size2;
+ for (unsigned int i=0; i<=IndexNum; ++i)
+ min_index_in_front = std::min(min_index_in_front, index_front[i]);
+
+ if (min_index_in_front == B_size2) // we're done
+ break;
+
+ // advance index front where equal to minimum index:
+ for (unsigned int i=0; i<=IndexNum; ++i)
+ {
+ if (index_front[i] == min_index_in_front)
+ {
+ index_front_start[i] += 1;
+ index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;
+ }
+ }
+
+ // write current entry:
+ *output_ptr = min_index_in_front;
+ ++output_ptr;
+ }
+
+ return static_cast<unsigned int>(output_ptr - row_C_vector_output);
+}
+
+struct spgemm_output_write_enabled { static void apply(unsigned int *ptr, unsigned int value) { *ptr = value; } };
+struct spgemm_output_write_disabled { static void apply(unsigned int * , unsigned int ) { } };
+
+template<typename OutputWriterT>
+unsigned int row_C_scan_symbolic_vector_1(unsigned int const *input1_begin, unsigned int const *input1_end,
+ unsigned int const *input2_begin, unsigned int const *input2_end,
+ unsigned int termination_index,
+ unsigned int *output_begin)
+{
+ unsigned int *output_ptr = output_begin;
+
+ unsigned int val_1 = (input1_begin < input1_end) ? *input1_begin : termination_index;
+ unsigned int val_2 = (input2_begin < input2_end) ? *input2_begin : termination_index;
+ while (1)
+ {
+ unsigned int min_index = std::min(val_1, val_2);
+
+ if (min_index == termination_index)
+ break;
+
+ if (min_index == val_1)
+ {
+ ++input1_begin;
+ val_1 = (input1_begin < input1_end) ? *input1_begin : termination_index;
+ }
+
+ if (min_index == val_2)
+ {
+ ++input2_begin;
+ val_2 = (input2_begin < input2_end) ? *input2_begin : termination_index;
+ }
+
+ // write current entry:
+ OutputWriterT::apply(output_ptr, min_index); // *output_ptr = min_index; if necessary
+ ++output_ptr;
+ }
+
+ return static_cast<unsigned int>(output_ptr - output_begin);
+}
+
+inline
+unsigned int row_C_scan_symbolic_vector(unsigned int row_start_A, unsigned int row_end_A, unsigned int const *A_col_buffer,
+ unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, unsigned int B_size2,
+ unsigned int *row_C_vector_1, unsigned int *row_C_vector_2, unsigned int *row_C_vector_3)
+{
+ // Trivial case: row length 0:
+ if (row_start_A == row_end_A)
+ return 0;
+
+ // Trivial case: row length 1:
+ if (row_end_A - row_start_A == 1)
+ {
+ unsigned int A_col = A_col_buffer[row_start_A];
+ return B_row_buffer[A_col + 1] - B_row_buffer[A_col];
+ }
+
+ // Optimizations for row length 2:
+ unsigned int row_C_len = 0;
+ if (row_end_A - row_start_A == 2)
+ {
+ unsigned int A_col_1 = A_col_buffer[row_start_A];
+ unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
+ return row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(B_col_buffer + B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
+ B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
+ B_size2,
+ row_C_vector_1);
+ }
+ else // for more than two rows we can safely merge the first two:
+ {
+#ifdef VIENNACL_WITH_AVX2
+ row_C_len = row_C_scan_symbolic_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A),
+ (const int*)B_row_buffer, (const int*)B_col_buffer, int(B_size2),
+ (int*)row_C_vector_1);
+ row_start_A += 8;
+#else
+ unsigned int A_col_1 = A_col_buffer[row_start_A];
+ unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
+ row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
+ B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
+ B_size2,
+ row_C_vector_1);
+ row_start_A += 2;
+#endif
+ }
+
+ // all other row lengths:
+ while (row_end_A > row_start_A)
+ {
+#ifdef VIENNACL_WITH_AVX2
+ if (row_end_A - row_start_A > 2) // we deal with one or two remaining rows more efficiently below:
+ {
+ unsigned int merged_len = row_C_scan_symbolic_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A),
+ (const int*)B_row_buffer, (const int*)B_col_buffer, int(B_size2),
+ (int*)row_C_vector_3);
+ if (row_start_A + 8 >= row_end_A)
+ row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(row_C_vector_3, row_C_vector_3 + merged_len,
+ row_C_vector_1, row_C_vector_1 + row_C_len,
+ B_size2,
+ row_C_vector_2);
+ else
+ row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(row_C_vector_3, row_C_vector_3 + merged_len,
+ row_C_vector_1, row_C_vector_1 + row_C_len,
+ B_size2,
+ row_C_vector_2);
+ row_start_A += 8;
+ }
+ else
+#endif
+ if (row_start_A == row_end_A - 1) // last merge operation. No need to write output
+ {
+ // process last row
+ unsigned int row_index_B = A_col_buffer[row_start_A];
+ return row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(B_col_buffer + B_row_buffer[row_index_B], B_col_buffer + B_row_buffer[row_index_B + 1],
+ row_C_vector_1, row_C_vector_1 + row_C_len,
+ B_size2,
+ row_C_vector_2);
+ }
+ else if (row_start_A + 1 < row_end_A)// at least two more rows left, so merge them
+ {
+ // process single row:
+ unsigned int A_col_1 = A_col_buffer[row_start_A];
+ unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
+ unsigned int merged_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + B_row_buffer[A_col_1], B_col_buffer + B_row_buffer[A_col_1 + 1],
+ B_col_buffer + B_row_buffer[A_col_2], B_col_buffer + B_row_buffer[A_col_2 + 1],
+ B_size2,
+ row_C_vector_3);
+ if (row_start_A + 2 == row_end_A) // last merge does not need a write:
+ return row_C_scan_symbolic_vector_1<spgemm_output_write_disabled>(row_C_vector_3, row_C_vector_3 + merged_len,
+ row_C_vector_1, row_C_vector_1 + row_C_len,
+ B_size2,
+ row_C_vector_2);
+ else
+ row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(row_C_vector_3, row_C_vector_3 + merged_len,
+ row_C_vector_1, row_C_vector_1 + row_C_len,
+ B_size2,
+ row_C_vector_2);
+ row_start_A += 2;
+ }
+ else // at least two more rows left
+ {
+ // process single row:
+ unsigned int row_index_B = A_col_buffer[row_start_A];
+ row_C_len = row_C_scan_symbolic_vector_1<spgemm_output_write_enabled>(B_col_buffer + B_row_buffer[row_index_B], B_col_buffer + B_row_buffer[row_index_B + 1],
+ row_C_vector_1, row_C_vector_1 + row_C_len,
+ B_size2,
+ row_C_vector_2);
+ ++row_start_A;
+ }
+
+ std::swap(row_C_vector_1, row_C_vector_2);
+ }
+
+ return row_C_len;
+}
+
+//////////////////////////////
+
+/** @brief Merges up to IndexNum rows from B into the result buffer.
+*
+* Because the input buffer also needs to be considered, this routine actually works on an index front of length (IndexNum+1)
+**/
+template<unsigned int IndexNum, typename NumericT>
+unsigned int row_C_scan_numeric_vector_N(unsigned int const *row_indices_B, NumericT const *val_A,
+ unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, NumericT const *B_elements, unsigned int B_size2,
+ unsigned int const *row_C_vector_input, unsigned int const *row_C_vector_input_end, NumericT *row_C_vector_input_values,
+ unsigned int *row_C_vector_output, NumericT *row_C_vector_output_values)
+{
+ unsigned int index_front[IndexNum+1];
+ unsigned int const *index_front_start[IndexNum+1];
+ unsigned int const *index_front_end[IndexNum+1];
+ NumericT const * value_front_start[IndexNum+1];
+ NumericT values_A[IndexNum+1];
+
+ // Set up pointers for loading the indices:
+ for (unsigned int i=0; i<IndexNum; ++i, ++row_indices_B)
+ {
+ unsigned int row_B = *row_indices_B;
+
+ index_front_start[i] = B_col_buffer + B_row_buffer[row_B];
+ index_front_end[i] = B_col_buffer + B_row_buffer[row_B + 1];
+ value_front_start[i] = B_elements + B_row_buffer[row_B];
+ values_A[i] = val_A[i];
+ }
+ index_front_start[IndexNum] = row_C_vector_input;
+ index_front_end[IndexNum] = row_C_vector_input_end;
+ value_front_start[IndexNum] = row_C_vector_input_values;
+ values_A[IndexNum] = NumericT(1);
+
+ // load indices:
+ for (unsigned int i=0; i<=IndexNum; ++i)
+ index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;
+
+ unsigned int *output_ptr = row_C_vector_output;
+
+ while (1)
+ {
+ // get minimum index in current front:
+ unsigned int min_index_in_front = B_size2;
+ for (unsigned int i=0; i<=IndexNum; ++i)
+ min_index_in_front = std::min(min_index_in_front, index_front[i]);
+
+ if (min_index_in_front == B_size2) // we're done
+ break;
+
+ // advance index front where equal to minimum index:
+ NumericT row_C_value = 0;
+ for (unsigned int i=0; i<=IndexNum; ++i)
+ {
+ if (index_front[i] == min_index_in_front)
+ {
+ index_front_start[i] += 1;
+ index_front[i] = (index_front_start[i] < index_front_end[i]) ? *index_front_start[i] : B_size2;
+
+ row_C_value += values_A[i] * *value_front_start[i];
+ value_front_start[i] += 1;
+ }
+ }
+
+ // write current entry:
+ *output_ptr = min_index_in_front;
+ ++output_ptr;
+ *row_C_vector_output_values = row_C_value;
+ ++row_C_vector_output_values;
+ }
+
+ return static_cast<unsigned int>(output_ptr - row_C_vector_output);
+}
+
+
+
+#ifdef VIENNACL_WITH_AVX2
+inline
+unsigned int row_C_scan_numeric_vector_AVX2(int const *row_indices_B_begin, int const *row_indices_B_end, double const *values_A,
+ int const *B_row_buffer, int const *B_col_buffer, double const *B_elements,
+ int B_size2,
+ int *row_C_vector_output, double *row_C_vector_output_values)
+{
+ __m256i avx_all_ones = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
+ __m256i avx_all_bsize2 = _mm256_set_epi32(B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2, B_size2);
+
+ __m256i avx_row_indices_offsets = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+ __m256i avx_load_mask = _mm256_sub_epi32(avx_row_indices_offsets, _mm256_set1_epi32(row_indices_B_end - row_indices_B_begin));
+ __m256i avx_load_mask2 = avx_load_mask;
+
+ __m256i avx_row_indices = _mm256_set1_epi32(0);
+ avx_row_indices = _mm256_mask_i32gather_epi32(avx_row_indices, row_indices_B_begin, avx_row_indices_offsets, avx_load_mask, 4);
+
+ // load values from A:
+ avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+ __m256d avx_value_A_low = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0), //src
+ values_A, //base ptr
+ _mm256_extractf128_si256(avx_row_indices_offsets, 0), //indices
+ _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4)), 8); // mask
+ avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+ __m256d avx_value_A_high = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0), //src
+ values_A, //base ptr
+ _mm256_extractf128_si256(avx_row_indices_offsets, 1), //indices
+ _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)), 8); // mask
+
+
+ avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+ __m256i avx_row_start = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer, avx_row_indices, avx_load_mask, 4);
+ avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+ __m256i avx_row_end = _mm256_mask_i32gather_epi32(avx_all_ones, B_row_buffer+1, avx_row_indices, avx_load_mask, 4);
+
+ avx_load_mask = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
+ avx_load_mask2 = avx_load_mask;
+ __m256i avx_index_front = avx_all_bsize2;
+ avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);
+
+ // load front values from B:
+ avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+ __m256d avx_value_front_low = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0), //src
+ B_elements, //base ptr
+ _mm256_extractf128_si256(avx_row_start, 0), //indices
+ _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4)), 8); // mask
+ avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+ __m256d avx_value_front_high = _mm256_mask_i32gather_pd(_mm256_set_pd(0, 0, 0, 0), //src
+ B_elements, //base ptr
+ _mm256_extractf128_si256(avx_row_start, 1), //indices
+ _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)), 8); // mask
+
+ int *output_ptr = row_C_vector_output;
+
+ while (1)
+ {
+ // get minimum index in current front:
+ __m256i avx_index_min1 = avx_index_front;
+ __m256i avx_temp = _mm256_permutevar8x32_epi32(avx_index_min1, _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4));
+ avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first four elements compared against last four elements
+
+ avx_temp = _mm256_shuffle_epi32(avx_index_min1, int(78)); // 0b01001110 = 78, using shuffle instead of permutevar here because of lower latency
+ avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // first two elements compared against elements three and four (same for upper half of register)
+
+ avx_temp = _mm256_shuffle_epi32(avx_index_min1, int(177)); // 0b10110001 = 177, using shuffle instead of permutevar here because of lower latency
+ avx_index_min1 = _mm256_min_epi32(avx_index_min1, avx_temp); // now all entries of avx_index_min1 hold the minimum
+
+ int min_index_in_front = ((int*)&avx_index_min1)[0];
+ // check for end of merge operation:
+ if (min_index_in_front == B_size2)
+ break;
+
+ // accumulate value (can certainly be done more elegantly...)
+ double value = 0;
+ value += (min_index_in_front == ((int*)&avx_index_front)[0]) ? ((double*)&avx_value_front_low)[0] * ((double*)&avx_value_A_low)[0] : 0;
+ value += (min_index_in_front == ((int*)&avx_index_front)[1]) ? ((double*)&avx_value_front_low)[1] * ((double*)&avx_value_A_low)[1] : 0;
+ value += (min_index_in_front == ((int*)&avx_index_front)[2]) ? ((double*)&avx_value_front_low)[2] * ((double*)&avx_value_A_low)[2] : 0;
+ value += (min_index_in_front == ((int*)&avx_index_front)[3]) ? ((double*)&avx_value_front_low)[3] * ((double*)&avx_value_A_low)[3] : 0;
+ value += (min_index_in_front == ((int*)&avx_index_front)[4]) ? ((double*)&avx_value_front_high)[0] * ((double*)&avx_value_A_high)[0] : 0;
+ value += (min_index_in_front == ((int*)&avx_index_front)[5]) ? ((double*)&avx_value_front_high)[1] * ((double*)&avx_value_A_high)[1] : 0;
+ value += (min_index_in_front == ((int*)&avx_index_front)[6]) ? ((double*)&avx_value_front_high)[2] * ((double*)&avx_value_A_high)[2] : 0;
+ value += (min_index_in_front == ((int*)&avx_index_front)[7]) ? ((double*)&avx_value_front_high)[3] * ((double*)&avx_value_A_high)[3] : 0;
+ *row_C_vector_output_values = value;
+ ++row_C_vector_output_values;
+
+ // write current entry:
+ *output_ptr = min_index_in_front;
+ ++output_ptr;
+
+ // advance index front where equal to minimum index:
+ avx_load_mask = _mm256_cmpeq_epi32(avx_index_front, avx_index_min1);
+ // first part: set index to B_size2 if equal to minimum index:
+ avx_temp = _mm256_and_si256(avx_all_bsize2, avx_load_mask);
+ avx_index_front = _mm256_max_epi32(avx_index_front, avx_temp);
+ // second part: increment row_start registers where minimum found:
+ avx_temp = _mm256_and_si256(avx_all_ones, avx_load_mask); //ones only where the minimum was found
+ avx_row_start = _mm256_add_epi32(avx_row_start, avx_temp);
+ // third part part: load new data where more entries available:
+ avx_load_mask = _mm256_cmpgt_epi32(avx_row_end, avx_row_start);
+ avx_load_mask2 = avx_load_mask;
+ avx_index_front = _mm256_mask_i32gather_epi32(avx_index_front, B_col_buffer, avx_row_start, avx_load_mask, 4);
+
+ // load new values where necessary:
+ avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+ avx_value_front_low = _mm256_mask_i32gather_pd(avx_value_front_low, //src
+ B_elements, //base ptr
+ _mm256_extractf128_si256(avx_row_start, 0), //indices
+ _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4)), 8); // mask
+
+ avx_load_mask = avx_load_mask2; // reload mask (destroyed by gather)
+ avx_value_front_high = _mm256_mask_i32gather_pd(avx_value_front_high, //src
+ B_elements, //base ptr
+ _mm256_extractf128_si256(avx_row_start, 1), //indices
+ _mm256_permutevar8x32_epi32(avx_load_mask, _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0)), 8); // mask
+
+ //multiply new entries:
+
+ }
+
+ return static_cast<unsigned int>(output_ptr - row_C_vector_output);
+}
+#endif
+
+
+template<typename NumericT>
+unsigned int row_C_scan_numeric_vector_1(unsigned int const *input1_index_begin, unsigned int const *input1_index_end, NumericT const *input1_values_begin, NumericT factor1,
+ unsigned int const *input2_index_begin, unsigned int const *input2_index_end, NumericT const *input2_values_begin, NumericT factor2,
+ unsigned int termination_index,
+ unsigned int *output_index_begin, NumericT *output_values_begin)
+{
+ unsigned int *output_ptr = output_index_begin;
+
+ unsigned int index1 = (input1_index_begin < input1_index_end) ? *input1_index_begin : termination_index;
+ unsigned int index2 = (input2_index_begin < input2_index_end) ? *input2_index_begin : termination_index;
+
+ while (1)
+ {
+ unsigned int min_index = std::min(index1, index2);
+ NumericT value = 0;
+
+ if (min_index == termination_index)
+ break;
+
+ if (min_index == index1)
+ {
+ ++input1_index_begin;
+ index1 = (input1_index_begin < input1_index_end) ? *input1_index_begin : termination_index;
+
+ value += factor1 * *input1_values_begin;
+ ++input1_values_begin;
+ }
+
+ if (min_index == index2)
+ {
+ ++input2_index_begin;
+ index2 = (input2_index_begin < input2_index_end) ? *input2_index_begin : termination_index;
+
+ value += factor2 * *input2_values_begin;
+ ++input2_values_begin;
+ }
+
+ // write current entry:
+ *output_ptr = min_index;
+ ++output_ptr;
+ *output_values_begin = value;
+ ++output_values_begin;
+ }
+
+ return static_cast<unsigned int>(output_ptr - output_index_begin);
+}
+
+template<typename NumericT>
+void row_C_scan_numeric_vector(unsigned int row_start_A, unsigned int row_end_A, unsigned int const *A_col_buffer, NumericT const *A_elements,
+ unsigned int const *B_row_buffer, unsigned int const *B_col_buffer, NumericT const *B_elements, unsigned int B_size2,
+ unsigned int row_start_C, unsigned int row_end_C, unsigned int *C_col_buffer, NumericT *C_elements,
+ unsigned int *row_C_vector_1, NumericT *row_C_vector_1_values,
+ unsigned int *row_C_vector_2, NumericT *row_C_vector_2_values,
+ unsigned int *row_C_vector_3, NumericT *row_C_vector_3_values)
+{
+ (void)row_end_C;
+
+ // Trivial case: row length 0:
+ if (row_start_A == row_end_A)
+ return;
+
+ // Trivial case: row length 1:
+ if (row_end_A - row_start_A == 1)
+ {
+ unsigned int A_col = A_col_buffer[row_start_A];
+ unsigned int B_end = B_row_buffer[A_col + 1];
+ NumericT A_value = A_elements[row_start_A];
+ C_col_buffer += row_start_C;
+ C_elements += row_start_C;
+ for (unsigned int j = B_row_buffer[A_col]; j < B_end; ++j, ++C_col_buffer, ++C_elements)
+ {
+ *C_col_buffer = B_col_buffer[j];
+ *C_elements = A_value * B_elements[j];
+ }
+ return;
+ }
+
+ unsigned int row_C_len = 0;
+ if (row_end_A - row_start_A == 2) // directly merge to C:
+ {
+ unsigned int A_col_1 = A_col_buffer[row_start_A];
+ unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
+
+ unsigned int B_offset_1 = B_row_buffer[A_col_1];
+ unsigned int B_offset_2 = B_row_buffer[A_col_2];
+
+ row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
+ B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
+ B_size2,
+ C_col_buffer + row_start_C, C_elements + row_start_C);
+ return;
+ }
+#ifdef VIENNACL_WITH_AVX2
+ else if (row_end_A - row_start_A > 10) // safely merge eight rows into temporary buffer:
+ {
+ row_C_len = row_C_scan_numeric_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A), A_elements + row_start_A,
+ (const int*)B_row_buffer, (const int*)B_col_buffer, B_elements, int(B_size2),
+ (int*)row_C_vector_1, row_C_vector_1_values);
+ row_start_A += 8;
+ }
+#endif
+ else // safely merge two rows into temporary buffer:
+ {
+ unsigned int A_col_1 = A_col_buffer[row_start_A];
+ unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
+
+ unsigned int B_offset_1 = B_row_buffer[A_col_1];
+ unsigned int B_offset_2 = B_row_buffer[A_col_2];
+
+ row_C_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
+ B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
+ B_size2,
+ row_C_vector_1, row_C_vector_1_values);
+ row_start_A += 2;
+ }
+
+ // process remaining rows:
+ while (row_end_A > row_start_A)
+ {
+#ifdef VIENNACL_WITH_AVX2
+ if (row_end_A - row_start_A > 9) // code in other if-conditionals ensures that values get written to C
+ {
+ unsigned int merged_len = row_C_scan_numeric_vector_AVX2((const int*)(A_col_buffer + row_start_A), (const int*)(A_col_buffer + row_end_A), A_elements + row_start_A,
+ (const int*)B_row_buffer, (const int*)B_col_buffer, B_elements, int(B_size2),
+ (int*)row_C_vector_3, row_C_vector_3_values);
+ row_C_len = row_C_scan_numeric_vector_1(row_C_vector_3, row_C_vector_3 + merged_len, row_C_vector_3_values, NumericT(1.0),
+ row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
+ B_size2,
+ row_C_vector_2, row_C_vector_2_values);
+ row_start_A += 8;
+ }
+ else
+#endif
+ if (row_start_A + 1 == row_end_A) // last row to merge, write directly to C:
+ {
+ unsigned int A_col = A_col_buffer[row_start_A];
+ unsigned int B_offset = B_row_buffer[A_col];
+
+ row_C_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset, B_col_buffer + B_row_buffer[A_col+1], B_elements + B_offset, A_elements[row_start_A],
+ row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
+ B_size2,
+ C_col_buffer + row_start_C, C_elements + row_start_C);
+ return;
+ }
+ else if (row_start_A + 2 < row_end_A)// at least three more rows left, so merge two
+ {
+ // process single row:
+ unsigned int A_col_1 = A_col_buffer[row_start_A];
+ unsigned int A_col_2 = A_col_buffer[row_start_A + 1];
+
+ unsigned int B_offset_1 = B_row_buffer[A_col_1];
+ unsigned int B_offset_2 = B_row_buffer[A_col_2];
+
+ unsigned int merged_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset_1, B_col_buffer + B_row_buffer[A_col_1+1], B_elements + B_offset_1, A_elements[row_start_A],
+ B_col_buffer + B_offset_2, B_col_buffer + B_row_buffer[A_col_2+1], B_elements + B_offset_2, A_elements[row_start_A + 1],
+ B_size2,
+ row_C_vector_3, row_C_vector_3_values);
+ row_C_len = row_C_scan_numeric_vector_1(row_C_vector_3, row_C_vector_3 + merged_len, row_C_vector_3_values, NumericT(1.0),
+ row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
+ B_size2,
+ row_C_vector_2, row_C_vector_2_values);
+ row_start_A += 2;
+ }
+ else
+ {
+ unsigned int A_col = A_col_buffer[row_start_A];
+ unsigned int B_offset = B_row_buffer[A_col];
+
+ row_C_len = row_C_scan_numeric_vector_1(B_col_buffer + B_offset, B_col_buffer + B_row_buffer[A_col+1], B_elements + B_offset, A_elements[row_start_A],
+ row_C_vector_1, row_C_vector_1 + row_C_len, row_C_vector_1_values, NumericT(1.0),
+ B_size2,
+ row_C_vector_2, row_C_vector_2_values);
+ ++row_start_A;
+ }
+
+ std::swap(row_C_vector_1, row_C_vector_2);
+ std::swap(row_C_vector_1_values, row_C_vector_2_values);
+ }
+}
+
+
+} // namespace host_based
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/vector_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/vector_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/vector_operations.hpp
new file mode 100644
index 0000000..b4944a2
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/vector_operations.hpp
@@ -0,0 +1,1188 @@
+#ifndef VIENNACL_LINALG_HOST_BASED_VECTOR_OPERATIONS_HPP_
+#define VIENNACL_LINALG_HOST_BASED_VECTOR_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/host_based/vector_operations.hpp
+ @brief Implementations of vector operations using a plain single-threaded or OpenMP-enabled execution on CPU
+*/
+
+#include <cmath>
+#include <algorithm> //for std::max and std::min
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/linalg/host_based/common.hpp"
+#include "viennacl/linalg/detail/op_applier.hpp"
+#include "viennacl/traits/stride.hpp"
+
+#ifdef VIENNACL_WITH_OPENMP
+#include <omp.h>
+#endif
+
+// Minimum vector size for using OpenMP on vector operations:
+#ifndef VIENNACL_OPENMP_VECTOR_MIN_SIZE
+ #define VIENNACL_OPENMP_VECTOR_MIN_SIZE 5000
+#endif
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace host_based
+{
+namespace detail
+{
+ template<typename NumericT>
+ NumericT flip_sign(NumericT val) { return -val; }
+ inline unsigned long flip_sign(unsigned long val) { return val; }
+ inline unsigned int flip_sign(unsigned int val) { return val; }
+ inline unsigned short flip_sign(unsigned short val) { return val; }
+ inline unsigned char flip_sign(unsigned char val) { return val; }
+}
+
+//
+// Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here!
+//
+template<typename DestNumericT, typename SrcNumericT>
+void convert(vector_base<DestNumericT> & dest, vector_base<SrcNumericT> const & src)
+{
+ DestNumericT * data_dest = detail::extract_raw_pointer<DestNumericT>(dest);
+ SrcNumericT const * data_src = detail::extract_raw_pointer<SrcNumericT>(src);
+
+ vcl_size_t start_dest = viennacl::traits::start(dest);
+ vcl_size_t inc_dest = viennacl::traits::stride(dest);
+ vcl_size_t size_dest = viennacl::traits::size(dest);
+
+ vcl_size_t start_src = viennacl::traits::start(src);
+ vcl_size_t inc_src = viennacl::traits::stride(src);
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size_dest > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size_dest); ++i)
+ data_dest[static_cast<vcl_size_t>(i)*inc_dest+start_dest] = static_cast<DestNumericT>(data_src[static_cast<vcl_size_t>(i)*inc_src+start_src]);
+}
+
+template<typename NumericT, typename ScalarT1>
+void av(vector_base<NumericT> & vec1,
+ vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha)
+{
+ typedef NumericT value_type;
+
+ value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+ value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
+
+ value_type data_alpha = alpha;
+ if (flip_sign_alpha)
+ data_alpha = detail::flip_sign(data_alpha);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+
+ vcl_size_t start2 = viennacl::traits::start(vec2);
+ vcl_size_t inc2 = viennacl::traits::stride(vec2);
+
+ if (reciprocal_alpha)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size1); ++i)
+ data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha;
+ }
+ else
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size1); ++i)
+ data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha;
+ }
+}
+
+
+template<typename NumericT, typename ScalarT1, typename ScalarT2>
+void avbv(vector_base<NumericT> & vec1,
+ vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t /* len_alpha */, bool reciprocal_alpha, bool flip_sign_alpha,
+ vector_base<NumericT> const & vec3, ScalarT2 const & beta, vcl_size_t /* len_beta */, bool reciprocal_beta, bool flip_sign_beta)
+{
+ typedef NumericT value_type;
+
+ value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+ value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
+ value_type const * data_vec3 = detail::extract_raw_pointer<value_type>(vec3);
+
+ value_type data_alpha = alpha;
+ if (flip_sign_alpha)
+ data_alpha = detail::flip_sign(data_alpha);
+
+ value_type data_beta = beta;
+ if (flip_sign_beta)
+ data_beta = detail::flip_sign(data_beta);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+
+ vcl_size_t start2 = viennacl::traits::start(vec2);
+ vcl_size_t inc2 = viennacl::traits::stride(vec2);
+
+ vcl_size_t start3 = viennacl::traits::start(vec3);
+ vcl_size_t inc3 = viennacl::traits::stride(vec3);
+
+ if (reciprocal_alpha)
+ {
+ if (reciprocal_beta)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size1); ++i)
+ data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] / data_beta;
+ }
+ else
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size1); ++i)
+ data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] * data_beta;
+ }
+ }
+ else
+ {
+ if (reciprocal_beta)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size1); ++i)
+ data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] / data_beta;
+ }
+ else
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size1); ++i)
+ data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] * data_beta;
+ }
+ }
+}
+
+
+template<typename NumericT, typename ScalarT1, typename ScalarT2>
+void avbv_v(vector_base<NumericT> & vec1,
+ vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha,
+ vector_base<NumericT> const & vec3, ScalarT2 const & beta, vcl_size_t /*len_beta*/, bool reciprocal_beta, bool flip_sign_beta)
+{
+ typedef NumericT value_type;
+
+ value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+ value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
+ value_type const * data_vec3 = detail::extract_raw_pointer<value_type>(vec3);
+
+ value_type data_alpha = alpha;
+ if (flip_sign_alpha)
+ data_alpha = detail::flip_sign(data_alpha);
+
+ value_type data_beta = beta;
+ if (flip_sign_beta)
+ data_beta = detail::flip_sign(data_beta);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+
+ vcl_size_t start2 = viennacl::traits::start(vec2);
+ vcl_size_t inc2 = viennacl::traits::stride(vec2);
+
+ vcl_size_t start3 = viennacl::traits::start(vec3);
+ vcl_size_t inc3 = viennacl::traits::stride(vec3);
+
+ if (reciprocal_alpha)
+ {
+ if (reciprocal_beta)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size1); ++i)
+ data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] += data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] / data_beta;
+ }
+ else
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size1); ++i)
+ data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] += data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] / data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] * data_beta;
+ }
+ }
+ else
+ {
+ if (reciprocal_beta)
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size1); ++i)
+ data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] += data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] / data_beta;
+ }
+ else
+ {
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size1); ++i)
+ data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] += data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] * data_alpha + data_vec3[static_cast<vcl_size_t>(i)*inc3+start3] * data_beta;
+ }
+ }
+}
+
+
+
+
+/** @brief Assign a constant value to a vector (-range/-slice)
+*
+* @param vec1 The vector to which the value should be assigned
+* @param alpha The value to be assigned
+* @param up_to_internal_size Specifies whether alpha should also be written to padded memory (mostly used for clearing the whole buffer).
+*/
+template<typename NumericT>
+void vector_assign(vector_base<NumericT> & vec1, const NumericT & alpha, bool up_to_internal_size = false)
+{
+ typedef NumericT value_type;
+
+ value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+ vcl_size_t loop_bound = up_to_internal_size ? vec1.internal_size() : size1; //Note: Do NOT use traits::internal_size() here, because vector proxies don't require padding.
+
+ value_type data_alpha = static_cast<value_type>(alpha);
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (loop_bound > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(loop_bound); ++i)
+ data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_alpha;
+}
+
+
+/** @brief Swaps the contents of two vectors, data is copied
+*
+* @param vec1 The first vector (or -range, or -slice)
+* @param vec2 The second vector (or -range, or -slice)
+*/
+template<typename NumericT>
+void vector_swap(vector_base<NumericT> & vec1, vector_base<NumericT> & vec2)
+{
+ typedef NumericT value_type;
+
+ value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+ value_type * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+
+ vcl_size_t start2 = viennacl::traits::start(vec2);
+ vcl_size_t inc2 = viennacl::traits::stride(vec2);
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size1); ++i)
+ {
+ value_type temp = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2];
+ data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] = data_vec1[static_cast<vcl_size_t>(i)*inc1+start1];
+ data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = temp;
+ }
+}
+
+
+///////////////////////// Elementwise operations /////////////
+
+/** @brief Implementation of the element-wise operation v1 = v2 .* v3 and v1 = v2 ./ v3 (using MATLAB syntax)
+*
+* @param vec1 The result vector (or -range, or -slice)
+* @param proxy The proxy object holding v2, v3 and the operation
+*/
+template<typename NumericT, typename OpT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_binary<OpT> > const & proxy)
+{
+ typedef NumericT value_type;
+ typedef viennacl::linalg::detail::op_applier<op_element_binary<OpT> > OpFunctor;
+
+ value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+ value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(proxy.lhs());
+ value_type const * data_vec3 = detail::extract_raw_pointer<value_type>(proxy.rhs());
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+
+ vcl_size_t start2 = viennacl::traits::start(proxy.lhs());
+ vcl_size_t inc2 = viennacl::traits::stride(proxy.lhs());
+
+ vcl_size_t start3 = viennacl::traits::start(proxy.rhs());
+ vcl_size_t inc3 = viennacl::traits::stride(proxy.rhs());
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size1); ++i)
+ OpFunctor::apply(data_vec1[static_cast<vcl_size_t>(i)*inc1+start1], data_vec2[static_cast<vcl_size_t>(i)*inc2+start2], data_vec3[static_cast<vcl_size_t>(i)*inc3+start3]);
+}
+
+/** @brief Implementation of the element-wise operation v1 = v2 .* v3 and v1 = v2 ./ v3 (using MATLAB syntax)
+*
+* @param vec1 The result vector (or -range, or -slice)
+* @param proxy The proxy object holding v2, v3 and the operation
+*/
+template<typename NumericT, typename OpT>
+void element_op(vector_base<NumericT> & vec1,
+ vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<OpT> > const & proxy)
+{
+ typedef NumericT value_type;
+ typedef viennacl::linalg::detail::op_applier<op_element_unary<OpT> > OpFunctor;
+
+ value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+ value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(proxy.lhs());
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+
+ vcl_size_t start2 = viennacl::traits::start(proxy.lhs());
+ vcl_size_t inc2 = viennacl::traits::stride(proxy.lhs());
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size1); ++i)
+ OpFunctor::apply(data_vec1[static_cast<vcl_size_t>(i)*inc1+start1], data_vec2[static_cast<vcl_size_t>(i)*inc2+start2]);
+}
+
+
+///////////////////////// Norms and inner product ///////////////////
+
+
+//implementation of inner product:
+
+namespace detail
+{
+
+// the following circumvents problems when trying to use a variable of template parameter type for a reduction.
+// Such a behavior is not covered by the OpenMP standard, hence we manually apply some preprocessor magic to resolve the problem.
+// See https://github.com/viennacl/viennacl-dev/issues/112 for a detailed explanation and discussion.
+
+#define VIENNACL_INNER_PROD_IMPL_1(RESULTSCALART, TEMPSCALART) \
+ inline RESULTSCALART inner_prod_impl(RESULTSCALART const * data_vec1, vcl_size_t start1, vcl_size_t inc1, vcl_size_t size1, \
+ RESULTSCALART const * data_vec2, vcl_size_t start2, vcl_size_t inc2) { \
+ TEMPSCALART temp = 0;
+
+#define VIENNACL_INNER_PROD_IMPL_2(RESULTSCALART) \
+ for (long i = 0; i < static_cast<long>(size1); ++i) \
+ temp += data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] * data_vec2[static_cast<vcl_size_t>(i)*inc2+start2]; \
+ return static_cast<RESULTSCALART>(temp); \
+ }
+
+// char
+VIENNACL_INNER_PROD_IMPL_1(char, int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(char)
+
+VIENNACL_INNER_PROD_IMPL_1(unsigned char, int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(unsigned char)
+
+
+// short
+VIENNACL_INNER_PROD_IMPL_1(short, int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(short)
+
+VIENNACL_INNER_PROD_IMPL_1(unsigned short, int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(unsigned short)
+
+
+// int
+VIENNACL_INNER_PROD_IMPL_1(int, int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(int)
+
+VIENNACL_INNER_PROD_IMPL_1(unsigned int, unsigned int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(unsigned int)
+
+
+// long
+VIENNACL_INNER_PROD_IMPL_1(long, long)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(long)
+
+VIENNACL_INNER_PROD_IMPL_1(unsigned long, unsigned long)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(unsigned long)
+
+
+// float
+VIENNACL_INNER_PROD_IMPL_1(float, float)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(float)
+
+// double
+VIENNACL_INNER_PROD_IMPL_1(double, double)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_INNER_PROD_IMPL_2(double)
+
+#undef VIENNACL_INNER_PROD_IMPL_1
+#undef VIENNACL_INNER_PROD_IMPL_2
+}
+
+/** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
+*
+* @param vec1 The first vector
+* @param vec2 The second vector
+* @param result The result scalar (on the gpu)
+*/
+template<typename NumericT, typename ScalarT>
+void inner_prod_impl(vector_base<NumericT> const & vec1,
+ vector_base<NumericT> const & vec2,
+ ScalarT & result)
+{
+ typedef NumericT value_type;
+
+ value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+ value_type const * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+
+ vcl_size_t start2 = viennacl::traits::start(vec2);
+ vcl_size_t inc2 = viennacl::traits::stride(vec2);
+
+ result = detail::inner_prod_impl(data_vec1, start1, inc1, size1,
+ data_vec2, start2, inc2); //Note: Assignment to result might be expensive, thus a temporary is introduced here
+}
+
+template<typename NumericT>
+void inner_prod_impl(vector_base<NumericT> const & x,
+ vector_tuple<NumericT> const & vec_tuple,
+ vector_base<NumericT> & result)
+{
+ typedef NumericT value_type;
+
+ value_type const * data_x = detail::extract_raw_pointer<value_type>(x);
+
+ vcl_size_t start_x = viennacl::traits::start(x);
+ vcl_size_t inc_x = viennacl::traits::stride(x);
+ vcl_size_t size_x = viennacl::traits::size(x);
+
+ std::vector<value_type> temp(vec_tuple.const_size());
+ std::vector<value_type const *> data_y(vec_tuple.const_size());
+ std::vector<vcl_size_t> start_y(vec_tuple.const_size());
+ std::vector<vcl_size_t> stride_y(vec_tuple.const_size());
+
+ for (vcl_size_t j=0; j<vec_tuple.const_size(); ++j)
+ {
+ data_y[j] = detail::extract_raw_pointer<value_type>(vec_tuple.const_at(j));
+ start_y[j] = viennacl::traits::start(vec_tuple.const_at(j));
+ stride_y[j] = viennacl::traits::stride(vec_tuple.const_at(j));
+ }
+
+ // Note: No OpenMP here because it cannot perform a reduction on temp-array. Savings in memory bandwidth are expected to still justify this approach...
+ for (vcl_size_t i = 0; i < size_x; ++i)
+ {
+ value_type entry_x = data_x[i*inc_x+start_x];
+ for (vcl_size_t j=0; j < vec_tuple.const_size(); ++j)
+ temp[j] += entry_x * data_y[j][i*stride_y[j]+start_y[j]];
+ }
+
+ for (vcl_size_t j=0; j < vec_tuple.const_size(); ++j)
+ result[j] = temp[j]; //Note: Assignment to result might be expensive, thus 'temp' is used for accumulation
+}
+
+
+namespace detail
+{
+
+#define VIENNACL_NORM_1_IMPL_1(RESULTSCALART, TEMPSCALART) \
+ inline RESULTSCALART norm_1_impl(RESULTSCALART const * data_vec1, vcl_size_t start1, vcl_size_t inc1, vcl_size_t size1) { \
+ TEMPSCALART temp = 0;
+
+#define VIENNACL_NORM_1_IMPL_2(RESULTSCALART, TEMPSCALART) \
+ for (long i = 0; i < static_cast<long>(size1); ++i) \
+ temp += static_cast<TEMPSCALART>(std::fabs(static_cast<double>(data_vec1[static_cast<vcl_size_t>(i)*inc1+start1]))); \
+ return static_cast<RESULTSCALART>(temp); \
+ }
+
+// char
+VIENNACL_NORM_1_IMPL_1(char, int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(char, int)
+
+VIENNACL_NORM_1_IMPL_1(unsigned char, int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(unsigned char, int)
+
+// short
+VIENNACL_NORM_1_IMPL_1(short, int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(short, int)
+
+VIENNACL_NORM_1_IMPL_1(unsigned short, int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(unsigned short, int)
+
+
+// int
+VIENNACL_NORM_1_IMPL_1(int, int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(int, int)
+
+VIENNACL_NORM_1_IMPL_1(unsigned int, unsigned int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(unsigned int, unsigned int)
+
+
+// long
+VIENNACL_NORM_1_IMPL_1(long, long)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(long, long)
+
+VIENNACL_NORM_1_IMPL_1(unsigned long, unsigned long)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(unsigned long, unsigned long)
+
+
+// float
+VIENNACL_NORM_1_IMPL_1(float, float)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(float, float)
+
+// double
+VIENNACL_NORM_1_IMPL_1(double, double)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_1_IMPL_2(double, double)
+
+#undef VIENNACL_NORM_1_IMPL_1
+#undef VIENNACL_NORM_1_IMPL_2
+
+}
+
+/** @brief Computes the l^1-norm of a vector
+*
+* @param vec1 The vector
+* @param result The result scalar
+*/
+template<typename NumericT, typename ScalarT>
+void norm_1_impl(vector_base<NumericT> const & vec1,
+ ScalarT & result)
+{
+ typedef NumericT value_type;
+
+ value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+
+ result = detail::norm_1_impl(data_vec1, start1, inc1, size1); //Note: Assignment to result might be expensive, thus using a temporary for accumulation
+}
+
+
+
+namespace detail
+{
+
+#define VIENNACL_NORM_2_IMPL_1(RESULTSCALART, TEMPSCALART) \
+ inline RESULTSCALART norm_2_impl(RESULTSCALART const * data_vec1, vcl_size_t start1, vcl_size_t inc1, vcl_size_t size1) { \
+ TEMPSCALART temp = 0;
+
+#define VIENNACL_NORM_2_IMPL_2(RESULTSCALART, TEMPSCALART) \
+ for (long i = 0; i < static_cast<long>(size1); ++i) { \
+ RESULTSCALART data = data_vec1[static_cast<vcl_size_t>(i)*inc1+start1]; \
+ temp += static_cast<TEMPSCALART>(data * data); \
+ } \
+ return static_cast<RESULTSCALART>(temp); \
+ }
+
+// char
+VIENNACL_NORM_2_IMPL_1(char, int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(char, int)
+
+VIENNACL_NORM_2_IMPL_1(unsigned char, int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(unsigned char, int)
+
+
+// short
+VIENNACL_NORM_2_IMPL_1(short, int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(short, int)
+
+VIENNACL_NORM_2_IMPL_1(unsigned short, int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(unsigned short, int)
+
+
+// int
+VIENNACL_NORM_2_IMPL_1(int, int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(int, int)
+
+VIENNACL_NORM_2_IMPL_1(unsigned int, unsigned int)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(unsigned int, unsigned int)
+
+
+// long
+VIENNACL_NORM_2_IMPL_1(long, long)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(long, long)
+
+VIENNACL_NORM_2_IMPL_1(unsigned long, unsigned long)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(unsigned long, unsigned long)
+
+
+// float
+VIENNACL_NORM_2_IMPL_1(float, float)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(float, float)
+
+// double
+VIENNACL_NORM_2_IMPL_1(double, double)
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+: temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+VIENNACL_NORM_2_IMPL_2(double, double)
+
+#undef VIENNACL_NORM_2_IMPL_1
+#undef VIENNACL_NORM_2_IMPL_2
+
+}
+
+
+/** @brief Computes the l^2-norm of a vector - implementation
+*
+* @param vec1 The vector
+* @param result The result scalar
+*/
+template<typename NumericT, typename ScalarT>
+void norm_2_impl(vector_base<NumericT> const & vec1,
+ ScalarT & result)
+{
+ typedef NumericT value_type;
+
+ value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+
+ result = std::sqrt(detail::norm_2_impl(data_vec1, start1, inc1, size1)); //Note: Assignment to result might be expensive, thus 'temp' is used for accumulation
+}
+
+/** @brief Computes the supremum-norm of a vector
+*
+* @param vec1 The vector
+* @param result The result scalar
+*/
+template<typename NumericT, typename ScalarT>
+void norm_inf_impl(vector_base<NumericT> const & vec1,
+ ScalarT & result)
+{
+ typedef NumericT value_type;
+
+ value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+
+ vcl_size_t thread_count=1;
+
+ #ifdef VIENNACL_WITH_OPENMP
+ if(size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+ thread_count = omp_get_max_threads();
+ #endif
+
+ std::vector<value_type> temp(thread_count);
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ {
+ vcl_size_t id = 0;
+#ifdef VIENNACL_WITH_OPENMP
+ id = omp_get_thread_num();
+#endif
+
+ vcl_size_t begin = (size1 * id) / thread_count;
+ vcl_size_t end = (size1 * (id + 1)) / thread_count;
+ temp[id] = 0;
+
+ for (vcl_size_t i = begin; i < end; ++i)
+ temp[id] = std::max<value_type>(temp[id], static_cast<value_type>(std::fabs(static_cast<double>(data_vec1[i*inc1+start1])))); //casting to double in order to avoid problems if T is an integer type
+ }
+ for (vcl_size_t i = 1; i < thread_count; ++i)
+ temp[0] = std::max<value_type>( temp[0], temp[i]);
+ result = temp[0];
+}
+
+//This function should return a CPU scalar, otherwise statements like
+// vcl_rhs[index_norm_inf(vcl_rhs)]
+// are ambiguous
+/** @brief Computes the index of the first entry that is equal to the supremum-norm in modulus.
+*
+* @param vec1 The vector
+* @return The result. Note that the result must be a CPU scalar (unsigned int), since gpu scalars are floating point types.
+*/
+template<typename NumericT>
+vcl_size_t index_norm_inf(vector_base<NumericT> const & vec1)
+{
+ typedef NumericT value_type;
+
+ value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+ vcl_size_t thread_count=1;
+
+#ifdef VIENNACL_WITH_OPENMP
+ if(size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+ thread_count = omp_get_max_threads();
+#endif
+
+ std::vector<value_type> temp(thread_count);
+ std::vector<vcl_size_t> index(thread_count);
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ {
+ vcl_size_t id = 0;
+#ifdef VIENNACL_WITH_OPENMP
+ id = omp_get_thread_num();
+#endif
+ vcl_size_t begin = (size1 * id) / thread_count;
+ vcl_size_t end = (size1 * (id + 1)) / thread_count;
+ index[id] = start1;
+ temp[id] = 0;
+ value_type data;
+
+ for (vcl_size_t i = begin; i < end; ++i)
+ {
+ data = static_cast<value_type>(std::fabs(static_cast<double>(data_vec1[i*inc1+start1]))); //casting to double in order to avoid problems if T is an integer type
+ if (data > temp[id])
+ {
+ index[id] = i;
+ temp[id] = data;
+ }
+ }
+ }
+ for (vcl_size_t i = 1; i < thread_count; ++i)
+ {
+ if (temp[i] > temp[0])
+ {
+ index[0] = index[i];
+ temp[0] = temp[i];
+ }
+ }
+ return index[0];
+}
+
+/** @brief Computes the maximum of a vector
+*
+* @param vec1 The vector
+* @param result The result scalar
+*/
+template<typename NumericT, typename ScalarT>
+void max_impl(vector_base<NumericT> const & vec1,
+ ScalarT & result)
+{
+ typedef NumericT value_type;
+
+ value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+
+ vcl_size_t thread_count=1;
+
+#ifdef VIENNACL_WITH_OPENMP
+ if(size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+ thread_count = omp_get_max_threads();
+#endif
+
+ std::vector<value_type> temp(thread_count);
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ {
+ vcl_size_t id = 0;
+#ifdef VIENNACL_WITH_OPENMP
+ id = omp_get_thread_num();
+#endif
+ vcl_size_t begin = (size1 * id) / thread_count;
+ vcl_size_t end = (size1 * (id + 1)) / thread_count;
+ temp[id] = data_vec1[start1];
+
+ for (vcl_size_t i = begin; i < end; ++i)
+ {
+ value_type v = data_vec1[i*inc1+start1];//Note: Assignment to 'vec1' in std::min might be expensive, thus 'v' is used for the function
+ temp[id] = std::max<value_type>(temp[id],v);
+ }
+ }
+ for (vcl_size_t i = 1; i < thread_count; ++i)
+ temp[0] = std::max<value_type>( temp[0], temp[i]);
+ result = temp[0];//Note: Assignment to result might be expensive, thus 'temp' is used for accumulation
+}
+
+/** @brief Computes the minimum of a vector
+*
+* @param vec1 The vector
+* @param result The result scalar
+*/
+template<typename NumericT, typename ScalarT>
+void min_impl(vector_base<NumericT> const & vec1,
+ ScalarT & result)
+{
+ typedef NumericT value_type;
+
+ value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+
+ vcl_size_t thread_count=1;
+
+#ifdef VIENNACL_WITH_OPENMP
+ if(size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+ thread_count = omp_get_max_threads();
+#endif
+
+ std::vector<value_type> temp(thread_count);
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ {
+ vcl_size_t id = 0;
+#ifdef VIENNACL_WITH_OPENMP
+ id = omp_get_thread_num();
+#endif
+ vcl_size_t begin = (size1 * id) / thread_count;
+ vcl_size_t end = (size1 * (id + 1)) / thread_count;
+ temp[id] = data_vec1[start1];
+
+ for (vcl_size_t i = begin; i < end; ++i)
+ {
+ value_type v = data_vec1[i*inc1+start1];//Note: Assignment to 'vec1' in std::min might be expensive, thus 'v' is used for the function
+ temp[id] = std::min<value_type>(temp[id],v);
+ }
+ }
+ for (vcl_size_t i = 1; i < thread_count; ++i)
+ temp[0] = std::min<value_type>( temp[0], temp[i]);
+ result = temp[0];//Note: Assignment to result might be expensive, thus 'temp' is used for accumulation
+}
+
+/** @brief Computes the sum of all elements from the vector
+*
+* @param vec1 The vector
+* @param result The result scalar
+*/
+template<typename NumericT, typename ScalarT>
+void sum_impl(vector_base<NumericT> const & vec1,
+ ScalarT & result)
+{
+ typedef NumericT value_type;
+
+ value_type const * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+
+ value_type temp = 0;
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for reduction(+:temp) if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size1); ++i)
+ temp += data_vec1[static_cast<vcl_size_t>(i)*inc1+start1];
+
+ result = temp; //Note: Assignment to result might be expensive, thus 'temp' is used for accumulation
+}
+
+/** @brief Computes a plane rotation of two vectors.
+*
+* Computes (x,y) <- (alpha * x + beta * y, -beta * x + alpha * y)
+*
+* @param vec1 The first vector
+* @param vec2 The second vector
+* @param alpha The first transformation coefficient
+* @param beta The second transformation coefficient
+*/
+template<typename NumericT>
+void plane_rotation(vector_base<NumericT> & vec1,
+ vector_base<NumericT> & vec2,
+ NumericT alpha, NumericT beta)
+{
+ typedef NumericT value_type;
+
+ value_type * data_vec1 = detail::extract_raw_pointer<value_type>(vec1);
+ value_type * data_vec2 = detail::extract_raw_pointer<value_type>(vec2);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+
+ vcl_size_t start2 = viennacl::traits::start(vec2);
+ vcl_size_t inc2 = viennacl::traits::stride(vec2);
+
+ value_type data_alpha = alpha;
+ value_type data_beta = beta;
+
+#ifdef VIENNACL_WITH_OPENMP
+ #pragma omp parallel for if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+#endif
+ for (long i = 0; i < static_cast<long>(size1); ++i)
+ {
+ value_type temp1 = data_vec1[static_cast<vcl_size_t>(i)*inc1+start1];
+ value_type temp2 = data_vec2[static_cast<vcl_size_t>(i)*inc2+start2];
+
+ data_vec1[static_cast<vcl_size_t>(i)*inc1+start1] = data_alpha * temp1 + data_beta * temp2;
+ data_vec2[static_cast<vcl_size_t>(i)*inc2+start2] = data_alpha * temp2 - data_beta * temp1;
+ }
+}
+
+namespace detail
+{
+ /** @brief Implementation of inclusive_scan and exclusive_scan for the host (OpenMP) backend. */
+ template<typename NumericT>
+ void vector_scan_impl(vector_base<NumericT> const & vec1,
+ vector_base<NumericT> & vec2,
+ bool is_inclusive)
+ {
+ NumericT const * data_vec1 = detail::extract_raw_pointer<NumericT>(vec1);
+ NumericT * data_vec2 = detail::extract_raw_pointer<NumericT>(vec2);
+
+ vcl_size_t start1 = viennacl::traits::start(vec1);
+ vcl_size_t inc1 = viennacl::traits::stride(vec1);
+ vcl_size_t size1 = viennacl::traits::size(vec1);
+ if (size1 < 1)
+ return;
+
+ vcl_size_t start2 = viennacl::traits::start(vec2);
+ vcl_size_t inc2 = viennacl::traits::stride(vec2);
+
+#ifdef VIENNACL_WITH_OPENMP
+ if (size1 > VIENNACL_OPENMP_VECTOR_MIN_SIZE)
+ {
+ std::vector<NumericT> thread_results(omp_get_max_threads());
+
+ // inclusive scan each thread segment:
+ #pragma omp parallel
+ {
+ vcl_size_t work_per_thread = (size1 - 1) / thread_results.size() + 1;
+ vcl_size_t thread_start = work_per_thread * omp_get_thread_num();
+ vcl_size_t thread_stop = std::min<vcl_size_t>(thread_start + work_per_thread, size1);
+
+ NumericT thread_sum = 0;
+ for(vcl_size_t i = thread_start; i < thread_stop; i++)
+ thread_sum += data_vec1[i * inc1 + start1];
+
+ thread_results[omp_get_thread_num()] = thread_sum;
+ }
+
+ // exclusive-scan of thread results:
+ NumericT current_offset = 0;
+ for (vcl_size_t i=0; i<thread_results.size(); ++i)
+ {
+ NumericT tmp = thread_results[i];
+ thread_results[i] = current_offset;
+ current_offset += tmp;
+ }
+
+ // exclusive/inclusive scan of each segment with correct offset:
+ #pragma omp parallel
+ {
+ vcl_size_t work_per_thread = (size1 - 1) / thread_results.size() + 1;
+ vcl_size_t thread_start = work_per_thread * omp_get_thread_num();
+ vcl_size_t thread_stop = std::min<vcl_size_t>(thread_start + work_per_thread, size1);
+
+ NumericT thread_sum = thread_results[omp_get_thread_num()];
+ if (is_inclusive)
+ {
+ for(vcl_size_t i = thread_start; i < thread_stop; i++)
+ {
+ thread_sum += data_vec1[i * inc1 + start1];
+ data_vec2[i * inc2 + start2] = thread_sum;
+ }
+ }
+ else
+ {
+ for(vcl_size_t i = thread_start; i < thread_stop; i++)
+ {
+ NumericT tmp = data_vec1[i * inc1 + start1];
+ data_vec2[i * inc2 + start2] = thread_sum;
+ thread_sum += tmp;
+ }
+ }
+ }
+ } else
+#endif
+ {
+ NumericT sum = 0;
+ if (is_inclusive)
+ {
+ for(vcl_size_t i = 0; i < size1; i++)
+ {
+ sum += data_vec1[i * inc1 + start1];
+ data_vec2[i * inc2 + start2] = sum;
+ }
+ }
+ else
+ {
+ for(vcl_size_t i = 0; i < size1; i++)
+ {
+ NumericT tmp = data_vec1[i * inc1 + start1];
+ data_vec2[i * inc2 + start2] = sum;
+ sum += tmp;
+ }
+ }
+ }
+
+ }
+}
+
+/** @brief This function implements an inclusive scan on the host using OpenMP.
+*
+* Given an element vector (x_0, x_1, ..., x_{n-1}),
+* this routine computes (x_0, x_0 + x_1, ..., x_0 + x_1 + ... + x_{n-1})
+*
+* @param vec1 Input vector: Gets overwritten by the routine.
+* @param vec2 The output vector. Either idential to vec1 or non-overlapping.
+*/
+template<typename NumericT>
+void inclusive_scan(vector_base<NumericT> const & vec1,
+ vector_base<NumericT> & vec2)
+{
+ detail::vector_scan_impl(vec1, vec2, true);
+}
+
+/** @brief This function implements an exclusive scan on the host using OpenMP.
+*
+* Given an element vector (x_0, x_1, ..., x_{n-1}),
+* this routine computes (0, x_0, x_0 + x_1, ..., x_0 + x_1 + ... + x_{n-2})
+*
+* @param vec1 Input vector: Gets overwritten by the routine.
+* @param vec2 The output vector. Either idential to vec1 or non-overlapping.
+*/
+template<typename NumericT>
+void exclusive_scan(vector_base<NumericT> const & vec1,
+ vector_base<NumericT> & vec2)
+{
+ detail::vector_scan_impl(vec1, vec2, false);
+}
+
+
+} //namespace host_based
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/ichol.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/ichol.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/ichol.hpp
new file mode 100644
index 0000000..1038b2b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/ichol.hpp
@@ -0,0 +1,228 @@
+#ifndef VIENNACL_LINALG_ICHOL_HPP_
+#define VIENNACL_LINALG_ICHOL_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head: Karl Rupp rupp@iue.tuwien.ac.at
+
+ (A list of authors and contributors can be found in the manual)
+
+ License: MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/linalg/ichol.hpp
+ @brief Implementations of incomplete Cholesky factorization preconditioners with static nonzero pattern.
+*/
+
+#include <vector>
+#include <cmath>
+#include <iostream>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/compressed_matrix.hpp"
+
+#include "viennacl/linalg/host_based/common.hpp"
+
+#include <map>
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/** @brief A tag for incomplete Cholesky factorization with static pattern (ILU0)
+*/
+class ichol0_tag {};
+
+
+/** @brief Implementation of a ILU-preconditioner with static pattern. Optimized version for CSR matrices.
+ *
+ * Refer to Chih-Jen Lin and Jorge J. Mor�, Incomplete Cholesky Factorizations with Limited Memory, SIAM J. Sci. Comput., 21(1), 24\u201345
+ * for one of many descriptions of incomplete Cholesky Factorizations
+ *
+ * @param A The input matrix in CSR format
+ * // param tag An ichol0_tag in order to dispatch among several other preconditioners.
+ */
+template<typename NumericT>
+void precondition(viennacl::compressed_matrix<NumericT> & A, ichol0_tag const & /* tag */)
+{
+ assert( (viennacl::traits::context(A).memory_type() == viennacl::MAIN_MEMORY) && bool("System matrix must reside in main memory for ICHOL0") );
+
+ NumericT * elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
+ unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1());
+ unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2());
+
+ //std::cout << A.size1() << std::endl;
+ for (vcl_size_t i=0; i<A.size1(); ++i)
+ {
+ unsigned int row_i_begin = row_buffer[i];
+ unsigned int row_i_end = row_buffer[i+1];
+
+ // get a_ii:
+ NumericT a_ii = 0;
+ for (unsigned int buf_index_aii = row_i_begin; buf_index_aii < row_i_end; ++buf_index_aii)
+ {
+ if (col_buffer[buf_index_aii] == i)
+ {
+ a_ii = std::sqrt(elements[buf_index_aii]);
+ elements[buf_index_aii] = a_ii;
+ break;
+ }
+ }
+
+ // Now scale column/row i, i.e. A(k, i) /= A(i, i)
+ for (unsigned int buf_index_aii = row_i_begin; buf_index_aii < row_i_end; ++buf_index_aii)
+ {
+ if (col_buffer[buf_index_aii] > i)
+ elements[buf_index_aii] /= a_ii;
+ }
+
+ // Now compute A(k, j) -= A(k, i) * A(j, i) for all nonzero k, j in column i:
+ for (unsigned int buf_index_j = row_i_begin; buf_index_j < row_i_end; ++buf_index_j)
+ {
+ unsigned int j = col_buffer[buf_index_j];
+ if (j <= i)
+ continue;
+
+ NumericT a_ji = elements[buf_index_j];
+
+ for (unsigned int buf_index_k = row_i_begin; buf_index_k < row_i_end; ++buf_index_k)
+ {
+ unsigned int k = col_buffer[buf_index_k];
+ if (k < j)
+ continue;
+
+ NumericT a_ki = elements[buf_index_k];
+
+ //Now check whether A(k, j) is in nonzero pattern:
+ unsigned int row_j_begin = row_buffer[j];
+ unsigned int row_j_end = row_buffer[j+1];
+ for (unsigned int buf_index_kj = row_j_begin; buf_index_kj < row_j_end; ++buf_index_kj)
+ {
+ if (col_buffer[buf_index_kj] == k)
+ {
+ elements[buf_index_kj] -= a_ki * a_ji;
+ break;
+ }
+ }
+ }
+ }
+
+ }
+
+}
+
+
+/** @brief Incomplete Cholesky preconditioner class with static pattern (ICHOL0), can be supplied to solve()-routines
+*/
+template<typename MatrixT>
+class ichol0_precond
+{
+ typedef typename MatrixT::value_type NumericType;
+
+public:
+ ichol0_precond(MatrixT const & mat, ichol0_tag const & tag) : tag_(tag), LLT(mat.size1(), mat.size2(), viennacl::context(viennacl::MAIN_MEMORY))
+ {
+ //initialize preconditioner:
+ //std::cout << "Start CPU precond" << std::endl;
+ init(mat);
+ //std::cout << "End CPU precond" << std::endl;
+ }
+
+ template<typename VectorT>
+ void apply(VectorT & vec) const
+ {
+ unsigned int const * row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LLT.handle1());
+ unsigned int const * col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(LLT.handle2());
+ NumericType const * elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericType>(LLT.handle());
+
+ // Note: L is stored in a column-oriented fashion, i.e. transposed w.r.t. the row-oriented layout. Thus, the factorization A = L L^T holds L in the upper triangular part of A.
+ viennacl::linalg::host_based::detail::csr_trans_inplace_solve<NumericType>(row_buffer, col_buffer, elements, vec, LLT.size2(), lower_tag());
+ viennacl::linalg::host_based::detail::csr_inplace_solve<NumericType>(row_buffer, col_buffer, elements, vec, LLT.size2(), upper_tag());
+ }
+
+private:
+ void init(MatrixT const & mat)
+ {
+ viennacl::context host_ctx(viennacl::MAIN_MEMORY);
+ viennacl::switch_memory_context(LLT, host_ctx);
+
+ viennacl::copy(mat, LLT);
+ viennacl::linalg::precondition(LLT, tag_);
+ }
+
+ ichol0_tag const & tag_;
+ viennacl::compressed_matrix<NumericType> LLT;
+};
+
+
+/** @brief ILU0 preconditioner class, can be supplied to solve()-routines.
+*
+* Specialization for compressed_matrix
+*/
+template<typename NumericT, unsigned int AlignmentV>
+class ichol0_precond< compressed_matrix<NumericT, AlignmentV> >
+{
+ typedef compressed_matrix<NumericT, AlignmentV> MatrixType;
+
+public:
+ ichol0_precond(MatrixType const & mat, ichol0_tag const & tag) : tag_(tag), LLT(mat.size1(), mat.size2(), viennacl::traits::context(mat))
+ {
+ //initialize preconditioner:
+ //std::cout << "Start GPU precond" << std::endl;
+ init(mat);
+ //std::cout << "End GPU precond" << std::endl;
+ }
+
+ void apply(vector<NumericT> & vec) const
+ {
+ if (viennacl::traits::context(vec).memory_type() != viennacl::MAIN_MEMORY)
+ {
+ viennacl::context host_ctx(viennacl::MAIN_MEMORY);
+ viennacl::context old_ctx = viennacl::traits::context(vec);
+
+ viennacl::switch_memory_context(vec, host_ctx);
+ viennacl::linalg::inplace_solve(trans(LLT), vec, lower_tag());
+ viennacl::linalg::inplace_solve( LLT , vec, upper_tag());
+ viennacl::switch_memory_context(vec, old_ctx);
+ }
+ else //apply ILU0 directly:
+ {
+ // Note: L is stored in a column-oriented fashion, i.e. transposed w.r.t. the row-oriented layout. Thus, the factorization A = L L^T holds L in the upper triangular part of A.
+ viennacl::linalg::inplace_solve(trans(LLT), vec, lower_tag());
+ viennacl::linalg::inplace_solve( LLT , vec, upper_tag());
+ }
+ }
+
+private:
+ void init(MatrixType const & mat)
+ {
+ viennacl::context host_ctx(viennacl::MAIN_MEMORY);
+ viennacl::switch_memory_context(LLT, host_ctx);
+ LLT = mat;
+
+ viennacl::linalg::precondition(LLT, tag_);
+ }
+
+ ichol0_tag const & tag_;
+ viennacl::compressed_matrix<NumericT> LLT;
+};
+
+}
+}
+
+
+
+
+#endif
+
+
+