Posted to commits@mahout.apache.org by ap...@apache.org on 2016/06/08 21:40:44 UTC

[47/51] [partial] mahout git commit: (nojira) add native-viennaCL module to codebase. closes apache/mahout#241

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/init_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/init_matrix.hpp b/native-viennaCL/src/main/cpp/libviennacl/src/init_matrix.hpp
new file mode 100644
index 0000000..e463e88
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/init_matrix.hpp
@@ -0,0 +1,101 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl.hpp"
+#include "viennacl/backend/mem_handle.hpp"
+
+
+
+static ViennaCLStatus init_cuda_matrix(viennacl::backend::mem_handle & h, ViennaCLMatrix A)
+{
+#ifdef VIENNACL_WITH_CUDA
+  h.switch_active_handle_id(viennacl::CUDA_MEMORY);
+  h.cuda_handle().reset(A->cuda_mem);
+  h.cuda_handle().inc();
+  if (A->precision == ViennaCLFloat)
+    h.raw_size(static_cast<viennacl::vcl_size_t>(A->internal_size1) * static_cast<viennacl::vcl_size_t>(A->internal_size2) * sizeof(float)); // not necessary, but still set for conciseness
+  else if (A->precision == ViennaCLDouble)
+    h.raw_size(static_cast<viennacl::vcl_size_t>(A->internal_size1) * static_cast<viennacl::vcl_size_t>(A->internal_size2) * sizeof(double)); // not necessary, but still set for conciseness
+  else
+    return ViennaCLGenericFailure;
+
+  return ViennaCLSuccess;
+#else
+  (void)h;
+  (void)A;
+  return ViennaCLGenericFailure;
+#endif
+}
+
+static ViennaCLStatus init_opencl_matrix(viennacl::backend::mem_handle & h, ViennaCLMatrix A)
+{
+#ifdef VIENNACL_WITH_OPENCL
+  h.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+  h.opencl_handle() = A->opencl_mem;
+  h.opencl_handle().inc();
+  if (A->precision == ViennaCLFloat)
+    h.raw_size(static_cast<viennacl::vcl_size_t>(A->internal_size1) * static_cast<viennacl::vcl_size_t>(A->internal_size2) * sizeof(float)); // not necessary, but still set for conciseness
+  else if (A->precision == ViennaCLDouble)
+    h.raw_size(static_cast<viennacl::vcl_size_t>(A->internal_size1) * static_cast<viennacl::vcl_size_t>(A->internal_size2) * sizeof(double)); // not necessary, but still set for conciseness
+  else
+    return ViennaCLGenericFailure;
+
+  return ViennaCLSuccess;
+#else
+  (void)h;
+  (void)A;
+  return ViennaCLGenericFailure;
+#endif
+}
+
+
+static ViennaCLStatus init_host_matrix(viennacl::backend::mem_handle & h, ViennaCLMatrix A)
+{
+  h.switch_active_handle_id(viennacl::MAIN_MEMORY);
+  h.ram_handle().reset(A->host_mem);
+  h.ram_handle().inc();
+  if (A->precision == ViennaCLFloat)
+    h.raw_size(static_cast<viennacl::vcl_size_t>(A->internal_size1) * static_cast<viennacl::vcl_size_t>(A->internal_size2) * sizeof(float)); // not necessary, but still set for conciseness
+  else if (A->precision == ViennaCLDouble)
+    h.raw_size(static_cast<viennacl::vcl_size_t>(A->internal_size1) * static_cast<viennacl::vcl_size_t>(A->internal_size2) * sizeof(double)); // not necessary, but still set for conciseness
+  else
+    return ViennaCLGenericFailure;
+
+  return ViennaCLSuccess;
+}
+
+
+static ViennaCLStatus init_matrix(viennacl::backend::mem_handle & h, ViennaCLMatrix A)
+{
+  switch (A->backend->backend_type)
+  {
+    case ViennaCLCUDA:
+      return init_cuda_matrix(h, A);
+
+    case ViennaCLOpenCL:
+      return init_opencl_matrix(h, A);
+
+    case ViennaCLHost:
+      return init_host_matrix(h, A);
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+
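A minimal sketch of how this dispatcher is driven (init_vector.hpp below follows the same pattern). The structs come from viennacl_private.hpp further down in this commit; treating ViennaCLMatrix as a pointer to ViennaCLMatrix_impl and the concrete field values are assumptions made purely for illustration:

  // hypothetical host-backend setup, not part of this commit
  std::vector<double> buf(4 * 4, 0.0);            // 4x4 matrix storage in main RAM

  ViennaCLBackend_impl backend;
  backend.backend_type = ViennaCLHost;

  ViennaCLMatrix_impl A;
  A.backend        = &backend;                    // assumes ViennaCLBackend == ViennaCLBackend_impl*
  A.precision      = ViennaCLDouble;
  A.host_mem       = reinterpret_cast<char*>(buf.data());
  A.internal_size1 = 4;
  A.internal_size2 = 4;

  viennacl::backend::mem_handle h;
  ViennaCLStatus status = init_matrix(h, &A);     // routes to init_host_matrix()
  // on success, h wraps buf and h.raw_size() == 4 * 4 * sizeof(double)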

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/init_vector.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/init_vector.hpp b/native-viennaCL/src/main/cpp/libviennacl/src/init_vector.hpp
new file mode 100644
index 0000000..8be00d7
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/init_vector.hpp
@@ -0,0 +1,101 @@
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include "viennacl.hpp"
+#include "viennacl/backend/mem_handle.hpp"
+
+
+
+static ViennaCLStatus init_cuda_vector(viennacl::backend::mem_handle & h, ViennaCLVector x)
+{
+#ifdef VIENNACL_WITH_CUDA
+  h.switch_active_handle_id(viennacl::CUDA_MEMORY);
+  h.cuda_handle().reset(x->cuda_mem);
+  h.cuda_handle().inc();
+  if (x->precision == ViennaCLFloat)
+    h.raw_size(static_cast<viennacl::vcl_size_t>(x->inc) * x->size * sizeof(float)); // not necessary, but still set for conciseness
+  else if (x->precision == ViennaCLDouble)
+    h.raw_size(static_cast<viennacl::vcl_size_t>(x->inc) * x->size * sizeof(double)); // not necessary, but still set for conciseness
+  else
+    return ViennaCLGenericFailure;
+
+  return ViennaCLSuccess;
+#else
+  (void)h;
+  (void)x;
+  return ViennaCLGenericFailure;
+#endif
+}
+
+static ViennaCLStatus init_opencl_vector(viennacl::backend::mem_handle & h, ViennaCLVector x)
+{
+#ifdef VIENNACL_WITH_OPENCL
+  h.switch_active_handle_id(viennacl::OPENCL_MEMORY);
+  h.opencl_handle() = x->opencl_mem;
+  h.opencl_handle().inc();
+  if (x->precision == ViennaCLFloat)
+    h.raw_size(static_cast<viennacl::vcl_size_t>(x->inc) * static_cast<viennacl::vcl_size_t>(x->size) * sizeof(float)); // not necessary, but still set for conciseness
+  else if (x->precision == ViennaCLDouble)
+    h.raw_size(static_cast<viennacl::vcl_size_t>(x->inc) * static_cast<viennacl::vcl_size_t>(x->size) * sizeof(double)); // not necessary, but still set for conciseness
+  else
+    return ViennaCLGenericFailure;
+
+  return ViennaCLSuccess;
+#else
+  (void)h;
+  (void)x;
+  return ViennaCLGenericFailure;
+#endif
+}
+
+
+static ViennaCLStatus init_host_vector(viennacl::backend::mem_handle & h, ViennaCLVector x)
+{
+  h.switch_active_handle_id(viennacl::MAIN_MEMORY);
+  h.ram_handle().reset(x->host_mem);
+  h.ram_handle().inc();
+  if (x->precision == ViennaCLFloat)
+    h.raw_size(static_cast<viennacl::vcl_size_t>(x->inc) * static_cast<viennacl::vcl_size_t>(x->size) * sizeof(float)); // not necessary, but still set for conciseness
+  else if (x->precision == ViennaCLDouble)
+    h.raw_size(static_cast<viennacl::vcl_size_t>(x->inc) * static_cast<viennacl::vcl_size_t>(x->size) * sizeof(double)); // not necessary, but still set for conciseness
+  else
+    return ViennaCLGenericFailure;
+
+  return ViennaCLSuccess;
+}
+
+
+static ViennaCLStatus init_vector(viennacl::backend::mem_handle & h, ViennaCLVector x)
+{
+  switch (x->backend->backend_type)
+  {
+    case ViennaCLCUDA:
+      return init_cuda_vector(h, x);
+
+    case ViennaCLOpenCL:
+      return init_opencl_vector(h, x);
+
+    case ViennaCLHost:
+      return init_host_vector(h, x);
+
+    default:
+      return ViennaCLGenericFailure;
+  }
+}
+
+
+

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/libviennacl/src/viennacl_private.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/libviennacl/src/viennacl_private.hpp b/native-viennaCL/src/main/cpp/libviennacl/src/viennacl_private.hpp
new file mode 100644
index 0000000..c66c848
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/libviennacl/src/viennacl_private.hpp
@@ -0,0 +1,141 @@
+#ifndef VIENNACL_VIENNACL_PRIVATE_HPP
+#define VIENNACL_VIENNACL_PRIVATE_HPP
+
+
+/* =========================================================================
+   Copyright (c) 2010-2014, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+#include <stdlib.h>
+
+#ifdef VIENNACL_WITH_OPENCL
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+#endif
+
+#include "viennacl.hpp"
+
+
+/************* Backend Management ******************/
+
+struct ViennaCLCUDABackend_impl
+{
+    //TODO: Add stream and/or device descriptors here
+};
+
+struct ViennaCLOpenCLBackend_impl
+{
+  ViennaCLInt context_id;
+};
+
+struct ViennaCLHostBackend_impl
+{
+  // Nothing to specify *at the moment*
+};
+
+
+/** @brief Generic backend for CUDA, OpenCL, host-based stuff */
+struct ViennaCLBackend_impl
+{
+  ViennaCLBackendTypes backend_type;
+
+  ViennaCLCUDABackend_impl     cuda_backend;
+  ViennaCLOpenCLBackend_impl   opencl_backend;
+  ViennaCLHostBackend_impl     host_backend;
+};
+
+
+
+/******** User Types **********/
+
+struct ViennaCLHostScalar_impl
+{
+  ViennaCLPrecision  precision;
+
+  union {
+    float  value_float;
+    double value_double;
+  };
+};
+
+struct ViennaCLScalar_impl
+{
+  ViennaCLBackend    backend;
+  ViennaCLPrecision  precision;
+
+  // buffer:
+#ifdef VIENNACL_WITH_CUDA
+  char * cuda_mem;
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+  cl_mem opencl_mem;
+#endif
+  char * host_mem;
+
+  ViennaCLInt   offset;
+};
+
+struct ViennaCLVector_impl
+{
+  ViennaCLBackend    backend;
+  ViennaCLPrecision  precision;
+
+  // buffer:
+#ifdef VIENNACL_WITH_CUDA
+  char * cuda_mem;
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+  cl_mem opencl_mem;
+#endif
+  char * host_mem;
+
+  ViennaCLInt   offset;
+  ViennaCLInt   inc;
+  ViennaCLInt   size;
+};
+
+struct ViennaCLMatrix_impl
+{
+  ViennaCLBackend    backend;
+  ViennaCLPrecision  precision;
+  ViennaCLOrder      order;
+  ViennaCLTranspose  trans;
+
+  // buffer:
+#ifdef VIENNACL_WITH_CUDA
+  char * cuda_mem;
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+  cl_mem opencl_mem;
+#endif
+  char * host_mem;
+
+  ViennaCLInt   size1;
+  ViennaCLInt   start1;
+  ViennaCLInt   stride1;
+  ViennaCLInt   internal_size1;
+
+  ViennaCLInt   size2;
+  ViennaCLInt   start2;
+  ViennaCLInt   stride2;
+  ViennaCLInt   internal_size2;
+};
+
+
+#endif
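For orientation, the size/start/stride/internal_size fields describe a sub-matrix view of a padded buffer. Assuming row-major storage padded to internal_size2 columns (an illustrative reading of the fields, not code from this commit), the linear offset of logical element (i, j) would be computed roughly as:

  // hypothetical helper; assumes row-major order with padding to internal_size2 columns
  ViennaCLInt element_offset(const ViennaCLMatrix_impl & A, ViennaCLInt i, ViennaCLInt j)
  {
    ViennaCLInt row = A.start1 + i * A.stride1;   // physical row of logical row i
    ViennaCLInt col = A.start2 + j * A.stride2;   // physical column of logical column j
    return row * A.internal_size2 + col;          // element index into the padded buffer
  }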

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/backend/cpu_ram.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/backend/cpu_ram.hpp b/native-viennaCL/src/main/cpp/viennacl/backend/cpu_ram.hpp
new file mode 100644
index 0000000..ccfd035
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/backend/cpu_ram.hpp
@@ -0,0 +1,171 @@
+#ifndef VIENNACL_BACKEND_CPU_RAM_HPP_
+#define VIENNACL_BACKEND_CPU_RAM_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/backend/cpu_ram.hpp
+    @brief Implementations for the plain CPU RAM (host) backend functionality
+*/
+
+#include <cassert>
+#include <vector>
+#ifdef VIENNACL_WITH_AVX2
+#include <stdlib.h>
+#endif
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/shared_ptr.hpp"
+
+namespace viennacl
+{
+namespace backend
+{
+namespace cpu_ram
+{
+typedef viennacl::tools::shared_ptr<char>  handle_type;
+// Requirements for backend:
+
+// * memory_create(size, host_ptr)
+// * memory_copy(src, dest, offset_src, offset_dest, size)
+// * memory_write_from_main_memory(src, offset, size,
+//                                 dest, offset, size)
+// * memory_read_to_main_memory(src, offset, size
+//                              dest, offset, size)
+// *
+//
+
+namespace detail
+{
+  /** @brief Helper struct for deleting a pointer to an array */
+  template<class U>
+  struct array_deleter
+  {
+#ifdef VIENNACL_WITH_AVX2
+    void operator()(U* p) const { free(p); }
+#else
+    void operator()(U* p) const { delete[] p; }
+#endif
+  };
+
+}
+
+/** @brief Creates an array of the specified size in main RAM. If the second argument is provided, the buffer is initialized with data from that pointer.
+ *
+ * @param size_in_bytes   Number of bytes to allocate
+ * @param host_ptr        Pointer to data which will be copied to the new array. Must point to at least 'size_in_bytes' bytes of data.
+ *
+ */
+inline handle_type  memory_create(vcl_size_t size_in_bytes, const void * host_ptr = NULL)
+{
+#ifdef VIENNACL_WITH_AVX2
+  // Note: aligned_alloc not available on all compilers. Consider platform-specific alternatives such as posix_memalign()
+  if (!host_ptr)
+    return handle_type(reinterpret_cast<char*>(aligned_alloc(32, size_in_bytes)), detail::array_deleter<char>());
+
+  handle_type new_handle(reinterpret_cast<char*>(aligned_alloc(32, size_in_bytes)), detail::array_deleter<char>());
+#else
+  if (!host_ptr)
+    return handle_type(new char[size_in_bytes], detail::array_deleter<char>());
+
+  handle_type new_handle(new char[size_in_bytes], detail::array_deleter<char>());
+#endif
+
+  // copy data:
+  char * raw_ptr = new_handle.get();
+  const char * data_ptr = static_cast<const char *>(host_ptr);
+#ifdef VIENNACL_WITH_OPENMP
+    #pragma omp parallel for
+#endif
+  for (long i=0; i<long(size_in_bytes); ++i)
+    raw_ptr[i] = data_ptr[i];
+
+  return new_handle;
+}
+
+/** @brief Copies 'bytes_to_copy' bytes from address 'src_buffer + src_offset' to memory starting at address 'dst_buffer + dst_offset'.
+ *
+ *  @param src_buffer     A smart pointer to the beginning of an allocated source buffer
+ *  @param dst_buffer     A smart pointer to the beginning of an allocated destination buffer
+ *  @param src_offset     Offset of the first byte to be written from the address given by 'src_buffer' (in bytes)
+ *  @param dst_offset     Offset of the first byte to be written to the address given by 'dst_buffer' (in bytes)
+ *  @param bytes_to_copy  Number of bytes to be copied
+ */
+inline void memory_copy(handle_type const & src_buffer,
+                        handle_type & dst_buffer,
+                        vcl_size_t src_offset,
+                        vcl_size_t dst_offset,
+                        vcl_size_t bytes_to_copy)
+{
+  assert( (dst_buffer.get() != NULL) && bool("Memory not initialized!"));
+  assert( (src_buffer.get() != NULL) && bool("Memory not initialized!"));
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i=0; i<long(bytes_to_copy); ++i)
+    dst_buffer.get()[vcl_size_t(i)+dst_offset] = src_buffer.get()[vcl_size_t(i) + src_offset];
+}
+
+/** @brief Writes data from main RAM identified by 'ptr' to the buffer identified by 'dst_buffer'
+ *
+ * @param dst_buffer    A smart pointer to the beginning of an allocated buffer
+ * @param dst_offset    Offset of the first written byte from the beginning of 'dst_buffer' (in bytes)
+ * @param bytes_to_copy Number of bytes to be copied
+ * @param ptr           Pointer to the first byte to be written
+ */
+inline void memory_write(handle_type & dst_buffer,
+                         vcl_size_t dst_offset,
+                         vcl_size_t bytes_to_copy,
+                         const void * ptr,
+                         bool /*async*/)
+{
+  assert( (dst_buffer.get() != NULL) && bool("Memory not initialized!"));
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i=0; i<long(bytes_to_copy); ++i)
+    dst_buffer.get()[vcl_size_t(i)+dst_offset] = static_cast<const char *>(ptr)[i];
+}
+
+/** @brief Reads data from a buffer back to main RAM.
+ *
+ * @param src_buffer         A smart pointer to the beginning of an allocated source buffer
+ * @param src_offset         Offset of the first byte to be read from the beginning of src_buffer (in bytes)
+ * @param bytes_to_copy      Number of bytes to be read
+ * @param ptr                Location in main RAM to which the read data should be written
+ */
+inline void memory_read(handle_type const & src_buffer,
+                        vcl_size_t src_offset,
+                        vcl_size_t bytes_to_copy,
+                        void * ptr,
+                        bool /*async*/)
+{
+  assert( (src_buffer.get() != NULL) && bool("Memory not initialized!"));
+
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i=0; i<long(bytes_to_copy); ++i)
+    static_cast<char *>(ptr)[i] = src_buffer.get()[vcl_size_t(i)+src_offset];
+}
+
+}
+} //backend
+} //viennacl
+#endif
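A short round trip through the three functions above (the surrounding setup is illustrative, not part of this commit):

  #include "viennacl/backend/cpu_ram.hpp"

  void cpu_ram_roundtrip()
  {
    namespace ram = viennacl::backend::cpu_ram;

    double src[3] = {1.0, 2.0, 3.0};
    double dst[3] = {0.0, 0.0, 0.0};

    // allocate three doubles in main RAM, initialized from src
    ram::handle_type h = ram::memory_create(sizeof(src), src);

    // overwrite the second element, then read the whole buffer back
    double x = 42.0;
    ram::memory_write(h, sizeof(double), sizeof(double), &x, false);
    ram::memory_read(h, 0, sizeof(dst), dst, false);
    // dst is now {1.0, 42.0, 3.0}; the buffer is freed when the last handle copy goes away
  }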

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/backend/cuda.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/backend/cuda.hpp b/native-viennaCL/src/main/cpp/viennacl/backend/cuda.hpp
new file mode 100644
index 0000000..641bfea
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/backend/cuda.hpp
@@ -0,0 +1,206 @@
+#ifndef VIENNACL_BACKEND_CUDA_HPP_
+#define VIENNACL_BACKEND_CUDA_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/backend/cuda.hpp
+    @brief Implementations for the CUDA backend functionality
+*/
+
+
+#include <iostream>
+#include <vector>
+#include <cassert>
+#include <stdexcept>
+#include <sstream>
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/shared_ptr.hpp"
+
+// includes CUDA
+#include <cuda_runtime.h>
+
+#define VIENNACL_CUDA_ERROR_CHECK(err)  detail::cuda_error_check (err, __FILE__, __LINE__)
+
+namespace viennacl
+{
+namespace backend
+{
+namespace cuda
+{
+
+typedef viennacl::tools::shared_ptr<char>  handle_type;
+// Requirements for backend:
+
+// * memory_create(size, host_ptr)
+// * memory_copy(src, dest, offset_src, offset_dest, size)
+// * memory_write_from_main_memory(src, offset, size,
+//                                 dest, offset, size)
+// * memory_read_to_main_memory(src, offset, size
+//                              dest, offset, size)
+// *
+//
+
+class cuda_exception : public std::runtime_error
+{
+public:
+  cuda_exception(std::string const & what_arg, cudaError_t err_code) : std::runtime_error(what_arg), error_code_(err_code) {}
+
+  cudaError_t error_code() const { return error_code_; }
+
+private:
+  cudaError_t error_code_;
+};
+
+namespace detail
+{
+
+  inline void cuda_error_check(cudaError error_code, const char *file, const int line )
+  {
+    if (cudaSuccess != error_code)
+    {
+      std::stringstream ss;
+      ss << file << "(" << line << "): CUDA Runtime API error " << error_code << ": " << cudaGetErrorString( error_code ) << std::endl;
+      throw viennacl::backend::cuda::cuda_exception(ss.str(), error_code);
+    }
+  }
+
+
+  /** @brief Functor for deleting a CUDA handle. Used within the smart pointer class. */
+  template<typename U>
+  struct cuda_deleter
+  {
+    void operator()(U * p) const
+    {
+      //std::cout << "Freeing handle " << reinterpret_cast<void *>(p) << std::endl;
+      cudaFree(p);
+    }
+  };
+
+}
+
+/** @brief Creates an array of the specified size on the CUDA device. If the second argument is provided, the buffer is initialized with data from that pointer.
+ *
+ * @param size_in_bytes   Number of bytes to allocate
+ * @param host_ptr        Pointer to data which will be copied to the new array. Must point to at least 'size_in_bytes' bytes of data.
+ *
+ */
+inline handle_type  memory_create(vcl_size_t size_in_bytes, const void * host_ptr = NULL)
+{
+  void * dev_ptr = NULL;
+  VIENNACL_CUDA_ERROR_CHECK( cudaMalloc(&dev_ptr, size_in_bytes) );
+  //std::cout << "Allocated new dev_ptr " << dev_ptr << " of size " <<  size_in_bytes << std::endl;
+
+  if (!host_ptr)
+    return handle_type(reinterpret_cast<char *>(dev_ptr), detail::cuda_deleter<char>());
+
+  handle_type new_handle(reinterpret_cast<char*>(dev_ptr), detail::cuda_deleter<char>());
+
+  // copy data:
+  //std::cout << "Filling new handle from host_ptr " << host_ptr << std::endl;
+  cudaMemcpy(new_handle.get(), host_ptr, size_in_bytes, cudaMemcpyHostToDevice);
+
+  return new_handle;
+}
+
+
+/** @brief Copies 'bytes_to_copy' bytes from address 'src_buffer + src_offset' on the CUDA device to memory starting at address 'dst_buffer + dst_offset' on the same CUDA device.
+ *
+ *  @param src_buffer     A smart pointer to the beginning of an allocated CUDA source buffer
+ *  @param dst_buffer     A smart pointer to the beginning of an allocated CUDA destination buffer
+ *  @param src_offset     Offset of the first byte to be written from the address given by 'src_buffer' (in bytes)
+ *  @param dst_offset     Offset of the first byte to be written to the address given by 'dst_buffer' (in bytes)
+ *  @param bytes_to_copy  Number of bytes to be copied
+ */
+inline void memory_copy(handle_type const & src_buffer,
+                        handle_type & dst_buffer,
+                        vcl_size_t src_offset,
+                        vcl_size_t dst_offset,
+                        vcl_size_t bytes_to_copy)
+{
+  assert( (dst_buffer.get() != NULL) && bool("Memory not initialized!"));
+  assert( (src_buffer.get() != NULL) && bool("Memory not initialized!"));
+
+  cudaMemcpy(reinterpret_cast<void *>(dst_buffer.get() + dst_offset),
+             reinterpret_cast<void *>(src_buffer.get() + src_offset),
+             bytes_to_copy,
+             cudaMemcpyDeviceToDevice);
+}
+
+
+/** @brief Writes data from main RAM identified by 'ptr' to the CUDA buffer identified by 'dst_buffer'
+ *
+ * @param dst_buffer    A smart pointer to the beginning of an allocated CUDA buffer
+ * @param dst_offset    Offset of the first written byte from the beginning of 'dst_buffer' (in bytes)
+ * @param bytes_to_copy Number of bytes to be copied
+ * @param ptr           Pointer to the first byte to be written
+ * @param async              Whether the operation should be asynchronous
+ */
+inline void memory_write(handle_type & dst_buffer,
+                         vcl_size_t dst_offset,
+                         vcl_size_t bytes_to_copy,
+                         const void * ptr,
+                         bool async = false)
+{
+  assert( (dst_buffer.get() != NULL) && bool("Memory not initialized!"));
+
+  if (async)
+    cudaMemcpyAsync(reinterpret_cast<char *>(dst_buffer.get()) + dst_offset,
+                    reinterpret_cast<const char *>(ptr),
+                    bytes_to_copy,
+                    cudaMemcpyHostToDevice);
+  else
+    cudaMemcpy(reinterpret_cast<char *>(dst_buffer.get()) + dst_offset,
+               reinterpret_cast<const char *>(ptr),
+               bytes_to_copy,
+               cudaMemcpyHostToDevice);
+}
+
+
+/** @brief Reads data from a CUDA buffer back to main RAM.
+ *
+ * @param src_buffer         A smart pointer to the beginning of an allocated CUDA source buffer
+ * @param src_offset         Offset of the first byte to be read from the beginning of src_buffer (in bytes)
+ * @param bytes_to_copy      Number of bytes to be read
+ * @param ptr                Location in main RAM to which the read data should be written
+ * @param async              Whether the operation should be asynchronous
+ */
+inline void memory_read(handle_type const & src_buffer,
+                        vcl_size_t src_offset,
+                        vcl_size_t bytes_to_copy,
+                        void * ptr,
+                        bool async = false)
+{
+  assert( (src_buffer.get() != NULL) && bool("Memory not initialized!"));
+
+  if (async)
+    cudaMemcpyAsync(reinterpret_cast<char *>(ptr),
+                    reinterpret_cast<char *>(src_buffer.get()) + src_offset,
+                    bytes_to_copy,
+                    cudaMemcpyDeviceToHost);
+  else
+    cudaMemcpy(reinterpret_cast<char *>(ptr),
+               reinterpret_cast<char *>(src_buffer.get()) + src_offset,
+               bytes_to_copy,
+               cudaMemcpyDeviceToHost);
+}
+
+} //cuda
+} //backend
+} //viennacl
+#endif
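The VIENNACL_CUDA_ERROR_CHECK macro above turns CUDA runtime errors into exceptions; a caller-side sketch, guarded the same way the rest of the code guards CUDA paths (illustrative, not part of this commit):

  #ifdef VIENNACL_WITH_CUDA
  #include <iostream>
  #include "viennacl/backend/cuda.hpp"

  void cuda_alloc_example(std::size_t bytes)
  {
    try
    {
      // throws viennacl::backend::cuda::cuda_exception if cudaMalloc fails
      viennacl::backend::cuda::handle_type h = viennacl::backend::cuda::memory_create(bytes);
      // device memory is released via cuda_deleter once the last handle copy goes away
    }
    catch (viennacl::backend::cuda::cuda_exception const & e)
    {
      std::cerr << e.what() << " (cudaError_t " << e.error_code() << ")" << std::endl;
    }
  }
  #endif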

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/backend/mem_handle.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/backend/mem_handle.hpp b/native-viennaCL/src/main/cpp/viennacl/backend/mem_handle.hpp
new file mode 100644
index 0000000..37c680b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/backend/mem_handle.hpp
@@ -0,0 +1,250 @@
+#ifndef VIENNACL_BACKEND_MEM_HANDLE_HPP
+#define VIENNACL_BACKEND_MEM_HANDLE_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/backend/mem_handle.hpp
+    @brief Implements the multi-memory-domain handle
+*/
+
+#include <vector>
+#include <cassert>
+#include "viennacl/forwards.h"
+#include "viennacl/tools/shared_ptr.hpp"
+#include "viennacl/backend/cpu_ram.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+#include "viennacl/backend/opencl.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+#include "viennacl/backend/cuda.hpp"
+#endif
+
+
+namespace viennacl
+{
+namespace backend
+{
+
+namespace detail
+{
+  /** @brief Singleton for managing the default memory type.
+  *
+  * @param new_mem_type    If NULL, returns the current memory type. Otherwise, sets the memory type to the provided value.
+  */
+  inline memory_types get_set_default_memory_type(memory_types * new_mem_type)
+  {
+    // if a user compiles with CUDA, it is reasonable to expect that CUDA should be the default
+#ifdef VIENNACL_WITH_CUDA
+    static memory_types mem_type = CUDA_MEMORY;
+#elif defined(VIENNACL_WITH_OPENCL)
+    static memory_types mem_type = OPENCL_MEMORY;
+#else
+    static memory_types mem_type = MAIN_MEMORY;
+#endif
+
+    if (new_mem_type)
+      mem_type = *new_mem_type;
+
+    return mem_type;
+  }
+}
+
+/** @brief Returns the default memory type for the given configuration.
+ *
+ * CUDA has precedence over OpenCL, which has precedence over main memory. Depends on which VIENNACL_WITH_{CUDA/OPENCL/OPENMP} macros are defined.
+ */
+inline memory_types default_memory_type() { return detail::get_set_default_memory_type(NULL); }
+
+/** @brief Sets the default memory type for the given configuration.
+ *
+ * Make sure the respective new memory type is enabled.
+ * For example, passing CUDA_MEMORY if no CUDA backend is selected will result in exceptions being thrown as soon as you try to allocate buffers.
+ */
+inline memory_types default_memory_type(memory_types new_memory_type) { return detail::get_set_default_memory_type(&new_memory_type); }
+
+
+/** @brief Main abstraction class for multiple memory domains. Represents a buffer in either main RAM, an OpenCL context, or a CUDA device.
+ *
+ * The idea is to wrap all possible handle types inside this class so that higher-level code does not need to be cluttered with preprocessor switches.
+ * Instead, this class collects all the necessary conditional compilations.
+ *
+ */
+class mem_handle
+{
+public:
+  typedef viennacl::tools::shared_ptr<char>      ram_handle_type;
+  typedef viennacl::tools::shared_ptr<char>      cuda_handle_type;
+
+  /** @brief Default CTOR. No memory is allocated */
+  mem_handle() : active_handle_(MEMORY_NOT_INITIALIZED), size_in_bytes_(0) {}
+
+  /** @brief Returns the handle to a buffer in CPU RAM. NULL is returned if no such buffer has been allocated. */
+  ram_handle_type       & ram_handle()       { return ram_handle_; }
+  /** @brief Returns the handle to a buffer in CPU RAM. NULL is returned if no such buffer has been allocated. */
+  ram_handle_type const & ram_handle() const { return ram_handle_; }
+
+#ifdef VIENNACL_WITH_OPENCL
+  /** @brief Returns the handle to an OpenCL buffer. The handle contains NULL if no such buffer has been allocated. */
+  viennacl::ocl::handle<cl_mem>       & opencl_handle()       { return opencl_handle_; }
+  /** @brief Returns the handle to an OpenCL buffer. The handle contains NULL if no such buffer has been allocated. */
+  viennacl::ocl::handle<cl_mem> const & opencl_handle() const { return opencl_handle_; }
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  /** @brief Returns the handle to a CUDA buffer. The handle contains NULL if no such buffer has been allocated. */
+  cuda_handle_type       & cuda_handle()       { return cuda_handle_; }
+  /** @brief Returns the handle to a CUDA buffer. The handle contains NULL if no such buffer has been allocated. */
+  cuda_handle_type const & cuda_handle() const { return cuda_handle_; }
+#endif
+
+  /** @brief Returns an ID for the currently active memory buffer. Other memory buffers might contain old or no data. */
+  memory_types get_active_handle_id() const { return active_handle_; }
+
+  /** @brief Switches the currently active handle. If no support for that backend is provided, an exception is thrown. */
+  void switch_active_handle_id(memory_types new_id)
+  {
+    if (new_id != active_handle_)
+    {
+      if (active_handle_ == MEMORY_NOT_INITIALIZED)
+        active_handle_ = new_id;
+      else if (active_handle_ == MAIN_MEMORY)
+      {
+        active_handle_ = new_id;
+      }
+      else if (active_handle_ == OPENCL_MEMORY)
+      {
+#ifdef VIENNACL_WITH_OPENCL
+        active_handle_ = new_id;
+#else
+        throw memory_exception("compiled without OpenCL support!");
+#endif
+      }
+      else if (active_handle_ == CUDA_MEMORY)
+      {
+#ifdef VIENNACL_WITH_CUDA
+        active_handle_ = new_id;
+#else
+        throw memory_exception("compiled without CUDA support!");
+#endif
+      }
+      else
+        throw memory_exception("invalid new memory region!");
+    }
+  }
+
+  /** @brief Compares the two handles and returns true if the active memory handles in the two mem_handles point to the same buffer. */
+  bool operator==(mem_handle const & other) const
+  {
+    if (active_handle_ != other.active_handle_)
+      return false;
+
+    switch (active_handle_)
+    {
+    case MAIN_MEMORY:
+      return ram_handle_.get() == other.ram_handle_.get();
+#ifdef VIENNACL_WITH_OPENCL
+    case OPENCL_MEMORY:
+      return opencl_handle_.get() == other.opencl_handle_.get();
+#endif
+#ifdef VIENNACL_WITH_CUDA
+    case CUDA_MEMORY:
+      return cuda_handle_.get() == other.cuda_handle_.get();
+#endif
+    default: break;
+    }
+
+    return false;
+  }
+
+  /** @brief Compares the two handles and returns true if the active memory handle of this mem_handle points to a buffer with a lower address than the other's.
+     * Useful for storing handles in a map, since this provides a strict ordering.
+     */
+  bool operator<(mem_handle const & other) const
+  {
+    if (active_handle_ != other.active_handle_)
+      return false;
+
+    switch (active_handle_)
+    {
+    case MAIN_MEMORY:
+      return ram_handle_.get() < other.ram_handle_.get();
+#ifdef VIENNACL_WITH_OPENCL
+    case OPENCL_MEMORY:
+      return opencl_handle_.get() < other.opencl_handle_.get();
+#endif
+#ifdef VIENNACL_WITH_CUDA
+    case CUDA_MEMORY:
+      return cuda_handle_.get() < other.cuda_handle_.get();
+#endif
+    default: break;
+    }
+
+    return false;
+  }
+
+
+  bool operator!=(mem_handle const & other) const { return !(*this == other); }
+
+  /** @brief Implements a fast swapping method. No data is copied, only the handles are exchanged. */
+  void swap(mem_handle & other)
+  {
+    // swap handle type:
+    memory_types active_handle_tmp = other.active_handle_;
+    other.active_handle_ = active_handle_;
+    active_handle_ = active_handle_tmp;
+
+    // swap ram handle:
+    ram_handle_type ram_handle_tmp = other.ram_handle_;
+    other.ram_handle_ = ram_handle_;
+    ram_handle_ = ram_handle_tmp;
+
+    // swap OpenCL handle:
+#ifdef VIENNACL_WITH_OPENCL
+    opencl_handle_.swap(other.opencl_handle_);
+#endif
+#ifdef VIENNACL_WITH_CUDA
+    cuda_handle_type cuda_handle_tmp = other.cuda_handle_;
+    other.cuda_handle_ = cuda_handle_;
+    cuda_handle_ = cuda_handle_tmp;
+#endif
+  }
+
+  /** @brief Returns the number of bytes of the currently active buffer */
+  vcl_size_t raw_size() const               { return size_in_bytes_; }
+
+  /** @brief Sets the size of the currently active buffer. Use with care! */
+  void        raw_size(vcl_size_t new_size) { size_in_bytes_ = new_size; }
+
+private:
+  memory_types active_handle_;
+  ram_handle_type ram_handle_;
+#ifdef VIENNACL_WITH_OPENCL
+  viennacl::ocl::handle<cl_mem> opencl_handle_;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+  cuda_handle_type        cuda_handle_;
+#endif
+  vcl_size_t size_in_bytes_;
+};
+
+
+} //backend
+} //viennacl
+#endif
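A small sketch of how the handle is used by the generic layer, restricted to the main-memory path so it compiles without the optional backends (illustrative, not part of this commit):

  #include "viennacl/backend/mem_handle.hpp"

  void mem_handle_example()
  {
    using viennacl::backend::mem_handle;

    mem_handle h;                                                    // starts as MEMORY_NOT_INITIALIZED
    h.switch_active_handle_id(viennacl::MAIN_MEMORY);                // select the host backend
    h.ram_handle() = viennacl::backend::cpu_ram::memory_create(64);  // 64 bytes in main RAM
    h.raw_size(64);                                                  // record the buffer size

    mem_handle h2;
    h2.switch_active_handle_id(viennacl::MAIN_MEMORY);
    h2.ram_handle() = h.ram_handle();                                // share the same buffer
    h2.raw_size(h.raw_size());

    bool same = (h == h2);                                           // true: same active RAM buffer
    h.swap(h2);                                                      // exchanges handles, no data copy
    (void)same;
  }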

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/backend/memory.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/backend/memory.hpp b/native-viennaCL/src/main/cpp/viennacl/backend/memory.hpp
new file mode 100644
index 0000000..d6f29a5
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/backend/memory.hpp
@@ -0,0 +1,628 @@
+#ifndef VIENNACL_BACKEND_MEMORY_HPP
+#define VIENNACL_BACKEND_MEMORY_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/backend/memory.hpp
+    @brief Main interface routines for memory management
+*/
+
+#include <vector>
+#include <cassert>
+#include "viennacl/forwards.h"
+#include "viennacl/backend/mem_handle.hpp"
+#include "viennacl/context.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/context.hpp"
+#include "viennacl/backend/util.hpp"
+
+#include "viennacl/backend/cpu_ram.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+#include "viennacl/backend/opencl.hpp"
+#include "viennacl/ocl/backend.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+#include "viennacl/backend/cuda.hpp"
+#endif
+
+
+namespace viennacl
+{
+namespace backend
+{
+
+
+  // if a user compiles with CUDA, it is reasonable to expect that CUDA should be the default
+  /** @brief Synchronizes the execution. finish() will only return after all compute kernels (CUDA, OpenCL) have completed. */
+  inline void finish()
+  {
+#ifdef VIENNACL_WITH_CUDA
+    cudaDeviceSynchronize();
+#endif
+#ifdef VIENNACL_WITH_OPENCL
+    viennacl::ocl::get_queue().finish();
+#endif
+  }
+
+
+
+
+  // Requirements for backend:
+
+  // ---- Memory ----
+  //
+  // * memory_create(size, host_ptr)
+  // * memory_copy(src, dest, offset_src, offset_dest, size)
+  // * memory_write(src, offset, size, ptr)
+  // * memory_read(src, offset, size, ptr)
+  //
+
+  /** @brief Creates an array of the specified size. If the second argument is provided, the buffer is initialized with data from that pointer.
+  *
+  * This is the generic version for CPU RAM, CUDA, and OpenCL. Creates the memory in the currently active memory domain.
+  *
+  * @param handle          The generic wrapper handle for multiple memory domains which will hold the new buffer.
+  * @param size_in_bytes   Number of bytes to allocate
+  * @param ctx             Optional context in which the matrix is created (one out of multiple OpenCL contexts, CUDA, host)
+  * @param host_ptr        Pointer to data which will be copied to the new array. Must point to at least 'size_in_bytes' bytes of data.
+  *
+  */
+  inline void memory_create(mem_handle & handle, vcl_size_t size_in_bytes, viennacl::context const & ctx, const void * host_ptr = NULL)
+  {
+    if (size_in_bytes > 0)
+    {
+      if (handle.get_active_handle_id() == MEMORY_NOT_INITIALIZED)
+        handle.switch_active_handle_id(ctx.memory_type());
+
+      switch (handle.get_active_handle_id())
+      {
+      case MAIN_MEMORY:
+        handle.ram_handle() = cpu_ram::memory_create(size_in_bytes, host_ptr);
+        handle.raw_size(size_in_bytes);
+        break;
+#ifdef VIENNACL_WITH_OPENCL
+      case OPENCL_MEMORY:
+        handle.opencl_handle().context(ctx.opencl_context());
+        handle.opencl_handle() = opencl::memory_create(handle.opencl_handle().context(), size_in_bytes, host_ptr);
+        handle.raw_size(size_in_bytes);
+        break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+      case CUDA_MEMORY:
+        handle.cuda_handle() = cuda::memory_create(size_in_bytes, host_ptr);
+        handle.raw_size(size_in_bytes);
+        break;
+#endif
+      case MEMORY_NOT_INITIALIZED:
+        throw memory_exception("not initialised!");
+      default:
+        throw memory_exception("unknown memory handle!");
+      }
+    }
+  }
+
+  /*
+  inline void memory_create(mem_handle & handle, vcl_size_t size_in_bytes, const void * host_ptr = NULL)
+  {
+    viennacl::context  ctx(default_memory_type());
+    memory_create(handle, size_in_bytes, ctx, host_ptr);
+  }*/
+
+
+  /** @brief Copies 'bytes_to_copy' bytes from address 'src_buffer + src_offset' to memory starting at address 'dst_buffer + dst_offset'.
+  *
+  * This is the generic version for CPU RAM, CUDA, and OpenCL. Copies the memory in the currently active memory domain.
+  *
+  *
+  *  @param src_buffer     A smart pointer to the beginning of an allocated source buffer
+  *  @param dst_buffer     A smart pointer to the beginning of an allocated destination buffer
+  *  @param src_offset     Offset of the first byte to be written from the address given by 'src_buffer' (in bytes)
+  *  @param dst_offset     Offset of the first byte to be written to the address given by 'dst_buffer' (in bytes)
+  *  @param bytes_to_copy  Number of bytes to be copied
+  */
+  inline void memory_copy(mem_handle const & src_buffer,
+                          mem_handle & dst_buffer,
+                          vcl_size_t src_offset,
+                          vcl_size_t dst_offset,
+                          vcl_size_t bytes_to_copy)
+  {
+    assert( src_buffer.get_active_handle_id() == dst_buffer.get_active_handle_id() && bool("memory_copy() must be called on buffers from the same domain") );
+
+    if (bytes_to_copy > 0)
+    {
+      switch (src_buffer.get_active_handle_id())
+      {
+      case MAIN_MEMORY:
+        cpu_ram::memory_copy(src_buffer.ram_handle(), dst_buffer.ram_handle(), src_offset, dst_offset, bytes_to_copy);
+        break;
+#ifdef VIENNACL_WITH_OPENCL
+      case OPENCL_MEMORY:
+        opencl::memory_copy(src_buffer.opencl_handle(), dst_buffer.opencl_handle(), src_offset, dst_offset, bytes_to_copy);
+        break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+      case CUDA_MEMORY:
+        cuda::memory_copy(src_buffer.cuda_handle(), dst_buffer.cuda_handle(), src_offset, dst_offset, bytes_to_copy);
+        break;
+#endif
+      case MEMORY_NOT_INITIALIZED:
+        throw memory_exception("not initialised!");
+      default:
+        throw memory_exception("unknown memory handle!");
+      }
+    }
+  }
+
+  // TODO: Refine this concept. Maybe move to constructor?
+  /** @brief A 'shallow' copy operation from an initialized buffer to an uninitialized buffer.
+   * The uninitialized buffer just copies the raw handle.
+   */
+  inline void memory_shallow_copy(mem_handle const & src_buffer,
+                                  mem_handle & dst_buffer)
+  {
+    assert( (dst_buffer.get_active_handle_id() == MEMORY_NOT_INITIALIZED) && bool("Shallow copy on already initialized memory not supported!"));
+
+    switch (src_buffer.get_active_handle_id())
+    {
+    case MAIN_MEMORY:
+      dst_buffer.switch_active_handle_id(src_buffer.get_active_handle_id());
+      dst_buffer.ram_handle() = src_buffer.ram_handle();
+      dst_buffer.raw_size(src_buffer.raw_size());
+      break;
+#ifdef VIENNACL_WITH_OPENCL
+    case OPENCL_MEMORY:
+      dst_buffer.switch_active_handle_id(src_buffer.get_active_handle_id());
+      dst_buffer.opencl_handle() = src_buffer.opencl_handle();
+      dst_buffer.raw_size(src_buffer.raw_size());
+      break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+    case CUDA_MEMORY:
+      dst_buffer.switch_active_handle_id(src_buffer.get_active_handle_id());
+      dst_buffer.cuda_handle() = src_buffer.cuda_handle();
+      dst_buffer.raw_size(src_buffer.raw_size());
+      break;
+#endif
+    case MEMORY_NOT_INITIALIZED:
+      throw memory_exception("not initialised!");
+    default:
+      throw memory_exception("unknown memory handle!");
+    }
+  }
+
+  /** @brief Writes data from main RAM identified by 'ptr' to the buffer identified by 'dst_buffer'
+  *
+  * This is the generic version for CPU RAM, CUDA, and OpenCL. Writes the memory in the currently active memory domain.
+  *
+  * @param dst_buffer     A smart pointer to the beginning of an allocated buffer
+  * @param dst_offset     Offset of the first written byte from the beginning of 'dst_buffer' (in bytes)
+  * @param bytes_to_write Number of bytes to be written
+  * @param ptr            Pointer to the first byte to be written
+  * @param async              Whether the operation should be asynchronous
+  */
+  inline void memory_write(mem_handle & dst_buffer,
+                           vcl_size_t dst_offset,
+                           vcl_size_t bytes_to_write,
+                           const void * ptr,
+                           bool async = false)
+  {
+    if (bytes_to_write > 0)
+    {
+      switch (dst_buffer.get_active_handle_id())
+      {
+      case MAIN_MEMORY:
+        cpu_ram::memory_write(dst_buffer.ram_handle(), dst_offset, bytes_to_write, ptr, async);
+        break;
+#ifdef VIENNACL_WITH_OPENCL
+      case OPENCL_MEMORY:
+        opencl::memory_write(dst_buffer.opencl_handle(), dst_offset, bytes_to_write, ptr, async);
+        break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+      case CUDA_MEMORY:
+        cuda::memory_write(dst_buffer.cuda_handle(), dst_offset, bytes_to_write, ptr, async);
+        break;
+#endif
+      case MEMORY_NOT_INITIALIZED:
+        throw memory_exception("not initialised!");
+      default:
+        throw memory_exception("unknown memory handle!");
+      }
+    }
+  }
+
+  /** @brief Reads data from a buffer back to main RAM.
+  *
+  * This is the generic version for CPU RAM, CUDA, and OpenCL. Reads the memory from the currently active memory domain.
+  *
+  * @param src_buffer         A smart pointer to the beginning of an allocated source buffer
+  * @param src_offset         Offset of the first byte to be read from the beginning of src_buffer (in bytes)
+  * @param bytes_to_read      Number of bytes to be read
+  * @param ptr                Location in main RAM to which the read data should be written
+  * @param async              Whether the operation should be asynchronous
+  */
+  inline void memory_read(mem_handle const & src_buffer,
+                          vcl_size_t src_offset,
+                          vcl_size_t bytes_to_read,
+                          void * ptr,
+                          bool async = false)
+  {
+    //finish(); //Fixes some issues with AMD APP SDK. However, might sacrifice a few percent of performance in some cases.
+
+    if (bytes_to_read > 0)
+    {
+      switch (src_buffer.get_active_handle_id())
+      {
+      case MAIN_MEMORY:
+        cpu_ram::memory_read(src_buffer.ram_handle(), src_offset, bytes_to_read, ptr, async);
+        break;
+#ifdef VIENNACL_WITH_OPENCL
+      case OPENCL_MEMORY:
+        opencl::memory_read(src_buffer.opencl_handle(), src_offset, bytes_to_read, ptr, async);
+        break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+      case CUDA_MEMORY:
+        cuda::memory_read(src_buffer.cuda_handle(), src_offset, bytes_to_read, ptr, async);
+        break;
+#endif
+      case MEMORY_NOT_INITIALIZED:
+        throw memory_exception("not initialised!");
+      default:
+        throw memory_exception("unknown memory handle!");
+      }
+    }
+  }
+
+
+
+  namespace detail
+  {
+    template<typename T>
+    vcl_size_t element_size(memory_types /* mem_type */)
+    {
+      return sizeof(T);
+    }
+
+
+    template<>
+    inline vcl_size_t element_size<unsigned long>(memory_types
+                                            #ifdef VIENNACL_WITH_OPENCL
+                                                  mem_type  //in order to compile cleanly at -Wextra in GCC
+                                            #endif
+                                                  )
+    {
+#ifdef VIENNACL_WITH_OPENCL
+      if (mem_type == OPENCL_MEMORY)
+        return sizeof(cl_ulong);
+#endif
+      return sizeof(unsigned long);
+    }
+
+    template<>
+    inline vcl_size_t element_size<long>(memory_types
+                                   #ifdef VIENNACL_WITH_OPENCL
+                                         mem_type  //in order to compile cleanly at -Wextra in GCC
+                                   #endif
+                                         )
+    {
+#ifdef VIENNACL_WITH_OPENCL
+      if (mem_type == OPENCL_MEMORY)
+        return sizeof(cl_long);
+#endif
+      return sizeof(long);
+    }
+
+
+    template<>
+    inline vcl_size_t element_size<unsigned int>(memory_types
+                                           #ifdef VIENNACL_WITH_OPENCL
+                                                 mem_type  //in order to compile cleanly at -Wextra in GCC
+                                           #endif
+                                                 )
+    {
+#ifdef VIENNACL_WITH_OPENCL
+      if (mem_type == OPENCL_MEMORY)
+        return sizeof(cl_uint);
+#endif
+      return sizeof(unsigned int);
+    }
+
+    template<>
+    inline vcl_size_t element_size<int>(memory_types
+                                  #ifdef VIENNACL_WITH_OPENCL
+                                        mem_type  //in order to compile cleanly at -Wextra in GCC
+                                  #endif
+                                        )
+    {
+#ifdef VIENNACL_WITH_OPENCL
+      if (mem_type == OPENCL_MEMORY)
+        return sizeof(cl_int);
+#endif
+      return sizeof(int);
+    }
+
+
+  }
+
+
+  /** @brief Switches the active memory domain within a memory handle. Data is copied if the new active domain differs from the old one. Memory in the source handle is not freed. */
+  template<typename DataType>
+  void switch_memory_context(mem_handle & handle, viennacl::context new_ctx)
+  {
+    if (handle.get_active_handle_id() == new_ctx.memory_type())
+      return;
+
+    if (handle.get_active_handle_id() == viennacl::MEMORY_NOT_INITIALIZED || handle.raw_size() == 0)
+    {
+      handle.switch_active_handle_id(new_ctx.memory_type());
+#ifdef VIENNACL_WITH_OPENCL
+      if (new_ctx.memory_type() == OPENCL_MEMORY)
+        handle.opencl_handle().context(new_ctx.opencl_context());
+#endif
+      return;
+    }
+
+    vcl_size_t size_dst = detail::element_size<DataType>(handle.get_active_handle_id());
+    vcl_size_t size_src = detail::element_size<DataType>(new_ctx.memory_type());
+
+    if (size_dst != size_src)  // OpenCL data element size not the same as host data element size
+    {
+      throw memory_exception("Heterogeneous data element sizes not yet supported!");
+    }
+    else //no data conversion required
+    {
+      if (handle.get_active_handle_id() == MAIN_MEMORY) //we can access the existing data directly
+      {
+        switch (new_ctx.memory_type())
+        {
+#ifdef VIENNACL_WITH_OPENCL
+        case OPENCL_MEMORY:
+          handle.opencl_handle().context(new_ctx.opencl_context());
+          handle.opencl_handle() = opencl::memory_create(handle.opencl_handle().context(), handle.raw_size(), handle.ram_handle().get());
+          break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+        case CUDA_MEMORY:
+          handle.cuda_handle() = cuda::memory_create(handle.raw_size(), handle.ram_handle().get());
+          break;
+#endif
+        case MAIN_MEMORY:
+        default:
+          throw memory_exception("Invalid destination domain");
+        }
+      }
+#ifdef VIENNACL_WITH_OPENCL
+      else if (handle.get_active_handle_id() == OPENCL_MEMORY) // data can be dumped into destination directly
+      {
+        std::vector<DataType> buffer;
+
+        switch (new_ctx.memory_type())
+        {
+        case MAIN_MEMORY:
+          handle.ram_handle() = cpu_ram::memory_create(handle.raw_size());
+          opencl::memory_read(handle.opencl_handle(), 0, handle.raw_size(), handle.ram_handle().get());
+          break;
+#ifdef VIENNACL_WITH_CUDA
+        case CUDA_MEMORY:
+          buffer.resize(handle.raw_size() / sizeof(DataType));
+          opencl::memory_read(handle.opencl_handle(), 0, handle.raw_size(), &(buffer[0]));
+          cuda::memory_create(handle.cuda_handle(), handle.raw_size(), &(buffer[0]));
+          break;
+#endif
+        default:
+          throw memory_exception("Invalid destination domain");
+        }
+      }
+#endif
+#ifdef VIENNACL_WITH_CUDA
+      else //CUDA_MEMORY
+      {
+        std::vector<DataType> buffer;
+
+        // write
+        switch (new_ctx.memory_type())
+        {
+        case MAIN_MEMORY:
+          handle.ram_handle() = cpu_ram::memory_create(handle.raw_size());
+          cuda::memory_read(handle.cuda_handle(), 0, handle.raw_size(), handle.ram_handle().get());
+          break;
+#ifdef VIENNACL_WITH_OPENCL
+        case OPENCL_MEMORY:
+          buffer.resize(handle.raw_size() / sizeof(DataType));
+          cuda::memory_read(handle.cuda_handle(), 0, handle.raw_size(), &(buffer[0]));
+          handle.opencl_handle() = opencl::memory_create(handle.raw_size(), &(buffer[0]));
+          break;
+#endif
+        default:
+          throw memory_exception("Unsupported source memory domain");
+        }
+      }
+#endif
+
+      // everything succeeded so far, now switch to new domain:
+      handle.switch_active_handle_id(new_ctx.memory_type());
+
+    } // no data conversion
+  }
+
+
+
+  /** @brief Copies data of the provided 'DataType' from 'handle_src' to 'handle_dst' and converts the data if the binary representation of 'DataType' among the memory domains differs. */
+  template<typename DataType>
+  void typesafe_memory_copy(mem_handle const & handle_src, mem_handle & handle_dst)
+  {
+    if (handle_dst.get_active_handle_id() == MEMORY_NOT_INITIALIZED)
+      handle_dst.switch_active_handle_id(default_memory_type());
+
+    vcl_size_t element_size_src = detail::element_size<DataType>(handle_src.get_active_handle_id());
+    vcl_size_t element_size_dst = detail::element_size<DataType>(handle_dst.get_active_handle_id());
+
+    if (element_size_src != element_size_dst)
+    {
+      // Data needs to be converted.
+
+      typesafe_host_array<DataType> buffer_src(handle_src);
+      typesafe_host_array<DataType> buffer_dst(handle_dst, handle_src.raw_size() / element_size_src);
+
+      //
+      // Step 1: Fill buffer_dst depending on where the data resides:
+      //
+      DataType const * src_data;
+      switch (handle_src.get_active_handle_id())
+      {
+      case MAIN_MEMORY:
+        src_data = reinterpret_cast<DataType const *>(handle_src.ram_handle().get());
+        for (vcl_size_t i=0; i<buffer_dst.size(); ++i)
+          buffer_dst.set(i, src_data[i]);
+        break;
+
+#ifdef VIENNACL_WITH_OPENCL
+      case OPENCL_MEMORY:
+        buffer_src.resize(handle_src, handle_src.raw_size() / element_size_src);
+        opencl::memory_read(handle_src.opencl_handle(), 0, buffer_src.raw_size(), buffer_src.get());
+        for (vcl_size_t i=0; i<buffer_dst.size(); ++i)
+          buffer_dst.set(i, buffer_src[i]);
+        break;
+#endif
+#ifdef VIENNACL_WITH_CUDA
+      case CUDA_MEMORY:
+        buffer_src.resize(handle_src, handle_src.raw_size() / element_size_src);
+        cuda::memory_read(handle_src.cuda_handle(), 0, buffer_src.raw_size(), buffer_src.get());
+        for (vcl_size_t i=0; i<buffer_dst.size(); ++i)
+          buffer_dst.set(i, buffer_src[i]);
+        break;
+#endif
+
+      default:
+        throw memory_exception("unsupported memory domain");
+      }
+
+      //
+      // Step 2: Write to destination
+      //
+      if (handle_dst.raw_size() == buffer_dst.raw_size())
+        viennacl::backend::memory_write(handle_dst, 0, buffer_dst.raw_size(), buffer_dst.get());
+      else
+        viennacl::backend::memory_create(handle_dst, buffer_dst.raw_size(), viennacl::traits::context(handle_dst), buffer_dst.get());
+
+    }
+    else
+    {
+      // No data conversion required.
+      typesafe_host_array<DataType> buffer(handle_src);
+
+      switch (handle_src.get_active_handle_id())
+      {
+      case MAIN_MEMORY:
+        switch (handle_dst.get_active_handle_id())
+        {
+        case MAIN_MEMORY:
+        case OPENCL_MEMORY:
+        case CUDA_MEMORY:
+          if (handle_dst.raw_size() == handle_src.raw_size())
+            viennacl::backend::memory_write(handle_dst, 0, handle_src.raw_size(), handle_src.ram_handle().get());
+          else
+            viennacl::backend::memory_create(handle_dst, handle_src.raw_size(), viennacl::traits::context(handle_dst), handle_src.ram_handle().get());
+          break;
+
+        default:
+          throw memory_exception("unsupported destination memory domain");
+        }
+        break;
+
+      case OPENCL_MEMORY:
+        switch (handle_dst.get_active_handle_id())
+        {
+        case MAIN_MEMORY:
+          if (handle_dst.raw_size() != handle_src.raw_size())
+            viennacl::backend::memory_create(handle_dst, handle_src.raw_size(), viennacl::traits::context(handle_dst));
+          viennacl::backend::memory_read(handle_src, 0, handle_src.raw_size(), handle_dst.ram_handle().get());
+          break;
+
+        case OPENCL_MEMORY:
+          if (handle_dst.raw_size() != handle_src.raw_size())
+            viennacl::backend::memory_create(handle_dst, handle_src.raw_size(), viennacl::traits::context(handle_dst));
+          viennacl::backend::memory_copy(handle_src, handle_dst, 0, 0, handle_src.raw_size());
+          break;
+
+        case CUDA_MEMORY:
+          if (handle_dst.raw_size() != handle_src.raw_size())
+            viennacl::backend::memory_create(handle_dst, handle_src.raw_size(), viennacl::traits::context(handle_dst));
+          buffer.resize(handle_src, handle_src.raw_size() / element_size_src);
+          viennacl::backend::memory_read(handle_src, 0, handle_src.raw_size(), buffer.get());
+          viennacl::backend::memory_write(handle_dst, 0, handle_src.raw_size(), buffer.get());
+          break;
+
+        default:
+          throw memory_exception("unsupported destination memory domain");
+        }
+        break;
+
+      case CUDA_MEMORY:
+        switch (handle_dst.get_active_handle_id())
+        {
+        case MAIN_MEMORY:
+          if (handle_dst.raw_size() != handle_src.raw_size())
+            viennacl::backend::memory_create(handle_dst, handle_src.raw_size(), viennacl::traits::context(handle_dst));
+          viennacl::backend::memory_read(handle_src, 0, handle_src.raw_size(), handle_dst.ram_handle().get());
+          break;
+
+        case OPENCL_MEMORY:
+          if (handle_dst.raw_size() != handle_src.raw_size())
+            viennacl::backend::memory_create(handle_dst, handle_src.raw_size(), viennacl::traits::context(handle_dst));
+          buffer.resize(handle_src, handle_src.raw_size() / element_size_src);
+          viennacl::backend::memory_read(handle_src, 0, handle_src.raw_size(), buffer.get());
+          viennacl::backend::memory_write(handle_dst, 0, handle_src.raw_size(), buffer.get());
+          break;
+
+        case CUDA_MEMORY:
+          if (handle_dst.raw_size() != handle_src.raw_size())
+            viennacl::backend::memory_create(handle_dst, handle_src.raw_size(), viennacl::traits::context(handle_dst));
+          viennacl::backend::memory_copy(handle_src, handle_dst, 0, 0, handle_src.raw_size());
+          break;
+
+        default:
+          throw memory_exception("unsupported destination memory domain");
+        }
+        break;
+
+      default:
+        throw memory_exception("unsupported source memory domain");
+      }
+
+    }
+  }
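+
+  // Illustrative usage sketch (editor's addition, not part of ViennaCL). It shows how the routine
+  // above can migrate index data between two handles; the handle setup is an assumption for
+  // illustration and uses only memory_create() as employed elsewhere in this file:
+  //
+  //   std::vector<unsigned int> indices(64, 42);
+  //   viennacl::backend::mem_handle src, dst;
+  //   viennacl::backend::memory_create(src, indices.size() * sizeof(unsigned int),
+  //                                    viennacl::context(viennacl::MAIN_MEMORY), &indices[0]);
+  //   // dst is not yet initialized, so it falls back to the default memory domain;
+  //   // unsigned int is converted to cl_uint on the fly if their sizes differ:
+  //   viennacl::backend::typesafe_memory_copy<unsigned int>(src, dst);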
+
+
+} //backend
+
+//
+// Convenience layer:
+//
+/** @brief Generic convenience routine for migrating data of an object to a new memory domain */
+template<typename T>
+void switch_memory_context(T & obj, viennacl::context new_ctx)
+{
+  obj.switch_memory_context(new_ctx);
+}
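+
+// Illustrative usage sketch (editor's addition): the convenience routine above merely forwards to
+// the object's own switch_memory_context() member, so migrating e.g. a viennacl::vector to host
+// memory reads as follows (assuming viennacl/vector.hpp is included):
+//
+//   viennacl::vector<double> v(1000);
+//   viennacl::switch_memory_context(v, viennacl::context(viennacl::MAIN_MEMORY));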
+
+} //viennacl
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/backend/opencl.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/backend/opencl.hpp b/native-viennaCL/src/main/cpp/viennacl/backend/opencl.hpp
new file mode 100644
index 0000000..a8be55a
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/backend/opencl.hpp
@@ -0,0 +1,151 @@
+#ifndef VIENNACL_BACKEND_OPENCL_HPP_
+#define VIENNACL_BACKEND_OPENCL_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/backend/opencl.hpp
+    @brief Implementations for the OpenCL backend functionality
+*/
+
+
+#include <vector>
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/ocl/backend.hpp"
+
+namespace viennacl
+{
+namespace backend
+{
+namespace opencl
+{
+
+// Requirements for backend:
+
+// * memory_create(size, host_ptr)
+// * memory_copy(src, dest, offset_src, offset_dest, size)
+// * memory_write_from_main_memory(src, offset, size,
+//                                 dest, offset, size)
+// * memory_read_to_main_memory(src, offset, size
+//                              dest, offset, size)
+// *
+//
+
+/** @brief Creates a buffer of the specified size in the provided OpenCL context. If 'host_ptr' is provided, the buffer is initialized with data from that pointer.
+ *
+ * @param ctx             The OpenCL context in which the buffer is created
+ * @param size_in_bytes   Number of bytes to allocate
+ * @param host_ptr        Pointer to data which will be copied to the new buffer. Must point to at least 'size_in_bytes' bytes of data.
+ *
+ */
+inline cl_mem memory_create(viennacl::ocl::context const & ctx, vcl_size_t size_in_bytes, const void * host_ptr = NULL)
+{
+  //std::cout << "Creating buffer (" << size_in_bytes << " bytes) host buffer " << host_ptr << " in context " << &ctx << std::endl;
+  return ctx.create_memory_without_smart_handle(CL_MEM_READ_WRITE, static_cast<unsigned int>(size_in_bytes), const_cast<void *>(host_ptr));
+}
+
+/** @brief Copies 'bytes_to_copy' bytes from address 'src_buffer + src_offset' in the OpenCL context to memory starting at address 'dst_buffer + dst_offset' in the same OpenCL context.
+ *
+ *  @param src_buffer     A smart pointer to the beginning of an allocated OpenCL source buffer
+ *  @param dst_buffer     A smart pointer to the beginning of an allocated OpenCL destination buffer
+ *  @param src_offset     Offset of the first byte to be read from the address given by 'src_buffer' (in bytes)
+ *  @param dst_offset     Offset of the first byte to be written to the address given by 'dst_buffer' (in bytes)
+ *  @param bytes_to_copy  Number of bytes to be copied
+ */
+inline void memory_copy(viennacl::ocl::handle<cl_mem> const & src_buffer,
+                        viennacl::ocl::handle<cl_mem> & dst_buffer,
+                        vcl_size_t src_offset,
+                        vcl_size_t dst_offset,
+                        vcl_size_t bytes_to_copy)
+{
+  assert( &src_buffer.context() == &dst_buffer.context() && bool("Transfer between memory buffers in different contexts not supported yet!"));
+
+  viennacl::ocl::context & memory_context = const_cast<viennacl::ocl::context &>(src_buffer.context());
+  cl_int err = clEnqueueCopyBuffer(memory_context.get_queue().handle().get(),
+                                   src_buffer.get(),
+                                   dst_buffer.get(),
+                                   src_offset,
+                                   dst_offset,
+                                   bytes_to_copy,
+                                   0, NULL, NULL);  //events
+  VIENNACL_ERR_CHECK(err);
+}
+
+
+/** @brief Writes data from main RAM identified by 'ptr' to the OpenCL buffer identified by 'dst_buffer'
+ *
+ * @param dst_buffer    A smart pointer to the beginning of an allocated OpenCL buffer
+ * @param dst_offset    Offset of the first written byte from the beginning of 'dst_buffer' (in bytes)
+ * @param bytes_to_copy Number of bytes to be copied
+ * @param ptr           Pointer to the first byte of the data in main RAM to be written
+ * @param async         Whether the operation should be asynchronous
+ */
+inline void memory_write(viennacl::ocl::handle<cl_mem> & dst_buffer,
+                         vcl_size_t dst_offset,
+                         vcl_size_t bytes_to_copy,
+                         const void * ptr,
+                         bool async = false)
+{
+
+  viennacl::ocl::context & memory_context = const_cast<viennacl::ocl::context &>(dst_buffer.context());
+
+#if defined(VIENNACL_DEBUG_ALL) || defined(VIENNACL_DEBUG_DEVICE)
+  std::cout << "Writing data (" << bytes_to_copy << " bytes, offset " << dst_offset << ") to OpenCL buffer " << dst_buffer.get() << " with queue " << memory_context.get_queue().handle().get() << " from " << ptr << std::endl;
+#endif
+
+  cl_int err = clEnqueueWriteBuffer(memory_context.get_queue().handle().get(),
+                                    dst_buffer.get(),
+                                    async ? CL_FALSE : CL_TRUE,             //blocking
+                                    dst_offset,
+                                    bytes_to_copy,
+                                    ptr,
+                                    0, NULL, NULL);      //events
+  VIENNACL_ERR_CHECK(err);
+}
+
+
+/** @brief Reads data from an OpenCL buffer back to main RAM.
+ *
+ * @param src_buffer         A smart pointer to the beginning of an allocated OpenCL source buffer
+ * @param src_offset         Offset of the first byte to be read from the beginning of src_buffer (in bytes)
+ * @param bytes_to_copy      Number of bytes to be read
+ * @param ptr                Location in main RAM to which the read data is written
+ * @param async         Whether the operation should be asynchronous
+ */
+inline void memory_read(viennacl::ocl::handle<cl_mem> const & src_buffer,
+                        vcl_size_t src_offset,
+                        vcl_size_t bytes_to_copy,
+                        void * ptr,
+                        bool async = false)
+{
+  //std::cout << "Reading data (" << bytes_to_copy << " bytes, offset " << src_offset << ") from OpenCL buffer " << src_buffer.get() << " to " << ptr << std::endl;
+  viennacl::ocl::context & memory_context = const_cast<viennacl::ocl::context &>(src_buffer.context());
+  cl_int err =  clEnqueueReadBuffer(memory_context.get_queue().handle().get(),
+                                    src_buffer.get(),
+                                    async ? CL_FALSE : CL_TRUE,             //blocking
+                                    src_offset,
+                                    bytes_to_copy,
+                                    ptr,
+                                    0, NULL, NULL);      //events
+  VIENNACL_ERR_CHECK(err);
+}
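+
+// Illustrative usage sketch (editor's addition, not part of ViennaCL): a host -> device -> host
+// round trip with the low-level routines above. The use of viennacl::ocl::current_context() and of
+// the viennacl::ocl::handle<cl_mem> constructor for wrapping the raw cl_mem are assumptions made
+// for illustration only:
+//
+//   std::vector<float> host_data(256, 1.0f);
+//   viennacl::ocl::context & ctx = viennacl::ocl::current_context();
+//   viennacl::ocl::handle<cl_mem> buf(memory_create(ctx, host_data.size() * sizeof(float)), ctx);
+//   memory_write(buf, 0, host_data.size() * sizeof(float), &host_data[0]);  // host -> device (blocking)
+//   memory_read (buf, 0, host_data.size() * sizeof(float), &host_data[0]);  // device -> host (blocking)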
+
+
+}
+} //backend
+} //viennacl
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/backend/util.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/backend/util.hpp b/native-viennaCL/src/main/cpp/viennacl/backend/util.hpp
new file mode 100644
index 0000000..9aaeb2e
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/backend/util.hpp
@@ -0,0 +1,268 @@
+#ifndef VIENNACL_BACKEND_UTIL_HPP
+#define VIENNACL_BACKEND_UTIL_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/backend/util.hpp
+    @brief Helper functionality for working with different memory domains
+*/
+
+#include <vector>
+#include <cassert>
+
+#include "viennacl/forwards.h"
+#include "viennacl/backend/mem_handle.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+#include "viennacl/backend/opencl.hpp"
+#endif
+
+
+namespace viennacl
+{
+namespace backend
+{
+namespace detail
+{
+
+  /** @brief Helper struct for converting a type to its OpenCL pendant. */
+  template<typename T>
+  struct convert_to_opencl
+  {
+    typedef T    type;
+    enum { special = 0 };
+  };
+
+#ifdef VIENNACL_WITH_OPENCL
+  template<>
+  struct convert_to_opencl<unsigned int>
+  {
+    typedef cl_uint    type;
+    //enum { special = (sizeof(unsigned int) != sizeof(cl_uint)) };
+    enum { special = 1 };
+  };
+
+  template<>
+  struct convert_to_opencl<int>
+  {
+    typedef cl_int    type;
+    //enum { special = (sizeof(int) != sizeof(cl_int)) };
+    enum { special = 1 };
+  };
+
+
+  template<>
+  struct convert_to_opencl<unsigned long>
+  {
+    typedef cl_ulong    type;
+    //enum { special = (sizeof(unsigned long) != sizeof(cl_ulong)) };
+    enum { special = 1 };
+  };
+
+  template<>
+  struct convert_to_opencl<long>
+  {
+    typedef cl_long    type;
+    //enum { special = (sizeof(long) != sizeof(cl_long)) };
+    enum { special = 1 };
+  };
+#endif
+
+
+} //namespace detail
+
+
+/** @brief Helper class implementing an array on the host. Default case: No conversion necessary */
+template<typename T, bool special = detail::convert_to_opencl<T>::special>
+class typesafe_host_array
+{
+  typedef T                                              cpu_type;
+  typedef typename detail::convert_to_opencl<T>::type    target_type;
+
+public:
+  explicit typesafe_host_array() : bytes_buffer_(NULL), buffer_size_(0) {}
+
+  explicit typesafe_host_array(mem_handle const & handle, vcl_size_t num = 0) : bytes_buffer_(NULL), buffer_size_(sizeof(cpu_type) * num)
+  {
+    resize(handle, num);
+  }
+
+  ~typesafe_host_array() { delete[] bytes_buffer_; }
+
+  //
+  // Setter and Getter
+  //
+  void * get() { return reinterpret_cast<void *>(bytes_buffer_); }
+  vcl_size_t raw_size() const { return buffer_size_; }
+  vcl_size_t element_size() const  {  return sizeof(cpu_type); }
+  vcl_size_t size() const { return buffer_size_ / element_size(); }
+  template<typename U>
+  void set(vcl_size_t index, U value)
+  {
+    reinterpret_cast<cpu_type *>(bytes_buffer_)[index] = static_cast<cpu_type>(value);
+  }
+
+  //
+  // Resize functionality
+  //
+
+  /** @brief Resize without initializing the new memory */
+  void raw_resize(mem_handle const & /*handle*/, vcl_size_t num)
+  {
+    buffer_size_ = sizeof(cpu_type) * num;
+
+    if (num > 0)
+    {
+      delete[] bytes_buffer_;
+
+      bytes_buffer_ = new char[buffer_size_];
+    }
+  }
+
+  /** @brief Resize including initialization of new memory (cf. std::vector<>) */
+  void resize(mem_handle const & handle, vcl_size_t num)
+  {
+    raw_resize(handle, num);
+
+    if (num > 0)
+    {
+      for (vcl_size_t i=0; i<buffer_size_; ++i)
+        bytes_buffer_[i] = 0;
+    }
+  }
+
+  cpu_type operator[](vcl_size_t index) const
+  {
+    assert(index < size() && bool("index out of bounds"));
+
+    return reinterpret_cast<cpu_type *>(bytes_buffer_)[index];
+  }
+
+private:
+  char * bytes_buffer_;
+  vcl_size_t buffer_size_;
+};
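+
+// Illustrative usage sketch (editor's addition): in this default case no type conversion takes place
+// and the object is simply a typed view on a raw byte buffer. The handle 'h' below is assumed to be
+// a host handle set up elsewhere:
+//
+//   viennacl::backend::typesafe_host_array<float> tmp(h, 16);   // 16 zero-initialized entries
+//   tmp.set(0, 3.14f);
+//   float first = tmp[0];
+//   // tmp.get() and tmp.raw_size() can be passed to memory_write()/memory_read()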
+
+
+
+
+/** @brief Special host array type for conversion between OpenCL types and pure CPU types */
+template<typename T>
+class typesafe_host_array<T, true>
+{
+  typedef T                                              cpu_type;
+  typedef typename detail::convert_to_opencl<T>::type    target_type;
+
+public:
+  explicit typesafe_host_array() : convert_to_opencl_( (default_memory_type() == OPENCL_MEMORY) ? true : false), bytes_buffer_(NULL), buffer_size_(0) {}
+
+  explicit typesafe_host_array(mem_handle const & handle, vcl_size_t num = 0) : convert_to_opencl_(false), bytes_buffer_(NULL), buffer_size_(sizeof(cpu_type) * num)
+  {
+    resize(handle, num);
+  }
+
+  ~typesafe_host_array() { delete[] bytes_buffer_; }
+
+  //
+  // Setter and Getter
+  //
+
+  template<typename U>
+  void set(vcl_size_t index, U value)
+  {
+#ifdef VIENNACL_WITH_OPENCL
+    if (convert_to_opencl_)
+      reinterpret_cast<target_type *>(bytes_buffer_)[index] = static_cast<target_type>(value);
+    else
+#endif
+      reinterpret_cast<cpu_type *>(bytes_buffer_)[index] = static_cast<cpu_type>(value);
+  }
+
+  void * get() { return reinterpret_cast<void *>(bytes_buffer_); }
+  cpu_type operator[](vcl_size_t index) const
+  {
+    assert(index < size() && bool("index out of bounds"));
+#ifdef VIENNACL_WITH_OPENCL
+    if (convert_to_opencl_)
+      return static_cast<cpu_type>(reinterpret_cast<target_type *>(bytes_buffer_)[index]);
+#endif
+    return reinterpret_cast<cpu_type *>(bytes_buffer_)[index];
+  }
+
+  vcl_size_t raw_size() const { return buffer_size_; }
+  vcl_size_t element_size() const
+  {
+#ifdef VIENNACL_WITH_OPENCL
+    if (convert_to_opencl_)
+      return sizeof(target_type);
+#endif
+    return sizeof(cpu_type);
+  }
+  vcl_size_t size() const { return buffer_size_ / element_size(); }
+
+  //
+  // Resize functionality
+  //
+
+  /** @brief Resize without initializing the new memory */
+  void raw_resize(mem_handle const & handle, vcl_size_t num)
+  {
+    buffer_size_ = sizeof(cpu_type) * num;
+    (void)handle; //silence unused variable warning if compiled without OpenCL support
+
+#ifdef VIENNACL_WITH_OPENCL
+    memory_types mem_type = handle.get_active_handle_id();
+    if (mem_type == MEMORY_NOT_INITIALIZED)
+      mem_type = default_memory_type();
+
+    if (mem_type == OPENCL_MEMORY)
+    {
+      convert_to_opencl_ = true;
+      buffer_size_ = sizeof(target_type) * num;
+    }
+#endif
+
+    if (num > 0)
+    {
+      delete[] bytes_buffer_;
+
+      bytes_buffer_ = new char[buffer_size_];
+    }
+  }
+
+  /** @brief Resize including initialization of new memory (cf. std::vector<>) */
+  void resize(mem_handle const & handle, vcl_size_t num)
+  {
+    raw_resize(handle, num);
+
+    if (num > 0)
+    {
+      for (vcl_size_t i=0; i<buffer_size_; ++i)
+        bytes_buffer_[i] = 0;
+    }
+  }
+
+private:
+  bool convert_to_opencl_;
+  char * bytes_buffer_;
+  vcl_size_t buffer_size_;
+};
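+
+// Illustrative usage sketch (editor's addition): for index types the specialization above transparently
+// switches to the OpenCL pendant (e.g. cl_uint) whenever the associated handle lives in OpenCL memory,
+// so reading an index buffer back to the host is size-safe. 'handle' and 'n' below are assumptions,
+// and viennacl/backend/memory.hpp is assumed to be included for the generic memory_read():
+//
+//   viennacl::backend::typesafe_host_array<unsigned int> idx(handle, n);
+//   viennacl::backend::memory_read(handle, 0, idx.raw_size(), idx.get());
+//   unsigned int first_index = idx[0];   // converted from cl_uint if necessary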
+
+} //backend
+} //viennacl
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/circulant_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/circulant_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/circulant_matrix.hpp
new file mode 100644
index 0000000..1ee13d5
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/circulant_matrix.hpp
@@ -0,0 +1,359 @@
+#ifndef VIENNACL_CIRCULANT_MATRIX_HPP
+#define VIENNACL_CIRCULANT_MATRIX_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= */
+
+/** @file viennacl/circulant_matrix.hpp
+    @brief Implementation of the circulant_matrix class for efficient manipulation of circulant matrices.  Experimental.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/vector.hpp"
+#include "viennacl/ocl/backend.hpp"
+
+#include "viennacl/linalg/circulant_matrix_operations.hpp"
+
+#include "viennacl/fft.hpp"
+
+namespace viennacl
+{
+/** @brief A Circulant matrix class
+  *
+  * @tparam NumericT  The underlying scalar type (either float or double)
+  * @tparam AlignmentV   The internal memory size is given by (size()/AlignmentV + 1) * AlignmentV. AlignmentV must be a power of two. Best values are usually 4, 8 or 16; higher values are usually a waste of memory.
+  */
+template<class NumericT, unsigned int AlignmentV>
+class circulant_matrix
+{
+public:
+  typedef viennacl::backend::mem_handle                                                              handle_type;
+  typedef scalar<typename viennacl::tools::CHECK_SCALAR_TEMPLATE_ARGUMENT<NumericT>::ResultType>   value_type;
+
+  /**
+    * @brief The default constructor. Does not allocate any memory.
+    *
+    */
+  explicit circulant_matrix() {}
+
+  /**
+    * @brief         Creates the matrix with the given size
+    *
+    * @param rows      Number of rows of the matrix
+    * @param cols      Number of columns of the matrix
+    */
+  explicit circulant_matrix(vcl_size_t rows, vcl_size_t cols) : elements_(rows)
+  {
+    assert(rows == cols && bool("Circulant matrix must be square!"));
+    (void)cols;  // avoid 'unused parameter' warning in optimized builds
+  }
+
+  /** @brief Resizes the matrix.
+   *   Existing entries can be preserved
+   *
+   * @param sz         New size of matrix
+   * @param preserve   If true, existing values are preserved.
+   */
+  void resize(vcl_size_t sz, bool preserve = true)
+  {
+    elements_.resize(sz, preserve);
+  }
+
+  /** @brief Returns the OpenCL handle
+   *
+   *   @return OpenCL handle
+   */
+  handle_type const & handle() const { return elements_.handle(); }
+
+  /**
+    * @brief Returns the internal viennacl::vector holding the matrix elements (the first column of the circulant matrix)
+    *
+    */
+  viennacl::vector<NumericT, AlignmentV> & elements() { return elements_; }
+  viennacl::vector<NumericT, AlignmentV> const & elements() const { return elements_; }
+
+  /**
+    * @brief Returns the number of rows of the matrix
+    */
+  vcl_size_t size1() const { return elements_.size(); }
+
+  /**
+    * @brief Returns the number of columns of the matrix
+    */
+  vcl_size_t size2() const { return elements_.size(); }
+
+  /** @brief Returns the internal size of the matrix representation.
+   *   Usually required for launching OpenCL kernels only
+   *
+   *   @return Internal size of matrix representation
+   */
+  vcl_size_t internal_size() const { return elements_.internal_size(); }
+
+  /**
+    * @brief Read-write access to a single element of the matrix
+    *
+    * @param row_index  Row index of accessed element
+    * @param col_index  Column index of accessed element
+    * @return Proxy for matrix entry
+    */
+  entry_proxy<NumericT> operator()(vcl_size_t row_index, vcl_size_t col_index)
+  {
+    long index = static_cast<long>(row_index) - static_cast<long>(col_index);
+
+    assert(row_index < size1() && col_index < size2() && bool("Invalid access"));
+
+    while (index < 0)
+      index += static_cast<long>(size1());
+    return elements_[static_cast<vcl_size_t>(index)];
+  }
+
+  /**
+    * @brief += operation for circulant matrices
+    *
+    * @param that Matrix which will be added
+    * @return Result of addition
+    */
+  circulant_matrix<NumericT, AlignmentV>& operator +=(circulant_matrix<NumericT, AlignmentV>& that)
+  {
+    elements_ += that.elements();
+    return *this;
+  }
+
+private:
+  circulant_matrix(circulant_matrix const &) {}
+  circulant_matrix & operator=(circulant_matrix const & t);
+
+  viennacl::vector<NumericT, AlignmentV> elements_;
+};
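+
+// Illustrative usage sketch (editor's addition, not part of ViennaCL): a circulant matrix is fully
+// determined by its first column, which is what elements() stores. The alignment value and the use
+// of viennacl::linalg::prod() (assuming viennacl/linalg/prod.hpp is included) are illustrative
+// assumptions:
+//
+//   viennacl::circulant_matrix<float, 1> C(4, 4);
+//   C(0, 0) = 1.0f;                     // entry (i,j) maps to column element (i - j) mod size1()
+//   C(1, 0) = 2.0f;
+//   viennacl::vector<float> x(4), y(4);
+//   y = viennacl::linalg::prod(C, x);   // dispatched via the op_executor specializations below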
+
+/** @brief Copies a circulant matrix from the std::vector to the OpenCL device (either GPU or multi-core CPU)
+  *
+  *
+  * @param cpu_vec   A std::vector on the host.
+  * @param gpu_mat   A circulant_matrix from ViennaCL
+  */
+template<typename NumericT, unsigned int AlignmentV>
+void copy(std::vector<NumericT>& cpu_vec, circulant_matrix<NumericT, AlignmentV>& gpu_mat)
+{
+  assert( (gpu_mat.size1() == 0 || cpu_vec.size() == gpu_mat.size1()) && bool("Size mismatch"));
+  copy(cpu_vec, gpu_mat.elements());
+}
+
+/** @brief Copies a circulant matrix from the OpenCL device (either GPU or multi-core CPU) to the std::vector
+  *
+  *
+  * @param gpu_mat   A circulant_matrix from ViennaCL
+  * @param cpu_vec   A std::vector on the host.
+  */
+template<typename NumericT, unsigned int AlignmentV>
+void copy(circulant_matrix<NumericT, AlignmentV>& gpu_mat, std::vector<NumericT>& cpu_vec)
+{
+  assert(cpu_vec.size() == gpu_mat.size1() && bool("Size mismatch"));
+  copy(gpu_mat.elements(), cpu_vec);
+}
+
+/** @brief Copies a circulant matrix from the OpenCL device (either GPU or multi-core CPU) to the matrix-like object
+  *
+  *
+  * @param circ_src   A circulant_matrix from ViennaCL
+  * @param com_dst   A matrix-like object
+  */
+template<typename NumericT, unsigned int AlignmentV, typename MatrixT>
+void copy(circulant_matrix<NumericT, AlignmentV>& circ_src, MatrixT& com_dst)
+{
+  vcl_size_t size = circ_src.size1();
+  assert(size == viennacl::traits::size1(com_dst) && bool("Size mismatch"));
+  assert(size == viennacl::traits::size2(com_dst) && bool("Size mismatch"));
+  std::vector<NumericT> tmp(size);
+  copy(circ_src, tmp);
+
+  for (vcl_size_t i = 0; i < size; i++)
+  {
+    for (vcl_size_t j = 0; j < size; j++)
+    {
+      long index = static_cast<long>(i) - static_cast<long>(j);
+      if (index < 0)
+        index += static_cast<long>(size);
+      com_dst(i, j) = tmp[static_cast<vcl_size_t>(index)];
+    }
+  }
+}
+
+/** @brief Copies a matrix-like object on the host to a circulant_matrix on the OpenCL device (either GPU or multi-core CPU)
+  *
+  *
+  * @param com_src    A matrix-like object on the host
+  * @param circ_dst   A circulant_matrix from ViennaCL
+  */
+template<typename NumericT, unsigned int AlignmentV, typename MatrixT>
+void copy(MatrixT& com_src, circulant_matrix<NumericT, AlignmentV>& circ_dst)
+{
+  assert( (circ_dst.size1() == 0 || circ_dst.size1() == viennacl::traits::size1(com_src)) && bool("Size mismatch"));
+  assert( (circ_dst.size2() == 0 || circ_dst.size2() == viennacl::traits::size2(com_src)) && bool("Size mismatch"));
+
+  vcl_size_t size = viennacl::traits::size1(com_src);
+
+  std::vector<NumericT> tmp(size);
+
+  for (vcl_size_t i = 0; i < size; i++) tmp[i] = com_src(i, 0);
+
+  copy(tmp, circ_dst);
+}
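+
+// Illustrative usage sketch (editor's addition): the copy() overloads above follow the usual ViennaCL
+// host <-> device transfer idiom; only the defining first column is transferred. A hypothetical
+// round trip:
+//
+//   std::vector<float> first_column(4);
+//   first_column[0] = 1.0f; first_column[1] = 2.0f;
+//   viennacl::circulant_matrix<float, 1> C(4, 4);
+//   viennacl::copy(first_column, C);    // host -> device
+//   viennacl::copy(C, first_column);    // device -> host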
+
+/*namespace linalg
+  {
+    template<typename NumericT, unsigned int AlignmentV, unsigned int VECTOR_AlignmentV>
+    void prod_impl(circulant_matrix<NumericT, AlignmentV> const & mat,
+                    vector<NumericT, VECTOR_AlignmentV> const & vec,
+                    vector<NumericT, VECTOR_AlignmentV>& result) {
+        viennacl::vector<NumericT, VECTOR_AlignmentV> circ(mat.elements().size() * 2);
+        fft::real_to_complex(mat.elements(), circ, mat.elements().size());
+
+        viennacl::vector<NumericT, VECTOR_AlignmentV> tmp(vec.size() * 2);
+        viennacl::vector<NumericT, VECTOR_AlignmentV> tmp2(vec.size() * 2);
+
+        fft::real_to_complex(vec, tmp, vec.size());
+        fft::convolve(circ, tmp, tmp2);
+        fft::complex_to_real(tmp2, result, vec.size());
+    }
+  }*/
+
+/** @brief Prints the matrix. Output is compatible with boost::numeric::ublas
+  *
+  * @param s            STL output stream
+  * @param gpu_matrix   A ViennaCL circulant matrix
+  */
+template<class NumericT, unsigned int AlignmentV>
+std::ostream & operator<<(std::ostream& s, circulant_matrix<NumericT, AlignmentV>& gpu_matrix)
+{
+  vcl_size_t size = gpu_matrix.size1();
+  std::vector<NumericT> tmp(size);
+  copy(gpu_matrix, tmp);
+  s << "[" << size << "," << size << "](";
+
+  for (vcl_size_t i = 0; i < size; i++)
+  {
+    s << "(";
+    for (vcl_size_t j = 0; j < size; j++)
+    {
+      long index = static_cast<long>(i) - static_cast<long>(j);
+      if (index < 0) index = static_cast<long>(size) + index;
+      s << tmp[vcl_size_t(index)];
+      //s << index;
+      if (j < (size - 1)) s << ",";
+    }
+    s << ")";
+  }
+  s << ")";
+  return s;
+}
+
+//
+// Specify available operations:
+//
+
+/** \cond */
+
+namespace linalg
+{
+namespace detail
+{
+  // x = A * y
+  template<typename T, unsigned int A>
+  struct op_executor<vector_base<T>, op_assign, vector_expression<const circulant_matrix<T, A>, const vector_base<T>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const circulant_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+    {
+      // check for the special case x = A * x
+      if (viennacl::traits::handle(lhs) == viennacl::traits::handle(rhs.rhs()))
+      {
+        viennacl::vector<T> temp(lhs);
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+        lhs = temp;
+      }
+      else
+        viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), lhs);
+    }
+  };
+
+  template<typename T, unsigned int A>
+  struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const circulant_matrix<T, A>, const vector_base<T>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const circulant_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(lhs);
+      viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+      lhs += temp;
+    }
+  };
+
+  template<typename T, unsigned int A>
+  struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const circulant_matrix<T, A>, const vector_base<T>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const circulant_matrix<T, A>, const vector_base<T>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(lhs);
+      viennacl::linalg::prod_impl(rhs.lhs(), rhs.rhs(), temp);
+      lhs -= temp;
+    }
+  };
+
+
+  // x = A * vec_op
+  template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+  struct op_executor<vector_base<T>, op_assign, vector_expression<const circulant_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const circulant_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(rhs.rhs());
+      viennacl::linalg::prod_impl(rhs.lhs(), temp, lhs);
+    }
+  };
+
+  // x = A * vec_op
+  template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+  struct op_executor<vector_base<T>, op_inplace_add, vector_expression<const circulant_matrix<T, A>, vector_expression<const LHS, const RHS, OP>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const circulant_matrix<T, A>, vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(rhs.rhs());
+      viennacl::vector<T> temp_result(lhs);
+      viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+      lhs += temp_result;
+    }
+  };
+
+  // x = A * vec_op
+  template<typename T, unsigned int A, typename LHS, typename RHS, typename OP>
+  struct op_executor<vector_base<T>, op_inplace_sub, vector_expression<const circulant_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> >
+  {
+    static void apply(vector_base<T> & lhs, vector_expression<const circulant_matrix<T, A>, const vector_expression<const LHS, const RHS, OP>, op_prod> const & rhs)
+    {
+      viennacl::vector<T> temp(rhs.rhs());
+      viennacl::vector<T> temp_result(lhs);
+      viennacl::linalg::prod_impl(rhs.lhs(), temp, temp_result);
+      lhs -= temp_result;
+    }
+  };
+
+} // namespace detail
+} // namespace linalg
+
+/** \endcond */
+}
+
+#endif // VIENNACL_CIRCULANT_MATRIX_HPP