You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@singa.apache.org by wa...@apache.org on 2016/08/31 04:37:02 UTC

[2/4] incubator-singa git commit: SINGA-243 ViennaCL backend for OpenCL support

SINGA-243 ViennaCL backend for OpenCL support

 - Replaced current OpenCL backend with ViennaCL libs
 - Removed need for OpenCL C++ headers
 - Updated unit tests files to match changes
 - Updated Dependencies.cmake to match changes

 - Added license information.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/595302a3
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/595302a3
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/595302a3

Branch: refs/heads/master
Commit: 595302a36e9fee2ef63b57ddf81fe75a5fe00a40
Parents: 65bf582
Author: Tan Li Boon <un...@users.noreply.github.com>
Authored: Wed Aug 17 12:48:18 2016 +0800
Committer: Tan Li Boon <ta...@u.nus.edu>
Committed: Wed Aug 31 10:53:21 2016 +0800

----------------------------------------------------------------------
 .travis.yml                           |    6 +-
 CMakeLists.txt                        |    3 +-
 LICENSE                               |    4 +-
 cmake/Dependencies.cmake              |   25 +-
 cmake/Thirdparty/FindViennaCL.cmake   |   45 ++
 include/singa/core/common.h           |   10 +-
 include/singa/core/device.h           |   56 +-
 include/singa/utils/opencl_utils.h    |  142 +---
 src/core/device/opencl_device.cc      |  182 ++---
 src/core/tensor/tensor_math_opencl.cl |   19 +-
 src/core/tensor/tensor_math_opencl.h  | 1007 ++++++++--------------------
 src/utils/opencl_utils.cc             |   63 --
 test/singa/test_opencl.cc             |  700 ++++++++-----------
 test/singa/test_opencl_device.cc      |  108 +++
 14 files changed, 822 insertions(+), 1548 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/.travis.yml
----------------------------------------------------------------------
diff --git a/.travis.yml b/.travis.yml
index 8b1f89c..d9bba51 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -7,13 +7,13 @@ before_install:
  - sudo apt-get -qq update
  - sudo apt-get install -qq -y libopenblas-dev libgoogle-glog-dev libprotobuf-dev protobuf-compiler
  - sudo apt-get install -qq -y opencl-headers ocl-icd-*
- - wget https://github.com/KhronosGroup/OpenCL-CLHPP/releases/download/v2.0.9/cl2.hpp
- - sudo mv cl2.hpp /usr/include/CL/
+#- wget https://github.com/KhronosGroup/OpenCL-CLHPP/releases/download/v2.0.9/cl2.hpp
+#- sudo mv cl2.hpp /usr/include/CL/
 #- sudo apt-get install -qq libgtest-dev
 
 before_script:
  - mkdir build && cd build
- - cmake .. -DUSE_CUDA=OFF -DUSE_CUDNN=OFF -DUSE_PYTHON=OFF -DBUILD_OPENCL_TESTS=OFF
+ - cmake .. -DUSE_CUDA=OFF -DUSE_CUDNN=OFF -DUSE_PYTHON=OFF
 
 script:
  - make

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3f6bea2..611cee4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,9 +43,8 @@ OPTION(USE_CUDNN "Use Cudnn libs" ON)
 OPTION(USE_OPENCV "Use opencv" OFF)
 OPTION(USE_LMDB "Use LMDB libs" OFF)
 OPTION(USE_PYTHON "Generate py wrappers" OFF)
-#OPTION(USE_OPENCL "Use OpenCL" OFF)
+OPTION(USE_OPENCL "Use OpenCL" OFF)
 OPTION(ENABLE_DIST "enable distributed training" OFF)
-#OPTION(BUILD_OPENCL_TESTS "Build OpenCL tests" OFF)
 
 INCLUDE("cmake/Dependencies.cmake")
 INCLUDE("cmake/Utils.cmake")

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/LICENSE
----------------------------------------------------------------------
diff --git a/LICENSE b/LICENSE
index f658def..4f9d1e7 100644
--- a/LICENSE
+++ b/LICENSE
@@ -303,7 +303,9 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 ===========================================================================
 SINGA bundles the following under BSD 2-clause license:
-include/singa/utils/cuda_utils.h, src/core/tensor/distribution.cl
+include/singa/utils/cuda_utils.h
+src/core/tensor/distribution.cl
+cmake/ThirdParty/FindViennaCL.cmake
 
 COPYRIGHT
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/cmake/Dependencies.cmake
----------------------------------------------------------------------
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index b5fda6d..aa2212b 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -50,23 +50,26 @@ IF(USE_CBLAS)
     FIND_PACKAGE(CBLAS REQUIRED)
     INCLUDE_DIRECTORIES(SYSTEM ${CBLAS_INCLUDE_DIR})
     LIST(APPEND SINGA_LINKER_LIBS ${CBLAS_LIBRARIES})
-    MESSAGE(STATUS "FOUND cblas at ${CBLAS_LIBRARIES}")
+    MESSAGE(STATUS "Found cblas at ${CBLAS_LIBRARIES}")
 ENDIF()
 
 IF(USE_OPENCL)
     FIND_PACKAGE(OpenCL REQUIRED)
-    IF(NOT OPENCL_FOUND)
+    IF(NOT OpenCL_FOUND)
         MESSAGE(SEND_ERROR "OpenCL was requested, but not found.")
     ELSE()
-        INCLUDE_DIRECTORIES(SYSTEM ${OpenCL_INCPATH})
-        LIST(APPEND SINGA_LINKER_LIBS ${OPENCL_LIBRARIES})
-        MESSAGE(STATUS "Found OpenCL at ${OPENCL_INCLUDE_DIRS}")
-        IF(NOT OPENCL_HAS_CPP_BINDINGS)
-            MESSAGE(SEND_ERROR "OpenCL C++ bindings cl2.hpp was not found.")
-        ELSE()
-            MESSAGE(STATUS "Found OpenCL C++ bindings.")
-        ENDIF()
-    ENDIF()
+    	MESSAGE(STATUS "Found OpenCL headers at ${OpenCL_INCLUDE_DIRS}")
+    	INCLUDE_DIRECTORIES(SYSTEM ${OpenCL_INCLUDE_DIR})
+    	LIST(APPEND SINGA_LINKER_LIBS ${OpenCL_LIBRARIES})
+	    FIND_PACKAGE(ViennaCL REQUIRED)
+		IF(NOT ViennaCL_FOUND)
+			MESSAGE(SEND_ERROR "ViennaCL is required if OpenCL is enabled.")
+		ELSE()
+			MESSAGE(STATUS "Found ViennaCL headers at ${ViennaCL_INCLUDE_DIR}")
+			INCLUDE_DIRECTORIES(SYSTEM ${ViennaCL_INCLUDE_DIR})
+			LIST(APPEND SINGA_LINKER_LIBS ${ViennaCL_LIBRARIES})
+		ENDIF()
+	ENDIF()
 ENDIF()
 
 FIND_PACKAGE(Glog REQUIRED)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/cmake/Thirdparty/FindViennaCL.cmake
----------------------------------------------------------------------
diff --git a/cmake/Thirdparty/FindViennaCL.cmake b/cmake/Thirdparty/FindViennaCL.cmake
new file mode 100644
index 0000000..263c80f
--- /dev/null
+++ b/cmake/Thirdparty/FindViennaCL.cmake
@@ -0,0 +1,45 @@
+# This file is retrieved from caffe/cmake/Modules/FindViennaCL.cmake.
+
+SET(ViennaCL_WITH_OPENCL TRUE)
+
+SET(VIENNACL_INCLUDE_SEARCH_PATHS
+  ..
+  /usr/include
+  /usr/local/include
+  /opt/ViennaCL/include
+  $ENV{VIENNACL_HOME}
+  $ENV{VIENNACL_HOME}/include
+)
+
+FIND_PATH(ViennaCL_INCLUDE_DIR NAMES viennacl/forwards.h PATHS ${VIENNACL_INCLUDE_SEARCH_PATHS})
+
+SET(ViennaCL_FOUND ON)
+
+#    Check include files
+IF(NOT ViennaCL_INCLUDE_DIR)
+    SET(ViennaCL_FOUND OFF)
+    MESSAGE(STATUS "Could not find ViennaCL include. Turning ViennaCL_FOUND off")
+ENDIF()
+
+IF (ViennaCL_FOUND)
+  IF (NOT ViennaCL_FIND_QUIETLY)
+    MESSAGE(STATUS "Found ViennaCL include: ${ViennaCL_INCLUDE_DIR}")
+  ENDIF (NOT ViennaCL_FIND_QUIETLY)
+ELSE (ViennaCL_FOUND)
+  IF (ViennaCL_FIND_REQUIRED)
+    MESSAGE(FATAL_ERROR "Could not find ViennaCL")
+  ENDIF (ViennaCL_FIND_REQUIRED)
+ENDIF (ViennaCL_FOUND)
+
+IF(ViennaCL_WITH_OPENCL)
+  find_package(OpenCL REQUIRED)
+ENDIF(ViennaCL_WITH_OPENCL)
+
+set(ViennaCL_INCLUDE_DIRS ${ViennaCL_INCLUDE_DIR} ${OPENCL_INCLUDE_DIRS})
+set(ViennaCL_LIBRARIES ${OPENCL_LIBRARIES})
+
+MARK_AS_ADVANCED(
+  ViennaCL_INCLUDE_DIR
+  ViennaCL_INCLUDE_DIRS
+  ViennaCL_LIBRARIES
+)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/include/singa/core/common.h
----------------------------------------------------------------------
diff --git a/include/singa/core/common.h b/include/singa/core/common.h
index 53a9726..dc552c1 100644
--- a/include/singa/core/common.h
+++ b/include/singa/core/common.h
@@ -36,10 +36,7 @@
 
 
 #ifdef USE_OPENCL
-#define CL_HPP_MINIMUM_OPENCL_VERSION 120
-#define CL_HPP_TARGET_OPENCL_VERSION 120
-#include <CL/cl2.hpp>
-#include <unordered_map>
+#include "singa/utils/opencl_utils.h"
 #endif  // USE_OPENCL
 
 using std::atomic;
@@ -110,9 +107,8 @@ typedef struct _Context {
 #endif // USE_CUDA
 
 #ifdef USE_OPENCL
-  std::shared_ptr<std::unordered_map<std::string, cl::Kernel>> kernels;
-  cl::CommandQueue ocl_cmdq;
-  cl::Context ocl_ctx;
+  // This stores the context ID of the OpenCL context controlled by ViennaCL.
+  long vcl_ctx_id;
 #endif
 
 } Context;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/include/singa/core/device.h
----------------------------------------------------------------------
diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index 810d41f..62fa250 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -36,12 +36,6 @@
 #endif // USE_CUDA
 
 #ifdef USE_OPENCL
-// http://github.khronos.org/OpenCL-CLHPP/
-// cl2.hpp includes cl.h, do not re-include.
-#define CL_HPP_MINIMUM_OPENCL_VERSION 120
-#define CL_HPP_TARGET_OPENCL_VERSION 120
-#include <unordered_map>
-#include <CL/cl2.hpp>
 #include "singa/utils/opencl_utils.h"
 #endif // USE_OPENCL
 
@@ -217,50 +211,26 @@ public:
   OpenclDevice(int id = 0, int num_executors = 1);
   ~OpenclDevice();
 
-  /// Get the specified kernel.
-  cl::Kernel GetKernel(const std::string& kname, cl_int* status = nullptr);
-
-  /// Get the command queue associated with this device.
-  cl::CommandQueue GetCmdQ() { return cmdq; }
-
-  /// Prints information about all Devices in each Platform.
-  void PrintAllDeviceInfo();
-
-  /// Prints status about CL source code builds.
-  void PrintClBuildInfo(cl::Program &p);
-
 // Overridden, inherited methods
   void SetRandSeed(unsigned seed) override;
 
   void CopyDataToFrom(Block* dst, Block* src, size_t nBytes,
                       CopyDirection direction, int dst_offset = 0,
                       int src_offset = 0);
-/*
-  void CopyDataFromHostPtr(Block* dst, const void* src, size_t nBytes = 0,
-                           size_t dst_offset = 0) override;*/
 
 protected:
   /// The OpenCL device that this object represents.
   /// Each OpenclDevice contains exactly one cl::Device for the lifetime of the
   /// object.
-  cl::Device this_device;
+  viennacl::ocl::device this_device;
 
   /// Each OpenclDevice has one OpenCL context. It is created along with the
   /// creation of this object.
-  cl::Context ocl_ctx;
-
-  /// The CommandQueue that is associated with this device.
-  /// Since each OpenclDevice contains only one cl::Device and one cl::Context,
-  /// it naturally also contains one cl::CommandQueue that is associated
-  /// with said Device and Context.
-  cl::CommandQueue cmdq;
-
-  /// A list of kernels that has been compiled on this device.
-  std::shared_ptr<std::unordered_map<std::string, cl::Kernel>> kernels;
+  viennacl::ocl::context vcl_ctx;
 
   /// Searches the given paths for all .cl files and builds
   /// OpenCL programs, then stores them in the Kernels map.
-  void BuildPrograms(const std::string &kdir = cl_src_path);
+  void BuildPrograms(const std::string &kdir);
 
 // Overridden, inherited methods.
 
@@ -280,21 +250,6 @@ protected:
 
 private:
 
-  /// Copies a data block from host to device.
-  /// src: a pointer to an array of data.
-  /// dst: a pointer to a cl::Buffer object.
-  void WriteToDevice(cl::Buffer* dst, const void* src, const size_t size);
-
-  /// Reads a data block from device to host.
-  /// src: a pointer to an cl::Buffer object.
-  /// dst: a pointer to an malloc'ed empty array.
-  void ReadFromDevice(void* dst, const cl::Buffer* src, const size_t size);
-
-  /// Duplicates a block of data on the device.
-  /// src: a pointer to the original cl::Buffer object.
-  /// dst: a pointer to the new cl::Buffer object to copy the data into.
-  void CopyDeviceBuffer(cl::Buffer* dst, const cl::Buffer* src, const size_t size);
-
   static const std::string cl_src_path;
 };
 #endif  // USE_OPENCL
@@ -368,11 +323,6 @@ public:
   /// except the context initialization.
   static bool CheckDevice(const int device_id);
 
-
-private:
-#ifdef USE_OPENCL
-  cl::Platform clPlatform;
-#endif  // USE_OPENCL
 };
 
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/include/singa/utils/opencl_utils.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/opencl_utils.h b/include/singa/utils/opencl_utils.h
index 664a9e1..8c05643 100644
--- a/include/singa/utils/opencl_utils.h
+++ b/include/singa/utils/opencl_utils.h
@@ -24,121 +24,47 @@
 
 #ifdef USE_OPENCL
 
-#include <iostream>
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
 
-// http://github.khronos.org/OpenCL-CLHPP/
-// cl2.hpp includes cl.h, do not re-include.
 #define CL_HPP_MINIMUM_OPENCL_VERSION 120
 #define CL_HPP_TARGET_OPENCL_VERSION 120
-#include <CL/cl2.hpp>
 
-#define CL_BREAK_ON_FAILURE if (status != CL_SUCCESS) return;
-
-
-inline const char* clGetBuildInfoString(const cl_build_status status) {
-  switch (status) {
-	case CL_BUILD_NONE: return "CL_BUILD_NONE";
-	case CL_BUILD_ERROR: return "CL_BUILD_ERROR";
-	case CL_BUILD_SUCCESS: return "CL_BUILD_SUCCESS";
-	case CL_BUILD_IN_PROGRESS: return "CL_BUILD_IN_PROGRESS";
-	default: return "";
-  }
-}
-
-
-inline const char* clGetErrorString(const cl_int status) {
-
-  switch(status) {
-
-  // Run-time and JIT compiler errors
-  case 0: return "CL_SUCCESS";
-  case -1: return "CL_DEVICE_NOT_FOUND";
-  case -2: return "CL_DEVICE_NOT_AVAILABLE";
-  case -3: return "CL_COMPILER_NOT_AVAILABLE";
-  case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
-  case -5: return "CL_OUT_OF_RESOURCES";
-  case -6: return "CL_OUT_OF_HOST_MEMORY";
-  case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE";
-  case -8: return "CL_MEM_COPY_OVERLAP";
-  case -9: return "CL_IMAGE_FORMAT_MISMATCH";
-  case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
-  case -11: return "CL_BUILD_PROGRAM_FAILURE";
-  case -12: return "CL_MAP_FAILURE";
-  case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
-  case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
-  case -15: return "CL_COMPILE_PROGRAM_FAILURE";
-  case -16: return "CL_LINKER_NOT_AVAILABLE";
-  case -17: return "CL_LINK_PROGRAM_FAILURE";
-  case -18: return "CL_DEVICE_PARTITION_FAILED";
-  case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";
-
-  // Compile-time errors
-  case -30: return "CL_INVALID_VALUE";
-  case -31: return "CL_INVALID_DEVICE_TYPE";
-  case -32: return "CL_INVALID_PLATFORM";
-  case -33: return "CL_INVALID_DEVICE";
-  case -34: return "CL_INVALID_CONTEXT";
-  case -35: return "CL_INVALID_QUEUE_PROPERTIES";
-  case -36: return "CL_INVALID_COMMAND_QUEUE";
-  case -37: return "CL_INVALID_HOST_PTR";
-  case -38: return "CL_INVALID_MEM_OBJECT";
-  case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
-  case -40: return "CL_INVALID_IMAGE_SIZE";
-  case -41: return "CL_INVALID_SAMPLER";
-  case -42: return "CL_INVALID_BINARY";
-  case -43: return "CL_INVALID_BUILD_OPTIONS";
-  case -44: return "CL_INVALID_PROGRAM";
-  case -45: return "CL_INVALID_PROGRAM_EXECUTABLE";
-  case -46: return "CL_INVALID_KERNEL_NAME";
-  case -47: return "CL_INVALID_KERNEL_DEFINITION";
-  case -48: return "CL_INVALID_KERNEL";
-  case -49: return "CL_INVALID_ARG_INDEX";
-  case -50: return "CL_INVALID_ARG_VALUE";
-  case -51: return "CL_INVALID_ARG_SIZE";
-  case -52: return "CL_INVALID_KERNEL_ARGS";
-  case -53: return "CL_INVALID_WORK_DIMENSION";
-  case -54: return "CL_INVALID_WORK_GROUP_SIZE";
-  case -55: return "CL_INVALID_WORK_ITEM_SIZE";
-  case -56: return "CL_INVALID_GLOBAL_OFFSET";
-  case -57: return "CL_INVALID_EVENT_WAIT_LIST";
-  case -58: return "CL_INVALID_EVENT";
-  case -59: return "CL_INVALID_OPERATION";
-  case -60: return "CL_INVALID_GL_OBJECT";
-  case -61: return "CL_INVALID_BUFFER_SIZE";
-  case -62: return "CL_INVALID_MIP_LEVEL";
-  case -63: return "CL_INVALID_GLOBAL_WORK_SIZE";
-  case -64: return "CL_INVALID_PROPERTY";
-  case -65: return "CL_INVALID_IMAGE_DESCRIPTOR";
-  case -66: return "CL_INVALID_COMPILER_OPTIONS";
-  case -67: return "CL_INVALID_LINKER_OPTIONS";
-  case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT";
-
-  // Extension errors
-  case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR";
-  case -1001: return "CL_PLATFORM_NOT_FOUND_KHR";
-  case -1002: return "CL_INVALID_D3D10_DEVICE_KHR";
-  case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR";
-  case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR";
-  case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR";
-
-  default: return "Unknown OpenCL status";
+#ifndef VIENNACL_WITH_OPENCL
+  #define VIENNACL_WITH_OPENCL
+#endif
+
+#ifndef __APPLE__
+  #include "CL/cl.h"
+#else
+  #include "OpenCL/cl.h"
+#endif
+
+#include <viennacl/backend/opencl.hpp>
+
+#include <viennacl/ocl/device.hpp>
+#include <viennacl/ocl/platform.hpp>
+#include <viennacl/ocl/backend.hpp>
+#include <viennacl/ocl/device_utils.hpp>
+#include <viennacl/ocl/utils.hpp>
+#include <viennacl/ocl/program.hpp>
+#include <viennacl/ocl/kernel.hpp>
+
+
+inline viennacl::ocl::handle<cl_mem>
+WrapHandle(cl_mem in, viennacl::ocl::context *ctx) {
+  if (in != nullptr) {
+    viennacl::ocl::handle<cl_mem> memhandle(in, *ctx);
+    memhandle.inc();
+    return memhandle;
+  } else {
+    cl_int err;
+    cl_mem dummy = clCreateBuffer(ctx->handle().get(), CL_MEM_READ_WRITE, 0,
+                                  nullptr, &err);
+    viennacl::ocl::handle<cl_mem> memhandle(dummy, *ctx);
+    return memhandle;
   }
 }
 
-
-/// Special function used to perform error checking and logging.
-inline bool OCL_CHECK(const cl_int status, const char* what) {
-  if (status == CL_SUCCESS) return true; // Nothing wrong.
-  LOG(ERROR) << status << ": " << clGetErrorString(status) << " " << what << std::endl;
-  return false;
-}
-
-/// Prints information about the specified Platform.
-void PrintPlatformInfo(const cl::Platform &p);
-
-/// Prints information about the specified Device.
-void PrintDeviceInfo(const cl::Device &dev);
-
 #endif // USE_OPENCL
 
 #endif // SINGA_UTILS_OPENCL_UTILS_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/src/core/device/opencl_device.cc
----------------------------------------------------------------------
diff --git a/src/core/device/opencl_device.cc b/src/core/device/opencl_device.cc
index b941cd2..6b371c4 100644
--- a/src/core/device/opencl_device.cc
+++ b/src/core/device/opencl_device.cc
@@ -23,89 +23,41 @@
 
 #include "singa/core/device.h"
 #include "singa/utils/tinydir.h"
+#include "singa/utils/opencl_utils.h"
 
 #ifdef USE_OPENCL
 
-using std::string;
+using namespace viennacl;
+using namespace viennacl::backend::opencl;
 
 namespace singa {
 
-const string OpenclDevice::cl_src_path = "../src/core/tensor";
+const std::string OpenclDevice::cl_src_path = "../src/core/tensor";
 
 OpenclDevice::OpenclDevice(int id, int num_executors)
 	: Device(id, num_executors) {
+  CHECK_GE(id, 0);
   lang_ = kOpencl;
-  this->kernels = std::make_shared<std::unordered_map<string, cl::Kernel>>();
-
-  // Create the OpenCL Device, Context, and CommandQueue.
-  /// TODO: This merely chooses the first device on the first platform.
-  cl_int status = CL_SUCCESS;
-
-  std::vector<cl::Platform> platforms;
-  status = cl::Platform::get(&platforms);
-  OCL_CHECK(status, "Failed to find any OpenCL platforms!");
-
-  std::vector<cl::Device> devices;
-  status = platforms[0].getDevices(CL_DEVICE_TYPE_ALL, &devices);
-  OCL_CHECK(status, "Failed to get list of devices from platform!");
-
-  this->this_device = cl::Device(devices[0]);
-  this->ocl_ctx = cl::Context(this_device, nullptr, nullptr, nullptr, &status);
-  OCL_CHECK(status, "Failed to create context!");
-
-  this->cmdq = cl::CommandQueue(ocl_ctx, this_device, CL_QUEUE_PROFILING_ENABLE, &status);
-  OCL_CHECK(status, "Failed to create a command queue!");
-
-  BuildPrograms();
-
-  ctx_.kernels = kernels;
-  ctx_.ocl_cmdq = cmdq;
-  ctx_.ocl_ctx = ocl_ctx;
+  
+  ocl::current_context().build_options("-cl-std=CL1.2");
+  
+  ctx_.vcl_ctx_id = 0;
+  this->this_device = ocl::current_device();
+  
+  BuildPrograms(cl_src_path);
 }
 
 
 OpenclDevice::~OpenclDevice() {
 
   // Flush and finish the command queue.
+  auto cmdq = ocl::current_context().get_queue();
+  
   cmdq.flush();
   cmdq.finish();
 }
 
 
-cl::Kernel OpenclDevice::GetKernel(const std::string& kname, cl_int* status) {
-  if (!status) *status = CL_SUCCESS;
-  if (kernels->find(kname) == kernels->end()) {
-    // TODO: Not found
-    LOG(ERROR) << "Error: Kernel " << kname << " could not be found!";
-    if (!status) *status = CL_INVALID_KERNEL;
-  }
-  return kernels->at(kname);
-}
-
-/*
-void OpenclDevice::PrintAllDeviceInfo() {
-  cl_int status = CL_SUCCESS;
-
-  for (auto dev : devices) {
-    PrintDeviceInfo(d);
-  }
-}
-*/
-
-
-void OpenclDevice::PrintClBuildInfo(cl::Program &p) {
-  cl_int status = CL_SUCCESS;
-
-  auto buildStatus = p.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(&status);
-  for (auto pair : buildStatus)
-	std::cout << clGetBuildInfoString(pair.second) << std::endl;
-
-  auto buildLog = p.getBuildInfo<CL_PROGRAM_BUILD_LOG>(&status);
-  for (auto pair : buildLog)
-	std::cout << pair.second << std::endl;
-}
-
-
 void OpenclDevice::SetRandSeed(unsigned seed) { seed = seed; }
 
 
@@ -113,19 +65,33 @@ void OpenclDevice::CopyDataToFrom(Block* dst, Block* src, size_t nBytes,
                                   CopyDirection direction, int dst_offset, int src_offset) {
   // Pointers must be valid.
   if (!dst || !src) return;
+  
+  auto ocl_ctx = viennacl::ocl::get_context(ctx_.vcl_ctx_id);
 
-  CopyToFrom(dst->mutable_data(), src->data(), nBytes, direction);
+  switch(direction) {
+  case kHostToDevice: {
+    auto dst_handle = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), &ocl_ctx);
+    memory_write(dst_handle, dst_offset, nBytes, src->data());
+    return;
+  }
+  case kDeviceToHost: {
+    auto src_handle = WrapHandle(static_cast<cl_mem>(src->mutable_data()), &ocl_ctx);
+    memory_read(src_handle, src_offset, nBytes, dst->mutable_data());
+    return;
+  }
+  case kDeviceToDevice: {
+    auto src_handle = WrapHandle(static_cast<cl_mem>(src->mutable_data()), &ocl_ctx);
+    auto dst_handle = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), &ocl_ctx);
+    memory_copy(src_handle, dst_handle, src_offset, dst_offset, nBytes);
+    return;
+  }
+  default:
+    return;
+  }
 }
 
-/*
-void OpenclDevice::CopyDataFromHostPtr(Block* dst, const void* src, size_t nBytes, size_t dst_offset) {
-  CopyToFrom(dst->mutable_data(), src, 4, kHostToDevice);
-}
-*/
 
 void OpenclDevice::BuildPrograms(const std::string &kdir) {
-  cl_int status = CL_SUCCESS;
-
   tinydir_dir dir;
   tinydir_open(&dir, kdir.c_str());
 
@@ -137,63 +103,47 @@ void OpenclDevice::BuildPrograms(const std::string &kdir) {
 	  tinydir_next(&dir);
 	  continue;
 	}
-
+	
 	std::ifstream clFile(file.path, std::ios_base::binary);
 	std::stringstream buffer;
 	buffer << clFile.rdbuf();
 	std::string clSrc(buffer.str());
 
-	cl::Program program(this->ocl_ctx, clSrc, false, &status);
-	OCL_CHECK(status, "Program creation failed.");
-	status = program.build({this_device}, "-cl-std=CL1.2");
-	if (status == CL_SUCCESS) {
-	  std::vector<cl::Kernel> built_kernels;
-	  status = program.createKernels(&built_kernels);
-	  OCL_CHECK(status, "Failed to create kernels in built program.");
-
-	  for (auto k : built_kernels) {
-		std::string name = k.getInfo<CL_KERNEL_FUNCTION_NAME>(&status);
-		this->kernels->insert(std::make_pair(name, k));
-	  }
-	} else {
-	  OCL_CHECK(status, "Build failed on source path");
-	  LOG(ERROR) << file.path << std::endl;
-	  PrintClBuildInfo(program);
-	}
+	std::string name(file.name);
+    ocl::current_context().add_program(clSrc, name);
 
 	tinydir_next(&dir);
   }
 }
 
-// Device IO functions.
-// TODO:
-// Research - MapBuffers can improve performance when the device uses shared memory
-// but is more complex to understand. http://stackoverflow.com/questions/22057692/whats-the-difference-between-clenqueuemapbuffer-and-clenqueuewritebuffer
-// Intel graphics (and possibly AMD APUs) should use MapBuffers?
-// https://software.intel.com/en-us/articles/getting-the-most-from-opencl-12-how-to-increase-performance-by-minimizing-buffer-copies-on-intel-processor-graphics
-
 
 void OpenclDevice::DoExec(function<void(Context*)>&& fn, int executor) {
   fn(&ctx_);
 }
 
-// NOTE: ASSUMES dst AND/OR src POINTERS CAN BE CAST TO cl::Buffer POINTERS!
+
 void OpenclDevice::CopyToFrom(void* dst, const void* src, size_t nBytes,
                   CopyDirection direction, Context* ctx) {
   // Pointers must be valid.
   if (!dst || !src) return;
+  
+  auto ocl_ctx = viennacl::ocl::get_context(ctx->vcl_ctx_id);
 
   switch(direction) {
   case kHostToDevice: {
-    WriteToDevice(static_cast<cl::Buffer*>(dst), src, nBytes);
+    auto dst_handle = WrapHandle(static_cast<cl_mem>(dst), &ocl_ctx);
+    memory_write(dst_handle, 0, nBytes, src);
     return;
   }
   case kDeviceToHost: {
-    ReadFromDevice(dst, static_cast<const cl::Buffer*>(src), nBytes);
+    auto src_handle = WrapHandle((const cl_mem)src, &ocl_ctx);
+    memory_read(src_handle, 0, nBytes, dst);
     return;
   }
   case kDeviceToDevice: {
-    CopyDeviceBuffer(static_cast<cl::Buffer*>(dst), static_cast<const cl::Buffer*>(src), nBytes);
+    auto src_handle = WrapHandle((const cl_mem)src, &ocl_ctx);
+    auto dst_handle = WrapHandle(static_cast<cl_mem>(dst), &ocl_ctx);
+    memory_copy(src_handle, dst_handle, 0, 0, nBytes);
     return;
   }
   default:
@@ -203,10 +153,7 @@ void OpenclDevice::CopyToFrom(void* dst, const void* src, size_t nBytes,
 
 
 void* OpenclDevice::Malloc(int size) {
-  cl_int status = CL_SUCCESS;
-
-  cl::Buffer* buffer = new cl::Buffer(ocl_ctx, CL_MEM_READ_WRITE, size, nullptr, &status);
-  OCL_CHECK(status, "Unable to allocate memory in OpenCL device.");
+  cl_mem buffer = memory_create(ocl::current_context(), size, nullptr);
 
   return static_cast<void*>(buffer);
 }
@@ -214,33 +161,8 @@ void* OpenclDevice::Malloc(int size) {
 
 void OpenclDevice::Free(void* p) {
   if (!p) return;
-  cl::Buffer* buffer = static_cast<cl::Buffer*>(p);
-  delete buffer;
-}
-
-
-void OpenclDevice::WriteToDevice(cl::Buffer* dst, const void* src, const size_t size) {
-  cl_int status = CL_SUCCESS;
-
-  status = cmdq.enqueueWriteBuffer(*dst, CL_TRUE, 0, size, src);
-  OCL_CHECK(status, "Unable to write data to OpenCL device.");
-}
-
-
-void OpenclDevice::ReadFromDevice(void* dst, const cl::Buffer* src, const size_t size) {
-  cl_int status = CL_SUCCESS;
-
-  status = cmdq.enqueueReadBuffer(*src, CL_TRUE, 0, size, dst);
-  OCL_CHECK(status, "Unable to read data from OpenCL device.");
-}
-
-
-// dst: cl::Buffer pointer    src: cl::Buffer pointer
-void OpenclDevice::CopyDeviceBuffer(cl::Buffer* dst, const cl::Buffer* src, const size_t size) {
-  cl_int status = CL_SUCCESS;
-
-  status = cmdq.enqueueCopyBuffer(*src, *dst, 0, 0, size);
-  OCL_CHECK(status, "Unable to copy buffer in OpenCL device.");
+  cl_mem buffer = static_cast<cl_mem>(p);
+  clReleaseMemObject(buffer);
 }
 
 } // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/src/core/tensor/tensor_math_opencl.cl
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_opencl.cl b/src/core/tensor/tensor_math_opencl.cl
index f9cf96e..7b89970 100644
--- a/src/core/tensor/tensor_math_opencl.cl
+++ b/src/core/tensor/tensor_math_opencl.cl
@@ -24,7 +24,7 @@
 // This reduction code is serial reduction modified from AMD's example.
 // http://developer.amd.com/resources/documentation-articles/articles-whitepapers/opencl-optimization-case-study-simple-reductions/
 __kernel 
-void clkernel_abs(const int num, __global const float* in, __global float* out) {
+void clkernel_fabs(const int num, __global const float* in, __global float* out) {
   const int i = get_global_id(0);
   if (i >= num) return;
   out[i] = fabs(in[i]);
@@ -462,7 +462,7 @@ void clkernel_crossentropy(const uint batchsize, const uint dim,
 
   int truth_idx = t[gidx];
   if (truth_idx <= 0) return;
-  float prob_of_truth = p[gidx + truth_idx];
+  float prob_of_truth = p[gidx * dim + truth_idx];
   loss[gidx] = -log(fmax(prob_of_truth, -FLT_MIN));
 }
 
@@ -480,6 +480,21 @@ void clkernel_softmaxentropy(const uint batchsize, const uint dim,
 }
 
 
+__kernel
+void clkernel_rowmax(const uint nrow, const uint ncol,
+                     __global const float* in, __global float* out) {
+  const uint row_id = get_global_id(0);
+  if (row_id >= nrow) return;
+  
+  float row_max_val = -FLT_MAX;
+  for (uint i = 0; i < ncol; i++) {
+    row_max_val = fmax(row_max_val, in[row_id * ncol + i]);
+  }
+  
+  out[row_id] = row_max_val;
+}
+
+
 // **************************************
 // Matrix functions
 // **************************************

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/src/core/tensor/tensor_math_opencl.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_opencl.h b/src/core/tensor/tensor_math_opencl.h
index c289a56..c387031 100644
--- a/src/core/tensor/tensor_math_opencl.h
+++ b/src/core/tensor/tensor_math_opencl.h
@@ -19,17 +19,27 @@
 #ifndef  SINGA_CORE_TENSOR_TENSOR_MATH_OPENCL_H_
 
 #ifdef USE_OPENCL
-#include <limits>
 
-#include "singa/utils/opencl_utils.h"
 #include "tensor_math.h"
+#include "singa/utils/opencl_utils.h"
 
-namespace singa {
+#include <viennacl/scalar.hpp>
+#include <viennacl/vector.hpp>
+#include <viennacl/matrix.hpp>
+
+#include <viennacl/linalg/inner_prod.hpp>
+#include <viennacl/linalg/norm_2.hpp>
+#include <viennacl/linalg/sum.hpp>
+#include <viennacl/linalg/scalar_operations.hpp>
+#include <viennacl/linalg/vector_operations.hpp>
+#include <viennacl/linalg/matrix_operations.hpp>
 
-// Some forward declarations of utility functions that only exist here.
-void Transpose(const size_t nrow, const size_t ncol, cl::Buffer& in, cl::Buffer& out, Context* ctx);
-void DiagVec_Left(const size_t size, cl::Buffer& in, cl::Buffer& out, Context* ctx);
-void DiagVec_Right(const size_t size, cl::Buffer& in, cl::Buffer& out, Context* ctx);
+#include <viennacl/ocl/kernel.hpp>
+
+using viennacl::ocl::get_context;
+using viennacl::ocl::enqueue;
+
+namespace singa {
 
 // **************************************
 // Element-wise functions
@@ -37,436 +47,250 @@ void DiagVec_Right(const size_t size, cl::Buffer& in, cl::Buffer& out, Context*
 
 template<>
 void Abs<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_abs";
-  auto kernel = ctx->kernels->at(kname);
-
-  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
-  kernel.setArg(0, (cl_int)num);
-  kernel.setArg(1, inbuf);
-  kernel.setArg(2, outbuf);
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  // out[i] = |in[i]|, via the custom fabs kernel.
+  auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+  auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_fabs");
+  
+  // Wrap the existing cl_mem buffers as ViennaCL vectors (views, no copy).
+  viennacl::vector<float> v_in((const cl_mem)in->data(), num);
+  viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
+  
+  // No pre-copy of v_in into v_out is needed: the kernel writes every
+  // element of v_out itself, so the former `v_out = v_in;` was a wasted
+  // O(num) device-to-device transfer (none of the sibling launches,
+  // e.g. Clamp/ReLU, perform one).
+  enqueue(kernel((cl_int)num, v_in, v_out));
 }
 
 
 template<>
 void Add<float, lang::Opencl>(const size_t num, const Block* in, const float x, Block* out, Context* ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_add_scalar";
-  auto kernel = ctx->kernels->at(kname);
-
-  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
-  kernel.setArg(0, (cl_int)num);
-  kernel.setArg(1, x);
-  kernel.setArg(2, inbuf);
-  kernel.setArg(3, outbuf);
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  // out[i] = in[i] + x; the scalar is broadcast as a constant device vector.
+  auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+  
+  viennacl::vector<float> x_in = viennacl::scalar_vector<float>(num, x, ocl_ctx);
+  viennacl::vector<float> v_in((const cl_mem)in->data(), num);
+  viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
+  
+  v_out = v_in + x_in;
 }
 
 
 template<>
 void Add<float, lang::Opencl>(const size_t num, const Block* in1, const Block* in2, Block* out, Context* ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_add";
-  auto kernel = ctx->kernels->at(kname);
-  
-  cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data()));
-  cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data()));
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
-  kernel.setArg(0, (cl_int)num);
-  kernel.setArg(1, in1buf);
-  kernel.setArg(2, in2buf);
-  kernel.setArg(3, outbuf);
+  // out[i] = in1[i] + in2[i], fused into one kernel by ViennaCL's
+  // expression templates.
+  viennacl::vector<float> v_in1((const cl_mem)in1->data(), num);
+  viennacl::vector<float> v_in2((const cl_mem)in2->data(), num);
+  viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);

-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  v_out = v_in1 + v_in2;
 }
 
 
 template<>
-void Clamp<float, lang::Opencl>(const size_t num, const float low, const float high, const Block* in, Block* out, Context* ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_clamp";
-  auto kernel = ctx->kernels->at(kname);
-
-  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
-  kernel.setArg(0, (cl_int)num);
-  kernel.setArg(1, low);
-  kernel.setArg(2, high);
-  kernel.setArg(3, inbuf);
-  kernel.setArg(4, outbuf);
+// out[i] = min(max(in[i], low), high), via the custom clamp kernel.
+void Clamp<float, lang::Opencl>(const size_t num, const float low, const float high,
+                                const Block* in, Block* out, Context* ctx) {
+  auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+  auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_clamp");
+  
+  viennacl::vector<float> v_in((const cl_mem)in->data(), num);
+  viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);

-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  enqueue(kernel((cl_int)num, low, high, v_in, v_out));
 }
 
 
 template<>
 void Div<float, lang::Opencl>(const size_t num, const Block* in, const float x, Block* out, Context* ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_divide_scalar_matx";
-  auto kernel = ctx->kernels->at(kname);
+  // out[i] = in[i] / x; the scalar divisor is expanded to a constant vector.
+  auto ocl_ctx = get_context(ctx->vcl_ctx_id);
   
-  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+  viennacl::vector<float> x_in = viennacl::scalar_vector<float>(num, x, ocl_ctx);
+  viennacl::vector<float> v_in((const cl_mem)in->data(), num);
+  viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);

-  kernel.setArg(0, (cl_int)num);
-  kernel.setArg(1, inbuf);
-  kernel.setArg(2, x);
-  kernel.setArg(3, outbuf);
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  v_out = viennacl::linalg::element_div(v_in, x_in);
 }
 
 
 template<>
 void Div<float, lang::Opencl>(const size_t num, const float x, const Block* in, Block* out, Context* ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_divide_scalar_xmat";
-  auto kernel = ctx->kernels->at(kname);
+  // out[i] = x / in[i] (scalar numerator broadcast across the vector).
+  auto ocl_ctx = get_context(ctx->vcl_ctx_id);
   
-  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
-  kernel.setArg(0, (cl_int)num);
-  kernel.setArg(1, x);
-  kernel.setArg(2, inbuf);
-  kernel.setArg(3, outbuf);
+  viennacl::vector<float> x_in = viennacl::scalar_vector<float>(num, x, ocl_ctx);
+  viennacl::vector<float> v_in((const cl_mem)in->data(), num);
+  viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);

-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  v_out = viennacl::linalg::element_div(x_in, v_in);
 }
 
 
 template<>
 void Div<float, lang::Opencl>(const size_t num, const Block* in1, const Block* in2, Block* out, Context* ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_divide";
-  auto kernel = ctx->kernels->at(kname);
-  
-  cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data()));
-  cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data()));
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
-  kernel.setArg(0, (cl_int)num);
-  kernel.setArg(1, in1buf);
-  kernel.setArg(2, in2buf);
-  kernel.setArg(3, outbuf);
+  // out[i] = in1[i] / in2[i].
+  viennacl::vector<float> v_in1((const cl_mem)in1->data(), num);
+  viennacl::vector<float> v_in2((const cl_mem)in2->data(), num);
+  viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);

-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  v_out = viennacl::linalg::element_div(v_in1, v_in2);
 }
 
 
 template<>
 void EltwiseMult<float, lang::Opencl>(const size_t num, const Block* in, const float x, Block* out, Context* ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_eltmult_scalar";
-  auto kernel = ctx->kernels->at(kname);
-
-  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
-  kernel.setArg(0, (cl_int)num);
-  kernel.setArg(1, x);
-  kernel.setArg(2, inbuf);
-  kernel.setArg(3, outbuf);
+  // out[i] = in[i] * x; the scalar is broadcast as a constant device vector.
+  auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+  
+  viennacl::vector<float> x_in = viennacl::scalar_vector<float>(num, x, ocl_ctx);
+  viennacl::vector<float> v_in((const cl_mem)in->data(), num);
+  viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);

-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  v_out = viennacl::linalg::element_prod(v_in, x_in);
 }
 
 
 template<>
 void EltwiseMult<float, lang::Opencl>(const size_t num, const Block* in1, const Block* in2, Block* out, Context* ctx) {
-  cl_int status = CL_SUCCESS;
+  // out[i] = in1[i] * in2[i] (Hadamard product).
+  viennacl::vector<float> v_in1((const cl_mem)in1->data(), num);
+  viennacl::vector<float> v_in2((const cl_mem)in2->data(), num);
+  viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);

-  std::string kname = "clkernel_eltmult";
-  auto kernel = ctx->kernels->at(kname);
-
-  cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data()));
-  cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data()));
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
-  kernel.setArg(0, (cl_int)num);
-  kernel.setArg(1, in1buf);
-  kernel.setArg(2, in2buf);
-  kernel.setArg(3, outbuf);
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  v_out = viennacl::linalg::element_prod(v_in1, v_in2);
 }
 
 
 template<>
 void Exp<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_exp";
-  auto kernel = ctx->kernels->at(kname);
-
-  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
-  kernel.setArg(0, (cl_int)num);
-  kernel.setArg(1, inbuf);
-  kernel.setArg(2, outbuf);
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  // out[i] = exp(in[i]).
+  viennacl::vector<float> v_in((const cl_mem)in->data(), num);
+  viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
+  
+  v_out = viennacl::linalg::element_exp(v_in);
 }
 
 
 template<>
 void LE<float, lang::Opencl>(const size_t num, const Block *in, const float x, Block *out, Context *ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_le";
-  auto kernel = ctx->kernels->at(kname);
-
-  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
-  kernel.setArg(0, (cl_int)num);
-  kernel.setArg(1, inbuf);
-  kernel.setArg(2, x);
-  kernel.setArg(3, outbuf);
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  // Element-wise (in[i] <= x) test via the custom comparison kernel.
+  auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+  auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_le");
+  
+  viennacl::vector<float> in_buf((const cl_mem)in->data(), num);
+  viennacl::vector<float> out_buf(static_cast<cl_mem>(out->mutable_data()), num);
+  
+  enqueue(kernel((cl_int)num, in_buf, x, out_buf));
 }
 
 
 template<>
 void Log<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_log";
-  auto kernel = ctx->kernels->at(kname);
-
-  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
-  kernel.setArg(0, (cl_int)num);
-  kernel.setArg(1, inbuf);
-  kernel.setArg(2, outbuf);
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  // out[i] = natural logarithm of in[i].
+  viennacl::vector<float> v_in((const cl_mem)in->data(), num);
+  viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
+  
+  v_out = viennacl::linalg::element_log(v_in);
 }
 
 
 template<>
 void LT<float, lang::Opencl>(const size_t num, const Block *in, const float x, Block *out, Context *ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_lt";
-  auto kernel = ctx->kernels->at(kname);
-
-  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
-  kernel.setArg(0, (cl_int)num);
-  kernel.setArg(1, inbuf);
-  kernel.setArg(2, x);
-  kernel.setArg(3, outbuf);
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  // Element-wise (in[i] < x) test via the custom comparison kernel.
+  auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+  auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_lt");
+  
+  viennacl::vector<float> in_buf((const cl_mem)in->data(), num);
+  viennacl::vector<float> out_buf(static_cast<cl_mem>(out->mutable_data()), num);
+  
+  enqueue(kernel((cl_int)num, in_buf, x, out_buf));
 }
 
 
 template<>
 void GE<float, lang::Opencl>(const size_t num, const Block *in, const float x, Block *out, Context *ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_ge";
-  auto kernel = ctx->kernels->at(kname);
-
-  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
-  kernel.setArg(0, (cl_int)num);
-  kernel.setArg(1, inbuf);
-  kernel.setArg(2, x);
-  kernel.setArg(3, outbuf);
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  // Element-wise (in[i] >= x) test via the custom comparison kernel.
+  auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+  auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_ge");
+  
+  viennacl::vector<float> in_buf((const cl_mem)in->data(), num);
+  viennacl::vector<float> out_buf(static_cast<cl_mem>(out->mutable_data()), num);
+  
+  enqueue(kernel((cl_int)num, in_buf, x, out_buf));
 }
 
 
 template<>
 void GT<float, lang::Opencl>(const size_t num, const Block *in, const float x, Block *out, Context *ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_gt";
-  auto kernel = ctx->kernels->at(kname);
-
-  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
-  kernel.setArg(0, (cl_int)num);
-  kernel.setArg(1, inbuf);
-  kernel.setArg(2, x);
-  kernel.setArg(3, outbuf);
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  // Element-wise (in[i] > x) test via the custom comparison kernel.
+  auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+  auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_gt");
+  
+  viennacl::vector<float> in_buf((const cl_mem)in->data(), num);
+  viennacl::vector<float> out_buf(static_cast<cl_mem>(out->mutable_data()), num);
+  
+  enqueue(kernel((cl_int)num, in_buf, x, out_buf));
 }
 
 
 template<>
 void Pow<float, lang::Opencl>(const size_t num, const Block* in, float x, Block* out, Context* ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_pow_scalar";
-  auto kernel = ctx->kernels->at(kname);
-
-  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
-  kernel.setArg(0, (cl_int)num);
-  kernel.setArg(1, x);
-  kernel.setArg(2, inbuf);
-  kernel.setArg(3, outbuf);
+  // out[i] = pow(in[i], x); the exponent is broadcast as a constant vector.
+  auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+  
+  viennacl::vector<float> x_in = viennacl::scalar_vector<float>(num, x, ocl_ctx);
+  viennacl::vector<float> v_in((const cl_mem)in->data(), num);
+  viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);

-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  v_out = viennacl::linalg::element_pow(v_in, x_in);
 }
 
 
 template<>
 void Pow<float, lang::Opencl>(const size_t num, const Block* in1, const Block* in2, Block* out, Context* ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_pow";
-  auto kernel = ctx->kernels->at(kname);
-
-  cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data()));
-  cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data()));
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
-  kernel.setArg(0, (cl_int)num);
-  kernel.setArg(1, in1buf);
-  kernel.setArg(2, in2buf);
-  kernel.setArg(3, outbuf);
+  // out[i] = pow(in1[i], in2[i]).
+  viennacl::vector<float> v_in1((const cl_mem)in1->data(), num);
+  viennacl::vector<float> v_in2((const cl_mem)in2->data(), num);
+  viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);

-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  v_out = viennacl::linalg::element_pow(v_in1, v_in2);
 }
 
 
 template<>
 void ReLU<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_relu";
-  auto kernel = ctx->kernels->at(kname);
-
-  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
-  kernel.setArg(0, (cl_int)num);
-  kernel.setArg(1, inbuf);
-  kernel.setArg(2, outbuf);
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  // out[i] = max(in[i], 0), via the custom relu kernel.
+  auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+  auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_relu");
+  
+  viennacl::vector<float> in_buf((const cl_mem)in->data(), num);
+  viennacl::vector<float> out_buf(static_cast<cl_mem>(out->mutable_data()), num);
+  
+  enqueue(kernel((cl_int)num, in_buf, out_buf));
 }
 
+
 template<>
 void Set<float, lang::Opencl>(const size_t num, const float x, Block* out, Context* ctx) {
-  cl_int status = CL_SUCCESS;
+  // Fill all num elements of out with the constant x.
+  auto ocl_ctx = get_context(ctx->vcl_ctx_id);

-  std::string kname = "clkernel_set";
-  auto kernel = ctx->kernels->at(kname);
+  viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
   
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
-  kernel.setArg(0, (cl_int)num);
-  kernel.setArg(1, x);
-  kernel.setArg(2, outbuf);
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  v_out = viennacl::scalar_vector<float>(num, x, ocl_ctx);
 }
 
 
 template<>
 void Sigmoid<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_sigmoid";
-  auto kernel = ctx->kernels->at(kname);
-
-  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
-  kernel.setArg(0, (cl_int)num);
-  kernel.setArg(1, inbuf);
-  kernel.setArg(2, outbuf);
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  // out[i] = 1 / (1 + exp(-in[i])), composed from ViennaCL element-wise
+  // operations on constant helper vectors.
+  auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+  
+  const viennacl::vector<float> zero = viennacl::zero_vector<float>(num, ocl_ctx);
+  const viennacl::vector<float> one = viennacl::scalar_vector<float>(num, 1.0f, ocl_ctx);
+  
+  viennacl::vector<float> v_in((const cl_mem)in->data(), num);
+  viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
+  
+  v_out = viennacl::linalg::element_div(one, viennacl::linalg::element_exp(zero - v_in) + one);
 }
 
 
 template<>
 void Sign<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_sign";
-  auto kernel = ctx->kernels->at(kname);
-
-  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
-  kernel.setArg(0, (cl_int)num);
-  kernel.setArg(1, inbuf);
-  kernel.setArg(2, outbuf);
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  // out[i] = sign(in[i]), via the custom sign kernel.
+  auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+  // Fix: request "clkernel_sign", not "clkernel_abs" — the abs kernel
+  // would make Sign() compute |x| instead of sign(x).  The replaced
+  // backend (removed lines above) correctly used "clkernel_sign".
+  auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_sign");
+  
+  viennacl::vector<float> in_buf((const cl_mem)in->data(), num);
+  viennacl::vector<float> out_buf(static_cast<cl_mem>(out->mutable_data()), num);
+  
+  // Cast to cl_int so the argument size matches the kernel's declared
+  // parameter type, consistent with every other launch in this file
+  // (passing a raw 8-byte size_t to a 32-bit kernel arg is an error).
+  enqueue(kernel((cl_int)num, in_buf, out_buf));
 }
 
 
 template<>
 void Sqrt<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_sqrt";
-  auto kernel = ctx->kernels->at(kname);
+  // out[i] = sqrt(in[i]).
+  viennacl::vector<float> v_in((const cl_mem)in->data(), num);
+  viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);

-  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
-  kernel.setArg(0, (cl_int)num);
-  kernel.setArg(1, inbuf);
-  kernel.setArg(2, outbuf);
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  v_out = viennacl::linalg::element_sqrt(v_in);
 }
 
 
@@ -478,168 +302,85 @@ void Square<float, lang::Opencl>(const size_t num, const Block* in, Block* out,
 
 template<>
 void Sub<float, lang::Opencl>(const size_t num, const Block* in, const float x, Block* out, Context* ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_subtract_scalar";
-  auto kernel = ctx->kernels->at(kname);
-
-  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
-  kernel.setArg(0, (cl_int)num);
-  kernel.setArg(1, inbuf);
-  kernel.setArg(2, x);
-  kernel.setArg(3, outbuf);
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  // out[i] = in[i] - x, delegated to the scalar Add with a negated x.
+  Add<float, lang::Opencl>(num, in, -x, out, ctx);
 }
 
 
 template<>
 void Sub<float, lang::Opencl>(const size_t num, const Block* in1, const Block* in2, Block* out, Context* ctx) {
-  cl_int status = CL_SUCCESS;
+  // out[i] = in1[i] - in2[i].
+  viennacl::vector<float> v_in1((const cl_mem)in1->data(), num);
+  viennacl::vector<float> v_in2((const cl_mem)in2->data(), num);
+  viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);

-  std::string kname = "clkernel_subtract";
-  auto kernel = ctx->kernels->at(kname);
-
-  cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data()));
-  cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data()));
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
-  kernel.setArg(0, (cl_int)num);
-  kernel.setArg(1, in1buf);
-  kernel.setArg(2, in2buf);
-  kernel.setArg(3, outbuf);
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  v_out = v_in1 - v_in2;
 }
 
 
 template<>
 void Sum<float, lang::Opencl>(const size_t num, const Block* in, float* out, Context* ctx) {
-  cl_int status = CL_SUCCESS;
+  // Device-side reduction of all num elements to a single host float.
+  viennacl::vector<float> v_in((const cl_mem)in->data(), num);

-  std::string kname = "clkernel_reduce";
-  auto kernel = ctx->kernels->at(kname);
-
-  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
-  
-  size_t size = sizeof(float) * num;
-  cl::Buffer outval(ctx->ocl_ctx, CL_MEM_WRITE_ONLY, size, nullptr, &status);
-  OCL_CHECK(status, "Failed to create buffer!");
-
-  kernel.setArg(0, (cl_int)num);
-  kernel.setArg(1, inbuf);
-  kernel.setArg(2, outval);
-  kernel.setArg(3, cl::Local(size));
-  
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
-
-  float* temp = new float[num];
-  status = ctx->ocl_cmdq.enqueueReadBuffer(outval, CL_TRUE, 0, size, temp);
-  OCL_CHECK(status, "Failed to read from buffer!");
-  out[0] = temp[0];
-  delete temp;
+  out[0] = viennacl::linalg::sum(v_in);
 }
 
 
 template<>
 void Tanh<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_tanh";
-  auto kernel = ctx->kernels->at(kname);
-
-  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+  // out[i] = tanh(in[i]).
+  viennacl::vector<float> v_in((const cl_mem)in->data(), num);
+  viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);

-  kernel.setArg(0, (cl_int)num);
-  kernel.setArg(1, inbuf);
-  kernel.setArg(2, outbuf);
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  v_out = viennacl::linalg::element_tanh(v_in);
 }
 
 // **************************************
 // Random functions
 // **************************************
 
-/// Seed value required for generating distributions.
-static unsigned int seed[4] = {0, 32, 42, 888};
 /// Number of generation rounds used in the current algorithm.
 static cl_uint rounds = 8;
 
 template<>
 void Bernoulli<float, lang::Opencl>(const size_t num, const float p, Block* out, Context *ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "PRNG_threefry4x32_bernoulli";
-  auto kernel = ctx->kernels->at(kname);
+  // Fill out with Bernoulli(p) samples drawn on-device by the Threefry
+  // counter-based PRNG (4 floats generated per work-item).
+  auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+  auto kernel = ocl_ctx.get_kernel("distribution.cl", "PRNG_threefry4x32_bernoulli");
   
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
-  kernel.setArg(0, outbuf);
-  kernel.setArg(1, seed);
-  kernel.setArg(2, 0.0f); // inf
-  kernel.setArg(3, 1.0f); // sup
-  kernel.setArg(4, p); // threshold
-  kernel.setArg(5, rounds);
-  kernel.setArg(6, cl_uint(num) / 4);
+  viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
   
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num/4));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  // NOTE(review): the seed is a hard-coded constant, so every call yields
+  // the identical pseudo-random stream — consider seeding from ctx.
+  // NOTE(review): num appears to be assumed a multiple of 4; a remainder
+  // would leave the tail of out unwritten — TODO confirm with callers.
+  viennacl::ocl::packed_cl_uint seed = {0, 32, 42, 888};
+  
+  enqueue(kernel(v_out, seed, 0.0f, 1.0f, p, rounds, cl_uint(num / 4)));
 }
 
 
 template<>
 void Gaussian<float, lang::Opencl>(const size_t num, const float mean, const float std, Block* out, Context *ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "PRNG_threefry4x32_gaussian";
-  auto kernel = ctx->kernels->at(kname);
-
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
-  kernel.setArg(0, outbuf);
-  kernel.setArg(1, seed);
-  kernel.setArg(2, mean); // E
-  kernel.setArg(3, std);  // V
-  kernel.setArg(4, rounds);
-  kernel.setArg(5, cl_uint(num) / 4);
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num/4));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  // Fill out with N(mean, std) samples from the on-device Threefry PRNG.
+  auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+  auto kernel = ocl_ctx.get_kernel("distribution.cl", "PRNG_threefry4x32_gaussian");
+  
+  viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
+  
+  // NOTE(review): constant seed — see the note in Bernoulli above.
+  viennacl::ocl::packed_cl_uint seed = {0, 32, 42, 888};
+  
+  enqueue(kernel(v_out, seed, mean, std, rounds, cl_uint(num/4)));
 }
 
 
 template<>
 void Uniform<float, lang::Opencl>(const size_t num, const float low, const float high, Block* out, Context *ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "PRNG_threefry4x32_uniform";
-  auto kernel = ctx->kernels->at(kname);
-
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+  // Fill out with U[low, high) samples from the on-device Threefry PRNG.
+  auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+  auto kernel = ocl_ctx.get_kernel("distribution.cl", "PRNG_threefry4x32_uniform");
   
-  status = kernel.setArg(0, outbuf); OCL_CHECK(status, "kernel arg 0");
-  status = kernel.setArg(1, seed); OCL_CHECK(status, "kernel arg 1");
-  status = kernel.setArg(2, low); OCL_CHECK(status, "kernel arg 2");
-  status = kernel.setArg(3, high); OCL_CHECK(status, "kernel arg 3");
-  status = kernel.setArg(4, rounds); OCL_CHECK(status, "kernel arg 4");
-  status = kernel.setArg(5, cl_uint(num) / 4); OCL_CHECK(status, "kernel arg 5");
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num/4));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  // NOTE(review): constant seed — see the note in Bernoulli above.
+  viennacl::ocl::packed_cl_uint seed = {0, 32, 42, 888};
+  
+  viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
+  
+  enqueue(kernel(v_out, seed, low, high, rounds, cl_uint(num/4)));
 }
 
 // *********************************************************
 // BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
 // *********************************************************
-
+/*
 template<>
 void Amax<float, lang::Opencl>(const size_t num, const Block* in, size_t* out, Context* ctx) {
   cl_int status = CL_SUCCESS;
@@ -699,7 +440,7 @@ void Amin<float, lang::Opencl>(const size_t num, const Block* in, size_t* out, C
   delete temp;
 }
 
-
+	
 template<>
 void Asum<float, lang::Opencl>(const size_t num, const Block* in, float* out, Context* ctx) {
   cl_int status = CL_SUCCESS;
@@ -727,256 +468,141 @@ void Asum<float, lang::Opencl>(const size_t num, const Block* in, float* out, Co
   out[0] = temp[0];
   delete temp;
 }
-
-
+*/
+/// out = alpha * in + out
 template<>
 void Axpy<float, lang::Opencl>(const size_t num, const float alpha, const Block* in, Block* out, Context* ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_axpy";
-  auto kernel = ctx->kernels->at(kname);
-
-  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
-  kernel.setArg(0, (cl_int)num);
-  kernel.setArg(1, alpha);
-  kernel.setArg(2, inbuf);
-  kernel.setArg(3, outbuf);
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  viennacl::vector<float> inbuf((const cl_mem)in->data(), num);
+  viennacl::vector<float> outbuf(static_cast<cl_mem>(out->mutable_data()), num);
+  
+  outbuf += alpha * inbuf;
 }
 
-
+/// out = ||in||_2^2, i.e., L2 norm.
 template<>
 void Nrm2<float, lang::Opencl>(const size_t num, const Block* in, float* out, Context* ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_nrm2";
-  auto kernel = ctx->kernels->at(kname);
-
-  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
-
-  size_t size = sizeof(float) * num;
-  cl::Buffer outval(ctx->ocl_ctx, CL_MEM_WRITE_ONLY, size, nullptr, &status);
-  OCL_CHECK(status, "Failed to create buffer!");
-
-  kernel.setArg(0, (cl_int)num);
-  kernel.setArg(1, inbuf);
-  kernel.setArg(2, outval);
-  kernel.setArg(3, cl::Local(sizeof(float) * (std::pow(2, num))));
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
-
-  float* temp = new float[num];
-  status = ctx->ocl_cmdq.enqueueReadBuffer(outval, CL_TRUE, 0, size, temp);
-  OCL_CHECK(status, "Failed to read from buffer!");
-  out[0] = temp[0];
-  delete temp;
+  viennacl::vector<float> inbuf((const cl_mem)in->data(), num);
+  
+  out[0] = viennacl::linalg::norm_2(inbuf);
 }
 
 
 template<>
 void Scale<float, lang::Opencl>(const size_t num, const float x, Block* out, Context* ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_scale";
-  auto kernel = ctx->kernels->at(kname);
-
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
-  kernel.setArg(0, (cl_int)num);
-  kernel.setArg(1, x);
-  kernel.setArg(2, outbuf);
+  auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+  
+  viennacl::vector<float> x_in = viennacl::scalar_vector<float>(num, x, ocl_ctx);
+  viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
 
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  v_out = viennacl::linalg::element_prod(v_out, x_in);
 }
 
 
 template<>
 void Dot<float, lang::Opencl>(const size_t num, const Block *in1, const Block *in2, float *out, Context *ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_dot";
-  auto kernel = ctx->kernels->at(kname);
-
-  cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data()));
-  cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data()));
-
-  size_t size = sizeof(float) * num;
-  cl::Buffer outval(ctx->ocl_ctx, CL_MEM_WRITE_ONLY, size, nullptr, &status);
-  OCL_CHECK(status, "Failed to create buffer!");
-
-  kernel.setArg(0, (cl_int)num);
-  kernel.setArg(1, in1buf);
-  kernel.setArg(2, in2buf);
-  kernel.setArg(3, outval);
-  kernel.setArg(4, cl::Local(size));
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
-
-  float* temp = new float[num];
-  status = ctx->ocl_cmdq.enqueueReadBuffer(outval, CL_TRUE, 0, size, temp);
-  OCL_CHECK(status, "Failed to read from buffer!");
-  out[0] = temp[0];
-  delete temp;
+  viennacl::vector<float> in1_buf((const cl_mem)in1->data(), num);
+  viennacl::vector<float> in2_buf((const cl_mem)in2->data(), num);
+  
+  out[0] = viennacl::linalg::inner_prod(in1_buf, in2_buf);
 }
 
-
+/// out = alpha * A * v + beta * out.
 template<>
 void GEMV<float, lang::Opencl>(bool trans, const size_t m, const size_t n, const float alpha,
 		  const Block *A, const Block *v, const float beta, Block* out, Context* ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_gemv";
-  auto kernel = ctx->kernels->at(kname);
-
-  cl::Buffer Abuf = *(static_cast<cl::Buffer*>(A->mutable_data()));
-  cl::Buffer vbuf = *(static_cast<cl::Buffer*>(v->mutable_data()));
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
-  kernel.setArg(0, (cl_int)m);
-  kernel.setArg(1, (cl_int)n);
-  kernel.setArg(2, alpha);
-  kernel.setArg(3, Abuf);
-  kernel.setArg(4, vbuf);
-  kernel.setArg(5, beta);
-  kernel.setArg(6, outbuf);
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(m, n));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+		  
+  viennacl::matrix<float> A_in((const cl_mem)A->data(), m, n);
+  viennacl::vector<float> v_in((const cl_mem)v->data(), trans ? m : n);
+  viennacl::vector<float> o_in(static_cast<cl_mem>(out->mutable_data()), trans ? n : m);
+  
+  if (trans) viennacl::trans(A_in);
+  
+  o_in *= beta;
+  o_in += alpha * viennacl::linalg::prod(A_in, v_in);
 }
 
 
+/// multiply a matrix with a diagonal matrix constructed using values from 'v'.
+/// if matrix_left_side is true, do M*v; else do v*M
 template<>
 void DGMM<float, lang::Opencl>(bool side_right,
 		  const size_t nrow, const size_t ncol,
 		  const Block *M, const Block *v, Block *out, Context *ctx) {
-  cl_int status = CL_SUCCESS;
-
-  cl::Buffer Mbuf = *(static_cast<cl::Buffer*>(M->mutable_data()));
-  cl::Buffer vbuf = *(static_cast<cl::Buffer*>(v->mutable_data()));
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
 
-  std::string kname;
+  viennacl::matrix<float> M_buf((const cl_mem)M->data(), nrow, ncol);
+  viennacl::vector<float> v_buf((const cl_mem)v->data(), nrow);
+  viennacl::matrix<float> out_buf(static_cast<cl_mem>(out->mutable_data()), nrow, ncol);
+  
+  auto diag = viennacl::diag(v_buf);
+  
   if (side_right) {
-	DiagVec_Right(ncol, vbuf, vbuf, ctx);
-	kname = "clkernel_dgmm_right";
+    out_buf = viennacl::linalg::prod(diag, M_buf);
   } else {
-	DiagVec_Left(nrow, vbuf, vbuf, ctx);
-	kname = "clkernel_dgmm_left";
+    out_buf = viennacl::linalg::prod(M_buf, diag);
   }
-
-  auto kernel = ctx->kernels->at(kname);
-
-  kernel.setArg(0, (cl_int)nrow);
-  kernel.setArg(1, (cl_int)ncol);
-  kernel.setArg(2, Mbuf);
-  kernel.setArg(3, vbuf);
-  kernel.setArg(4, outbuf);
-  kernel.setArg(5, cl::Local(sizeof(float) * nrow * ncol));
-
-  cl::NDRange global(nrow); // Only nrow because current implementation is 1 dimensional
-//  cl::NDRange local();
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), global);
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
 }
 
-
+/// C = alpha * A * B + beta * C.
 template<>
 void GEMM<float, lang::Opencl>(const bool transA, const bool transB,
 		  const size_t nrowA, const size_t ncolB, const size_t ncolA,
 		  const float alpha, const Block *A, const Block *B, const float beta,
 		  Block *C, Context *ctx) {
-  cl_int status = CL_SUCCESS;
 
-  std::string kname = "clkernel_gemm";
-  auto kernel = ctx->kernels->at(kname);
-  
-  cl::Buffer Abuf = *(static_cast<cl::Buffer*>(A->mutable_data()));
-  cl::Buffer Bbuf = *(static_cast<cl::Buffer*>(B->mutable_data()));
-  cl::Buffer Cbuf = *(static_cast<cl::Buffer*>(C->mutable_data()));
-
-  // If matrix A needs to be transposed, do it.
-  if (transA)
-	Transpose(nrowA, ncolA, Abuf, Abuf, ctx);
-
-  // If vector B needs to be transposed, do it.
-  if (transB)
-	Transpose(nrowA, ncolB, Bbuf, Bbuf, ctx);
-
-  kernel.setArg(0, (cl_int)nrowA);
-  kernel.setArg(1, (cl_int)ncolB);
-  kernel.setArg(2, (cl_int)ncolA);
-  kernel.setArg(3, alpha);
-  kernel.setArg(4, Abuf);
-  kernel.setArg(5, Bbuf);
-  kernel.setArg(6, beta);
-  kernel.setArg(7, Cbuf);
-  kernel.setArg(8, cl::Local(sizeof(float) * nrowA * ncolB));
-  kernel.setArg(9, cl::Local(sizeof(float) * nrowA * ncolB));
-  
-// TODO: Try to make the work group size a power of 2 given an arbitrary matrix.
-  cl::NDRange global(nrowA, ncolB);
-  cl::NDRange local(nrowA, ncolB);
+  viennacl::matrix<float> A_buf((const cl_mem)A->data(), nrowA, ncolA);
+  viennacl::matrix<float> B_buf((const cl_mem)B->data(), ncolA, ncolB);
+  viennacl::matrix<float> C_buf(static_cast<cl_mem>(C->mutable_data()), nrowA, ncolB);
+
+  if (transA) viennacl::trans(A_buf);
+  if (transB) viennacl::trans(B_buf);
   
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), global, local);
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  C_buf *= beta;
+  C_buf += alpha * viennacl::linalg::prod(A_buf, B_buf);
 }
 
+
 template <>
 void ComputeCrossEntropy<float, lang::Opencl>(const size_t batchsize, const size_t dim,
                          const Block *p, const Block *t, Block *loss,
                          Context *ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_crossentropy";
-  auto kernel = ctx->kernels->at(kname);
-
-  cl::Buffer pbuf = *(static_cast<cl::Buffer*>(p->mutable_data()));
-  cl::Buffer tbuf = *(static_cast<cl::Buffer*>(t->mutable_data()));
-  cl::Buffer lossbuf = *(static_cast<cl::Buffer*>(loss->mutable_data()));
-
-  kernel.setArg(0, (cl_uint)batchsize);
-  kernel.setArg(1, (cl_uint)dim);
-  kernel.setArg(2, pbuf);
-  kernel.setArg(3, tbuf);
-  kernel.setArg(4, lossbuf);
-
-  cl::NDRange global(batchsize);
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), global);
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+  auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_crossentropy");
+  
+  viennacl::vector<float> p_buf((const cl_mem)p->data(), batchsize);
+  viennacl::vector<float> t_buf((const cl_mem)t->data(), batchsize);
+  viennacl::vector<float> loss_buf(static_cast<cl_mem>(loss->mutable_data()), batchsize);
+  
+  enqueue(kernel((cl_uint)batchsize, (cl_uint)dim, p_buf, t_buf, loss_buf));
 }
 
+
 template <>
 void SoftmaxCrossEntropyBwd<float, lang::Opencl>(const size_t batchsize, const size_t dim,
                             const Block *p, const Block *t, Block *grad,
                             Context *ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_softmaxentropy";
-  auto kernel = ctx->kernels->at(kname);
-
-  cl::Buffer pbuf = *(static_cast<cl::Buffer*>(p->mutable_data()));
-  cl::Buffer tbuf = *(static_cast<cl::Buffer*>(t->mutable_data()));
-  cl::Buffer gradbuf = *(static_cast<cl::Buffer*>(grad->mutable_data()));
-
-  kernel.setArg(0, (cl_uint)batchsize);
-  kernel.setArg(1, (cl_uint)dim);
-  kernel.setArg(2, pbuf);
-  kernel.setArg(3, tbuf);
-  kernel.setArg(4, gradbuf);
+  auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+  auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_softmaxentropy");
+  
+  viennacl::vector<float> p_buf((const cl_mem)p->data(), batchsize);
+  viennacl::vector<float> t_buf((const cl_mem)t->data(), batchsize);
+  viennacl::vector<float> grad_buf(static_cast<cl_mem>(grad->mutable_data()), batchsize);
+  
+  enqueue(kernel((cl_uint)batchsize, (cl_uint)dim, p_buf, t_buf, grad_buf));
+}
 
-  cl::NDRange global(batchsize);
 
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), global);
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+template<>
+void RowMax<float, lang::Opencl>(const size_t nrow, const size_t ncol,
+                                 const Block *in, Block *out, Context *ctx) {
+  auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+  auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_rowmax");
+  
+//  kernel.global_work_size(0, nrow);
+  
+  viennacl::matrix<float> in_buf((const cl_mem)in->data(), nrow, ncol);
+  viennacl::vector<float> outbuf(static_cast<cl_mem>(out->mutable_data()), nrow);
+  
+  enqueue(kernel((cl_uint)nrow, (cl_uint)ncol, in_buf, outbuf));
 }
 
 // **************************************
@@ -985,129 +611,46 @@ void SoftmaxCrossEntropyBwd<float, lang::Opencl>(const size_t batchsize, const s
 /*
 template<>
 void AddCol<float, lang::Opencl>(const size_t nrow, const size_t ncol, const Block* A, const Block* v, Block* out, Context* ctx) {
-  std::string kname = "clkernel_addcol";
-  auto kernel = ctx->kernels->at(kname);
-  kernel.setArg(0, (cl_int)nrow);
-  kernel.setArg(1, (cl_int)ncol);
-  kernel.setArg(2, static_cast<const float*>(A->mutable_data()));
-  kernel.setArg(3, static_cast<const float*>(v->mutable_data()));
-  kernel.setArg(3, static_cast<float*>(out->mutable_data()));
 
-  ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(nrow, ncol));
 }
 
+
 template<>
 void AddRow<float, lang::Opencl>(const size_t nrow, const size_t ncol, const Block* A, const Block* v, Block* out, Context* ctx) {
-  std::string kname = "clkernel_addrow";
-  auto kernel = ctx->kernels->at(kname);
-  kernel.setArg(0, (cl_int)nrow);
-  kernel.setArg(1, (cl_int)ncol);
-  kernel.setArg(2, static_cast<const float*>(A->mutable_data()));
-  kernel.setArg(3, static_cast<const float*>(v->mutable_data()));
-  kernel.setArg(3, static_cast<float*>(out->mutable_data()));
 
-  ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(nrow, ncol));
 }
 
+
 template<>
 void Outer<float, lang::Opencl>(const size_t m, const size_t n, const Block* lhs, const Block* rhs, Block* out, Context* ctx) {
-  std::string kname = "clkernel_outerproduct";
-  auto kernel = ctx->kernels->at(kname);
-  kernel.setArg(0, (cl_int)m);
-  kernel.setArg(1, (cl_int)n);
-  kernel.setArg(2, static_cast<const float*>(lhs->data()));
-  kernel.setArg(3, static_cast<const float*>(rhs->data()));
-  kernel.setArg(4, static_cast<float*>(out->mutable_data()));
-
-  ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(m, n));
+  viennacl::vector<float> lhs_in((const cl_mem)lhs->data(), m);
+  viennacl::vector<float> rhs_in((const cl_mem)rhs->data(), n);
+  viennacl::matrix<float> out_buf(static_cast<cl_mem>(out->mutable_data()), m, n);
+  
+  out_buf = viennacl::linalg::outer_prod(lhs_in, rhs_in);
 }
 
-template<>
-void SumColumns<float, lang::Opencl>(const size_t nrow, const size_t ncol, const Block* in, Block* out, Context* ctx) {
-  std::string kname = "clkernel_sumcol";
-  auto kernel = ctx->kernels->at(kname);
-  kernel.setArg(0, (cl_int)nrow);
-  kernel.setArg(1, (cl_int)ncol);
-  kernel.setArg(2, static_cast<const float*>(in->mutable_data()));
-  kernel.setArg(3, static_cast<float*>(out->mutable_data()));
 
-  ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(nrow, ncol));
-}*/
-/*
 template<>
-void SumRows<float, lang::Opencl>(const size_t nrow, const size_t ncol, const Block* in, Block* out, Context* ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_sumrow";
-  auto kernel = ctx->kernels->at(kname);
-
-  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
-  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
-  kernel.setArg(0, (cl_int)nrow);
-  kernel.setArg(1, (cl_int)ncol);
-  kernel.setArg(2, inbuf);
-  kernel.setArg(3, outbuf);
-  kernel.setArg(4, cl::Local(sizeof(float) * nrow * ncol));
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(nrow, ncol));
-}
-*/
-
-
-#define BLOCK_DIM 16
-
-void Transpose(const size_t nrow, const size_t ncol, cl::Buffer& in, cl::Buffer& out, Context* ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_transpose";
-  auto kernel = ctx->kernels->at(kname);
-
-  kernel.setArg(0, (cl_uint)nrow);
-  kernel.setArg(1, (cl_uint)ncol);
-  kernel.setArg(2, in);
-  kernel.setArg(3, out);
-  kernel.setArg(4, cl::Local((BLOCK_DIM + 1) * BLOCK_DIM));
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(nrow, ncol));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
-}
-
-#undef BLOCK_DIM
-
-
-/// This is a utility function that transforms a single-row vector into a diagonal matrix.
-/// For example, a vector of size n will become a matrix of size n*n where only the positions nx == ny will have values.
-void DiagVec_Left(const size_t size, cl::Buffer& in, cl::Buffer& out, Context* ctx) {
-  cl_int status = CL_SUCCESS;
-
-  std::string kname = "clkernel_diagvec_left";
-  auto kernel = ctx->kernels->at(kname);
+void SumColumns<float, lang::Opencl>(const size_t nrow, const size_t ncol, const Block* in, Block* out, Context* ctx) {
+  viennacl::matrix<float> m_in((const cl_mem)in->data(), nrow, ncol);
+  viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), nrow);
   
-  kernel.setArg(0, (cl_uint)size);
-  kernel.setArg(1, in);
-  kernel.setArg(2, out);
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(size));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+  v_out = viennacl::linalg::column_sum(m_in);
 }
 
-void DiagVec_Right(const size_t size, cl::Buffer& in, cl::Buffer& out, Context* ctx) {
-  cl_int status = CL_SUCCESS;
 
-  std::string kname = "clkernel_diagvec_right";
-  auto kernel = ctx->kernels->at(kname);
-
-  kernel.setArg(0, (cl_uint)size);
-  kernel.setArg(1, in);
-  kernel.setArg(2, out);
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(size));
-  OCL_CHECK(status, "Failed to enqueue kernel function!");
+template<>
+void SumRows<float, lang::Opencl>(const size_t nrow, const size_t ncol, const Block* in, Block* out, Context* ctx) {
+  viennacl::matrix<float> m_in((const cl_mem)in->data(), nrow, ncol);
+  viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), ncol);
+  
+  v_out = viennacl::linalg::column_sum(m_in);
 }
+*/
 
 } // namespace singa
 
 #endif // USE_OPENCL
 
-#endif  // SINGA_CORE_TENSOR_TENSOR_MATH_OPENCL_H_
+#endif  // SINGA_CORE_TENSOR_TENSOR_MATH_OPENCL_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/src/utils/opencl_utils.cc
----------------------------------------------------------------------
diff --git a/src/utils/opencl_utils.cc b/src/utils/opencl_utils.cc
deleted file mode 100644
index e4fe69b..0000000
--- a/src/utils/opencl_utils.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/utils/opencl_utils.h"
-
-#ifdef USE_OPENCL
-
-void PrintDeviceInfo(const cl::Device &dev) {
-  cl_int status = CL_SUCCESS;
-
-  LOG(INFO) << "\tDevice type: " << dev.getInfo<CL_DEVICE_TYPE>(&status);
-  LOG(INFO) << "\tUnified memory: " << dev.getInfo<CL_DEVICE_HOST_UNIFIED_MEMORY>(&status);
-  LOG(INFO) << "\tClock speed (MHz): " << dev.getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>(&status);
-  LOG(INFO) << "\tECC memory: " << dev.getInfo<CL_DEVICE_ERROR_CORRECTION_SUPPORT>(&status);
-  LOG(INFO) << "\tLittle endian: " << dev.getInfo<CL_DEVICE_ENDIAN_LITTLE>(&status);
-  LOG(INFO) << "\tCompute units: " << dev.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(&status);
-  LOG(INFO) << "\tMax work grp size: " << dev.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>(&status);
-//LOG(INFO) << "\tMax work item size: " << dev.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>(&status);
-  LOG(INFO) << "\tMax item dimension: " << dev.getInfo<CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS>(&status);
-  LOG(INFO) << "\tQueue properties: " << dev.getInfo<CL_DEVICE_QUEUE_PROPERTIES>(&status);
-  LOG(INFO) << "\tExecution capabilities: " << dev.getInfo<CL_DEVICE_EXECUTION_CAPABILITIES>(&status);
-  LOG(INFO) << "\tMax mem alloc size: " << dev.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>(&status);
-  LOG(INFO) << "\tGlobal mem size: " << dev.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>(&status);
-  LOG(INFO) << "\tLocal mem size: " << dev.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>(&status);
-  LOG(INFO) << "\n";
-
-  OCL_CHECK(status, "Failed to retrieve device information!");
-}
-
-
-void PrintPlatformInfo(const cl::Platform &p) {
-  cl_int status = CL_SUCCESS;
-
-  LOG(INFO) << "\tName: 	 " << p.getInfo<CL_PLATFORM_NAME>(&status);
-  LOG(INFO) << "\tProfile: " << p.getInfo<CL_PLATFORM_PROFILE>(&status);
-  LOG(INFO) << "\tVersion: " << p.getInfo<CL_PLATFORM_VERSION>(&status);
-  LOG(INFO) << "\tVendor:  " << p.getInfo<CL_PLATFORM_VENDOR>(&status);
-  LOG(INFO) << "\tExtensions: " << p.getInfo<CL_PLATFORM_EXTENSIONS>(&status);
-  LOG(INFO) << "\n";
-
-  OCL_CHECK(status, "Failed to retrieve platform information!");
-}
-
-
-#endif // USE_OPENCL