You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@singa.apache.org by wa...@apache.org on 2016/08/31 04:37:02 UTC
[2/4] incubator-singa git commit: SINGA-243 ViennaCL backend for
OpenCL support
SINGA-243 ViennaCL backend for OpenCL support
- Replaced current OpenCL backend with ViennaCL libs
- Removed need for OpenCL C++ headers
- Updated unit tests files to match changes
- Updated Dependencies.cmake to match changes
- Added license information.
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/595302a3
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/595302a3
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/595302a3
Branch: refs/heads/master
Commit: 595302a36e9fee2ef63b57ddf81fe75a5fe00a40
Parents: 65bf582
Author: Tan Li Boon <un...@users.noreply.github.com>
Authored: Wed Aug 17 12:48:18 2016 +0800
Committer: Tan Li Boon <ta...@u.nus.edu>
Committed: Wed Aug 31 10:53:21 2016 +0800
----------------------------------------------------------------------
.travis.yml | 6 +-
CMakeLists.txt | 3 +-
LICENSE | 4 +-
cmake/Dependencies.cmake | 25 +-
cmake/Thirdparty/FindViennaCL.cmake | 45 ++
include/singa/core/common.h | 10 +-
include/singa/core/device.h | 56 +-
include/singa/utils/opencl_utils.h | 142 +---
src/core/device/opencl_device.cc | 182 ++---
src/core/tensor/tensor_math_opencl.cl | 19 +-
src/core/tensor/tensor_math_opencl.h | 1007 ++++++++--------------------
src/utils/opencl_utils.cc | 63 --
test/singa/test_opencl.cc | 700 ++++++++-----------
test/singa/test_opencl_device.cc | 108 +++
14 files changed, 822 insertions(+), 1548 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/.travis.yml
----------------------------------------------------------------------
diff --git a/.travis.yml b/.travis.yml
index 8b1f89c..d9bba51 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -7,13 +7,13 @@ before_install:
- sudo apt-get -qq update
- sudo apt-get install -qq -y libopenblas-dev libgoogle-glog-dev libprotobuf-dev protobuf-compiler
- sudo apt-get install -qq -y opencl-headers ocl-icd-*
- - wget https://github.com/KhronosGroup/OpenCL-CLHPP/releases/download/v2.0.9/cl2.hpp
- - sudo mv cl2.hpp /usr/include/CL/
+#- wget https://github.com/KhronosGroup/OpenCL-CLHPP/releases/download/v2.0.9/cl2.hpp
+#- sudo mv cl2.hpp /usr/include/CL/
#- sudo apt-get install -qq libgtest-dev
before_script:
- mkdir build && cd build
- - cmake .. -DUSE_CUDA=OFF -DUSE_CUDNN=OFF -DUSE_PYTHON=OFF -DBUILD_OPENCL_TESTS=OFF
+ - cmake .. -DUSE_CUDA=OFF -DUSE_CUDNN=OFF -DUSE_PYTHON=OFF
script:
- make
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3f6bea2..611cee4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,9 +43,8 @@ OPTION(USE_CUDNN "Use Cudnn libs" ON)
OPTION(USE_OPENCV "Use opencv" OFF)
OPTION(USE_LMDB "Use LMDB libs" OFF)
OPTION(USE_PYTHON "Generate py wrappers" OFF)
-#OPTION(USE_OPENCL "Use OpenCL" OFF)
+OPTION(USE_OPENCL "Use OpenCL" OFF)
OPTION(ENABLE_DIST "enable distributed training" OFF)
-#OPTION(BUILD_OPENCL_TESTS "Build OpenCL tests" OFF)
INCLUDE("cmake/Dependencies.cmake")
INCLUDE("cmake/Utils.cmake")
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/LICENSE
----------------------------------------------------------------------
diff --git a/LICENSE b/LICENSE
index f658def..4f9d1e7 100644
--- a/LICENSE
+++ b/LICENSE
@@ -303,7 +303,9 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
===========================================================================
SINGA bundles the following under BSD 2-clause license:
-include/singa/utils/cuda_utils.h, src/core/tensor/distribution.cl
+include/singa/utils/cuda_utils.h
+src/core/tensor/distribution.cl
+cmake/Thirdparty/FindViennaCL.cmake
COPYRIGHT
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/cmake/Dependencies.cmake
----------------------------------------------------------------------
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index b5fda6d..aa2212b 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -50,23 +50,26 @@ IF(USE_CBLAS)
FIND_PACKAGE(CBLAS REQUIRED)
INCLUDE_DIRECTORIES(SYSTEM ${CBLAS_INCLUDE_DIR})
LIST(APPEND SINGA_LINKER_LIBS ${CBLAS_LIBRARIES})
- MESSAGE(STATUS "FOUND cblas at ${CBLAS_LIBRARIES}")
+ MESSAGE(STATUS "Found cblas at ${CBLAS_LIBRARIES}")
ENDIF()
IF(USE_OPENCL)
FIND_PACKAGE(OpenCL REQUIRED)
- IF(NOT OPENCL_FOUND)
+ IF(NOT OpenCL_FOUND)
MESSAGE(SEND_ERROR "OpenCL was requested, but not found.")
ELSE()
- INCLUDE_DIRECTORIES(SYSTEM ${OpenCL_INCPATH})
- LIST(APPEND SINGA_LINKER_LIBS ${OPENCL_LIBRARIES})
- MESSAGE(STATUS "Found OpenCL at ${OPENCL_INCLUDE_DIRS}")
- IF(NOT OPENCL_HAS_CPP_BINDINGS)
- MESSAGE(SEND_ERROR "OpenCL C++ bindings cl2.hpp was not found.")
- ELSE()
- MESSAGE(STATUS "Found OpenCL C++ bindings.")
- ENDIF()
- ENDIF()
+ MESSAGE(STATUS "Found OpenCL headers at ${OpenCL_INCLUDE_DIRS}")
+ INCLUDE_DIRECTORIES(SYSTEM ${OpenCL_INCLUDE_DIR})
+ LIST(APPEND SINGA_LINKER_LIBS ${OpenCL_LIBRARIES})
+ FIND_PACKAGE(ViennaCL REQUIRED)
+ IF(NOT ViennaCL_FOUND)
+ MESSAGE(SEND_ERROR "ViennaCL is required if OpenCL is enabled.")
+ ELSE()
+ MESSAGE(STATUS "Found ViennaCL headers at ${ViennaCL_INCLUDE_DIR}")
+ INCLUDE_DIRECTORIES(SYSTEM ${ViennaCL_INCLUDE_DIR})
+ LIST(APPEND SINGA_LINKER_LIBS ${ViennaCL_LIBRARIES})
+ ENDIF()
+ ENDIF()
ENDIF()
FIND_PACKAGE(Glog REQUIRED)
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/cmake/Thirdparty/FindViennaCL.cmake
----------------------------------------------------------------------
diff --git a/cmake/Thirdparty/FindViennaCL.cmake b/cmake/Thirdparty/FindViennaCL.cmake
new file mode 100644
index 0000000..263c80f
--- /dev/null
+++ b/cmake/Thirdparty/FindViennaCL.cmake
@@ -0,0 +1,45 @@
+# This file is retrieved from caffe/cmake/Modules/FindViennaCL.cmake.
+
+SET(ViennaCL_WITH_OPENCL TRUE)
+
+SET(VIENNACL_INCLUDE_SEARCH_PATHS
+ ..
+ /usr/include
+ /usr/local/include
+ /opt/ViennaCL/include
+ $ENV{VIENNACL_HOME}
+ $ENV{VIENNACL_HOME}/include
+)
+
+FIND_PATH(ViennaCL_INCLUDE_DIR NAMES viennacl/forwards.h PATHS ${VIENNACL_INCLUDE_SEARCH_PATHS})
+
+SET(ViennaCL_FOUND ON)
+
+# Check include files
+IF(NOT ViennaCL_INCLUDE_DIR)
+ SET(ViennaCL_FOUND OFF)
+ MESSAGE(STATUS "Could not find ViennaCL include. Turning ViennaCL_FOUND off")
+ENDIF()
+
+IF (ViennaCL_FOUND)
+ IF (NOT ViennaCL_FIND_QUIETLY)
+ MESSAGE(STATUS "Found ViennaCL include: ${ViennaCL_INCLUDE_DIR}")
+ ENDIF (NOT ViennaCL_FIND_QUIETLY)
+ELSE (ViennaCL_FOUND)
+ IF (ViennaCL_FIND_REQUIRED)
+ MESSAGE(FATAL_ERROR "Could not find ViennaCL")
+ ENDIF (ViennaCL_FIND_REQUIRED)
+ENDIF (ViennaCL_FOUND)
+
+IF(ViennaCL_WITH_OPENCL)
+ find_package(OpenCL REQUIRED)
+ENDIF(ViennaCL_WITH_OPENCL)
+
+set(ViennaCL_INCLUDE_DIRS ${ViennaCL_INCLUDE_DIR} ${OPENCL_INCLUDE_DIRS})
+set(ViennaCL_LIBRARIES ${OPENCL_LIBRARIES})
+
+MARK_AS_ADVANCED(
+ ViennaCL_INCLUDE_DIR
+ ViennaCL_INCLUDE_DIRS
+ ViennaCL_LIBRARIES
+)
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/include/singa/core/common.h
----------------------------------------------------------------------
diff --git a/include/singa/core/common.h b/include/singa/core/common.h
index 53a9726..dc552c1 100644
--- a/include/singa/core/common.h
+++ b/include/singa/core/common.h
@@ -36,10 +36,7 @@
#ifdef USE_OPENCL
-#define CL_HPP_MINIMUM_OPENCL_VERSION 120
-#define CL_HPP_TARGET_OPENCL_VERSION 120
-#include <CL/cl2.hpp>
-#include <unordered_map>
+#include "singa/utils/opencl_utils.h"
#endif // USE_OPENCL
using std::atomic;
@@ -110,9 +107,8 @@ typedef struct _Context {
#endif // USE_CUDA
#ifdef USE_OPENCL
- std::shared_ptr<std::unordered_map<std::string, cl::Kernel>> kernels;
- cl::CommandQueue ocl_cmdq;
- cl::Context ocl_ctx;
+ // This stores the context ID of the OpenCL context controlled by ViennaCL.
+ long vcl_ctx_id;
#endif
} Context;
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/include/singa/core/device.h
----------------------------------------------------------------------
diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index 810d41f..62fa250 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -36,12 +36,6 @@
#endif // USE_CUDA
#ifdef USE_OPENCL
-// http://github.khronos.org/OpenCL-CLHPP/
-// cl2.hpp includes cl.h, do not re-include.
-#define CL_HPP_MINIMUM_OPENCL_VERSION 120
-#define CL_HPP_TARGET_OPENCL_VERSION 120
-#include <unordered_map>
-#include <CL/cl2.hpp>
#include "singa/utils/opencl_utils.h"
#endif // USE_OPENCL
@@ -217,50 +211,26 @@ public:
OpenclDevice(int id = 0, int num_executors = 1);
~OpenclDevice();
- /// Get the specified kernel.
- cl::Kernel GetKernel(const std::string& kname, cl_int* status = nullptr);
-
- /// Get the command queue associated with this device.
- cl::CommandQueue GetCmdQ() { return cmdq; }
-
- /// Prints information about all Devices in each Platform.
- void PrintAllDeviceInfo();
-
- /// Prints status about CL source code builds.
- void PrintClBuildInfo(cl::Program &p);
-
// Overridden, inherited methods
void SetRandSeed(unsigned seed) override;
void CopyDataToFrom(Block* dst, Block* src, size_t nBytes,
CopyDirection direction, int dst_offset = 0,
int src_offset = 0);
-/*
- void CopyDataFromHostPtr(Block* dst, const void* src, size_t nBytes = 0,
- size_t dst_offset = 0) override;*/
protected:
/// The OpenCL device that this object represents.
/// Each OpenclDevice contains exactly one cl::Device for the lifetime of the
/// object.
- cl::Device this_device;
+ viennacl::ocl::device this_device;
/// Each OpenclDevice has one OpenCL context. It is created along with the
/// creation of this object.
- cl::Context ocl_ctx;
-
- /// The CommandQueue that is associated with this device.
- /// Since each OpenclDevice contains only one cl::Device and one cl::Context,
- /// it naturally also contains one cl::CommandQueue that is associated
- /// with said Device and Context.
- cl::CommandQueue cmdq;
-
- /// A list of kernels that has been compiled on this device.
- std::shared_ptr<std::unordered_map<std::string, cl::Kernel>> kernels;
+ viennacl::ocl::context vcl_ctx;
/// Searches the given paths for all .cl files and builds
/// OpenCL programs, then stores them in the Kernels map.
- void BuildPrograms(const std::string &kdir = cl_src_path);
+ void BuildPrograms(const std::string &kdir);
// Overridden, inherited methods.
@@ -280,21 +250,6 @@ protected:
private:
- /// Copies a data block from host to device.
- /// src: a pointer to an array of data.
- /// dst: a pointer to a cl::Buffer object.
- void WriteToDevice(cl::Buffer* dst, const void* src, const size_t size);
-
- /// Reads a data block from device to host.
- /// src: a pointer to an cl::Buffer object.
- /// dst: a pointer to an malloc'ed empty array.
- void ReadFromDevice(void* dst, const cl::Buffer* src, const size_t size);
-
- /// Duplicates a block of data on the device.
- /// src: a pointer to the original cl::Buffer object.
- /// dst: a pointer to the new cl::Buffer object to copy the data into.
- void CopyDeviceBuffer(cl::Buffer* dst, const cl::Buffer* src, const size_t size);
-
static const std::string cl_src_path;
};
#endif // USE_OPENCL
@@ -368,11 +323,6 @@ public:
/// except the context initialization.
static bool CheckDevice(const int device_id);
-
-private:
-#ifdef USE_OPENCL
- cl::Platform clPlatform;
-#endif // USE_OPENCL
};
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/include/singa/utils/opencl_utils.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/opencl_utils.h b/include/singa/utils/opencl_utils.h
index 664a9e1..8c05643 100644
--- a/include/singa/utils/opencl_utils.h
+++ b/include/singa/utils/opencl_utils.h
@@ -24,121 +24,47 @@
#ifdef USE_OPENCL
-#include <iostream>
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
-// http://github.khronos.org/OpenCL-CLHPP/
-// cl2.hpp includes cl.h, do not re-include.
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#define CL_HPP_TARGET_OPENCL_VERSION 120
-#include <CL/cl2.hpp>
-#define CL_BREAK_ON_FAILURE if (status != CL_SUCCESS) return;
-
-
-inline const char* clGetBuildInfoString(const cl_build_status status) {
- switch (status) {
- case CL_BUILD_NONE: return "CL_BUILD_NONE";
- case CL_BUILD_ERROR: return "CL_BUILD_ERROR";
- case CL_BUILD_SUCCESS: return "CL_BUILD_SUCCESS";
- case CL_BUILD_IN_PROGRESS: return "CL_BUILD_IN_PROGRESS";
- default: return "";
- }
-}
-
-
-inline const char* clGetErrorString(const cl_int status) {
-
- switch(status) {
-
- // Run-time and JIT compiler errors
- case 0: return "CL_SUCCESS";
- case -1: return "CL_DEVICE_NOT_FOUND";
- case -2: return "CL_DEVICE_NOT_AVAILABLE";
- case -3: return "CL_COMPILER_NOT_AVAILABLE";
- case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
- case -5: return "CL_OUT_OF_RESOURCES";
- case -6: return "CL_OUT_OF_HOST_MEMORY";
- case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE";
- case -8: return "CL_MEM_COPY_OVERLAP";
- case -9: return "CL_IMAGE_FORMAT_MISMATCH";
- case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
- case -11: return "CL_BUILD_PROGRAM_FAILURE";
- case -12: return "CL_MAP_FAILURE";
- case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
- case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
- case -15: return "CL_COMPILE_PROGRAM_FAILURE";
- case -16: return "CL_LINKER_NOT_AVAILABLE";
- case -17: return "CL_LINK_PROGRAM_FAILURE";
- case -18: return "CL_DEVICE_PARTITION_FAILED";
- case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";
-
- // Compile-time errors
- case -30: return "CL_INVALID_VALUE";
- case -31: return "CL_INVALID_DEVICE_TYPE";
- case -32: return "CL_INVALID_PLATFORM";
- case -33: return "CL_INVALID_DEVICE";
- case -34: return "CL_INVALID_CONTEXT";
- case -35: return "CL_INVALID_QUEUE_PROPERTIES";
- case -36: return "CL_INVALID_COMMAND_QUEUE";
- case -37: return "CL_INVALID_HOST_PTR";
- case -38: return "CL_INVALID_MEM_OBJECT";
- case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
- case -40: return "CL_INVALID_IMAGE_SIZE";
- case -41: return "CL_INVALID_SAMPLER";
- case -42: return "CL_INVALID_BINARY";
- case -43: return "CL_INVALID_BUILD_OPTIONS";
- case -44: return "CL_INVALID_PROGRAM";
- case -45: return "CL_INVALID_PROGRAM_EXECUTABLE";
- case -46: return "CL_INVALID_KERNEL_NAME";
- case -47: return "CL_INVALID_KERNEL_DEFINITION";
- case -48: return "CL_INVALID_KERNEL";
- case -49: return "CL_INVALID_ARG_INDEX";
- case -50: return "CL_INVALID_ARG_VALUE";
- case -51: return "CL_INVALID_ARG_SIZE";
- case -52: return "CL_INVALID_KERNEL_ARGS";
- case -53: return "CL_INVALID_WORK_DIMENSION";
- case -54: return "CL_INVALID_WORK_GROUP_SIZE";
- case -55: return "CL_INVALID_WORK_ITEM_SIZE";
- case -56: return "CL_INVALID_GLOBAL_OFFSET";
- case -57: return "CL_INVALID_EVENT_WAIT_LIST";
- case -58: return "CL_INVALID_EVENT";
- case -59: return "CL_INVALID_OPERATION";
- case -60: return "CL_INVALID_GL_OBJECT";
- case -61: return "CL_INVALID_BUFFER_SIZE";
- case -62: return "CL_INVALID_MIP_LEVEL";
- case -63: return "CL_INVALID_GLOBAL_WORK_SIZE";
- case -64: return "CL_INVALID_PROPERTY";
- case -65: return "CL_INVALID_IMAGE_DESCRIPTOR";
- case -66: return "CL_INVALID_COMPILER_OPTIONS";
- case -67: return "CL_INVALID_LINKER_OPTIONS";
- case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT";
-
- // Extension errors
- case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR";
- case -1001: return "CL_PLATFORM_NOT_FOUND_KHR";
- case -1002: return "CL_INVALID_D3D10_DEVICE_KHR";
- case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR";
- case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR";
- case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR";
-
- default: return "Unknown OpenCL status";
+#ifndef VIENNACL_WITH_OPENCL
+ #define VIENNACL_WITH_OPENCL
+#endif
+
+#ifndef __APPLE__
+ #include "CL/cl.h"
+#else
+ #include "OpenCL/cl.h"
+#endif
+
+#include <viennacl/backend/opencl.hpp>
+
+#include <viennacl/ocl/device.hpp>
+#include <viennacl/ocl/platform.hpp>
+#include <viennacl/ocl/backend.hpp>
+#include <viennacl/ocl/device_utils.hpp>
+#include <viennacl/ocl/utils.hpp>
+#include <viennacl/ocl/program.hpp>
+#include <viennacl/ocl/kernel.hpp>
+
+
+inline viennacl::ocl::handle<cl_mem>
+WrapHandle(cl_mem in, viennacl::ocl::context *ctx) {
+ if (in != nullptr) {
+ viennacl::ocl::handle<cl_mem> memhandle(in, *ctx);
+ memhandle.inc();
+ return memhandle;
+ } else {
+ cl_int err;
+ cl_mem dummy = clCreateBuffer(ctx->handle().get(), CL_MEM_READ_WRITE, 0,
+ nullptr, &err);
+ viennacl::ocl::handle<cl_mem> memhandle(dummy, *ctx);
+ return memhandle;
}
}
-
-/// Special function used to perform error checking and logging.
-inline bool OCL_CHECK(const cl_int status, const char* what) {
- if (status == CL_SUCCESS) return true; // Nothing wrong.
- LOG(ERROR) << status << ": " << clGetErrorString(status) << " " << what << std::endl;
- return false;
-}
-
-/// Prints information about the specified Platform.
-void PrintPlatformInfo(const cl::Platform &p);
-
-/// Prints information about the specified Device.
-void PrintDeviceInfo(const cl::Device &dev);
-
#endif // USE_OPENCL
#endif // SINGA_UTILS_OPENCL_UTILS_H_
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/src/core/device/opencl_device.cc
----------------------------------------------------------------------
diff --git a/src/core/device/opencl_device.cc b/src/core/device/opencl_device.cc
index b941cd2..6b371c4 100644
--- a/src/core/device/opencl_device.cc
+++ b/src/core/device/opencl_device.cc
@@ -23,89 +23,41 @@
#include "singa/core/device.h"
#include "singa/utils/tinydir.h"
+#include "singa/utils/opencl_utils.h"
#ifdef USE_OPENCL
-using std::string;
+using namespace viennacl;
+using namespace viennacl::backend::opencl;
namespace singa {
-const string OpenclDevice::cl_src_path = "../src/core/tensor";
+const std::string OpenclDevice::cl_src_path = "../src/core/tensor";
OpenclDevice::OpenclDevice(int id, int num_executors)
: Device(id, num_executors) {
+ CHECK_GE(id, 0);
lang_ = kOpencl;
- this->kernels = std::make_shared<std::unordered_map<string, cl::Kernel>>();
-
- // Create the OpenCL Device, Context, and CommandQueue.
- /// TODO: This merely chooses the first device on the first platform.
- cl_int status = CL_SUCCESS;
-
- std::vector<cl::Platform> platforms;
- status = cl::Platform::get(&platforms);
- OCL_CHECK(status, "Failed to find any OpenCL platforms!");
-
- std::vector<cl::Device> devices;
- status = platforms[0].getDevices(CL_DEVICE_TYPE_ALL, &devices);
- OCL_CHECK(status, "Failed to get list of devices from platform!");
-
- this->this_device = cl::Device(devices[0]);
- this->ocl_ctx = cl::Context(this_device, nullptr, nullptr, nullptr, &status);
- OCL_CHECK(status, "Failed to create context!");
-
- this->cmdq = cl::CommandQueue(ocl_ctx, this_device, CL_QUEUE_PROFILING_ENABLE, &status);
- OCL_CHECK(status, "Failed to create a command queue!");
-
- BuildPrograms();
-
- ctx_.kernels = kernels;
- ctx_.ocl_cmdq = cmdq;
- ctx_.ocl_ctx = ocl_ctx;
+
+ ocl::current_context().build_options("-cl-std=CL1.2");
+
+ ctx_.vcl_ctx_id = 0;
+ this->this_device = ocl::current_device();
+
+ BuildPrograms(cl_src_path);
}
OpenclDevice::~OpenclDevice() {
// Flush and finish the command queue.
+ auto cmdq = ocl::current_context().get_queue();
+
cmdq.flush();
cmdq.finish();
}
-cl::Kernel OpenclDevice::GetKernel(const std::string& kname, cl_int* status) {
- if (!status) *status = CL_SUCCESS;
- if (kernels->find(kname) == kernels->end()) {
- // TODO: Not found
- LOG(ERROR) << "Error: Kernel " << kname << " could not be found!";
- if (!status) *status = CL_INVALID_KERNEL;
- }
- return kernels->at(kname);
-}
-
-/*
-void OpenclDevice::PrintAllDeviceInfo() {
- cl_int status = CL_SUCCESS;
-
- for (auto dev : devices) {
- PrintDeviceInfo(d);
- }
-}
-*/
-
-
-void OpenclDevice::PrintClBuildInfo(cl::Program &p) {
- cl_int status = CL_SUCCESS;
-
- auto buildStatus = p.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(&status);
- for (auto pair : buildStatus)
- std::cout << clGetBuildInfoString(pair.second) << std::endl;
-
- auto buildLog = p.getBuildInfo<CL_PROGRAM_BUILD_LOG>(&status);
- for (auto pair : buildLog)
- std::cout << pair.second << std::endl;
-}
-
-
void OpenclDevice::SetRandSeed(unsigned seed) { seed = seed; }
@@ -113,19 +65,33 @@ void OpenclDevice::CopyDataToFrom(Block* dst, Block* src, size_t nBytes,
CopyDirection direction, int dst_offset, int src_offset) {
// Pointers must be valid.
if (!dst || !src) return;
+
+ auto ocl_ctx = viennacl::ocl::get_context(ctx_.vcl_ctx_id);
- CopyToFrom(dst->mutable_data(), src->data(), nBytes, direction);
+ switch(direction) {
+ case kHostToDevice: {
+ auto dst_handle = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), &ocl_ctx);
+ memory_write(dst_handle, dst_offset, nBytes, src->data());
+ return;
+ }
+ case kDeviceToHost: {
+ auto src_handle = WrapHandle(static_cast<cl_mem>(src->mutable_data()), &ocl_ctx);
+ memory_read(src_handle, src_offset, nBytes, dst->mutable_data());
+ return;
+ }
+ case kDeviceToDevice: {
+ auto src_handle = WrapHandle(static_cast<cl_mem>(src->mutable_data()), &ocl_ctx);
+ auto dst_handle = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), &ocl_ctx);
+ memory_copy(src_handle, dst_handle, src_offset, dst_offset, nBytes);
+ return;
+ }
+ default:
+ return;
+ }
}
-/*
-void OpenclDevice::CopyDataFromHostPtr(Block* dst, const void* src, size_t nBytes, size_t dst_offset) {
- CopyToFrom(dst->mutable_data(), src, 4, kHostToDevice);
-}
-*/
void OpenclDevice::BuildPrograms(const std::string &kdir) {
- cl_int status = CL_SUCCESS;
-
tinydir_dir dir;
tinydir_open(&dir, kdir.c_str());
@@ -137,63 +103,47 @@ void OpenclDevice::BuildPrograms(const std::string &kdir) {
tinydir_next(&dir);
continue;
}
-
+
std::ifstream clFile(file.path, std::ios_base::binary);
std::stringstream buffer;
buffer << clFile.rdbuf();
std::string clSrc(buffer.str());
- cl::Program program(this->ocl_ctx, clSrc, false, &status);
- OCL_CHECK(status, "Program creation failed.");
- status = program.build({this_device}, "-cl-std=CL1.2");
- if (status == CL_SUCCESS) {
- std::vector<cl::Kernel> built_kernels;
- status = program.createKernels(&built_kernels);
- OCL_CHECK(status, "Failed to create kernels in built program.");
-
- for (auto k : built_kernels) {
- std::string name = k.getInfo<CL_KERNEL_FUNCTION_NAME>(&status);
- this->kernels->insert(std::make_pair(name, k));
- }
- } else {
- OCL_CHECK(status, "Build failed on source path");
- LOG(ERROR) << file.path << std::endl;
- PrintClBuildInfo(program);
- }
+ std::string name(file.name);
+ ocl::current_context().add_program(clSrc, name);
tinydir_next(&dir);
}
}
-// Device IO functions.
-// TODO:
-// Research - MapBuffers can improve performance when the device uses shared memory
-// but is more complex to understand. http://stackoverflow.com/questions/22057692/whats-the-difference-between-clenqueuemapbuffer-and-clenqueuewritebuffer
-// Intel graphics (and possibly AMD APUs) should use MapBuffers?
-// https://software.intel.com/en-us/articles/getting-the-most-from-opencl-12-how-to-increase-performance-by-minimizing-buffer-copies-on-intel-processor-graphics
-
void OpenclDevice::DoExec(function<void(Context*)>&& fn, int executor) {
fn(&ctx_);
}
-// NOTE: ASSUMES dst AND/OR src POINTERS CAN BE CAST TO cl::Buffer POINTERS!
+
void OpenclDevice::CopyToFrom(void* dst, const void* src, size_t nBytes,
CopyDirection direction, Context* ctx) {
// Pointers must be valid.
if (!dst || !src) return;
+
+ auto ocl_ctx = viennacl::ocl::get_context(ctx->vcl_ctx_id);
switch(direction) {
case kHostToDevice: {
- WriteToDevice(static_cast<cl::Buffer*>(dst), src, nBytes);
+ auto dst_handle = WrapHandle(static_cast<cl_mem>(dst), &ocl_ctx);
+ memory_write(dst_handle, 0, nBytes, src);
return;
}
case kDeviceToHost: {
- ReadFromDevice(dst, static_cast<const cl::Buffer*>(src), nBytes);
+ auto src_handle = WrapHandle((const cl_mem)src, &ocl_ctx);
+ memory_read(src_handle, 0, nBytes, dst);
return;
}
case kDeviceToDevice: {
- CopyDeviceBuffer(static_cast<cl::Buffer*>(dst), static_cast<const cl::Buffer*>(src), nBytes);
+ auto src_handle = WrapHandle((const cl_mem)src, &ocl_ctx);
+ auto dst_handle = WrapHandle(static_cast<cl_mem>(dst), &ocl_ctx);
+ memory_copy(src_handle, dst_handle, 0, 0, nBytes);
return;
}
default:
@@ -203,10 +153,7 @@ void OpenclDevice::CopyToFrom(void* dst, const void* src, size_t nBytes,
void* OpenclDevice::Malloc(int size) {
- cl_int status = CL_SUCCESS;
-
- cl::Buffer* buffer = new cl::Buffer(ocl_ctx, CL_MEM_READ_WRITE, size, nullptr, &status);
- OCL_CHECK(status, "Unable to allocate memory in OpenCL device.");
+ cl_mem buffer = memory_create(ocl::current_context(), size, nullptr);
return static_cast<void*>(buffer);
}
@@ -214,33 +161,8 @@ void* OpenclDevice::Malloc(int size) {
void OpenclDevice::Free(void* p) {
if (!p) return;
- cl::Buffer* buffer = static_cast<cl::Buffer*>(p);
- delete buffer;
-}
-
-
-void OpenclDevice::WriteToDevice(cl::Buffer* dst, const void* src, const size_t size) {
- cl_int status = CL_SUCCESS;
-
- status = cmdq.enqueueWriteBuffer(*dst, CL_TRUE, 0, size, src);
- OCL_CHECK(status, "Unable to write data to OpenCL device.");
-}
-
-
-void OpenclDevice::ReadFromDevice(void* dst, const cl::Buffer* src, const size_t size) {
- cl_int status = CL_SUCCESS;
-
- status = cmdq.enqueueReadBuffer(*src, CL_TRUE, 0, size, dst);
- OCL_CHECK(status, "Unable to read data from OpenCL device.");
-}
-
-
-// dst: cl::Buffer pointer src: cl::Buffer pointer
-void OpenclDevice::CopyDeviceBuffer(cl::Buffer* dst, const cl::Buffer* src, const size_t size) {
- cl_int status = CL_SUCCESS;
-
- status = cmdq.enqueueCopyBuffer(*src, *dst, 0, 0, size);
- OCL_CHECK(status, "Unable to copy buffer in OpenCL device.");
+ cl_mem buffer = static_cast<cl_mem>(p);
+ clReleaseMemObject(buffer);
}
} // namespace singa
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/src/core/tensor/tensor_math_opencl.cl
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_opencl.cl b/src/core/tensor/tensor_math_opencl.cl
index f9cf96e..7b89970 100644
--- a/src/core/tensor/tensor_math_opencl.cl
+++ b/src/core/tensor/tensor_math_opencl.cl
@@ -24,7 +24,7 @@
// This reduction code is serial reduction modified from AMD's example.
// http://developer.amd.com/resources/documentation-articles/articles-whitepapers/opencl-optimization-case-study-simple-reductions/
__kernel
-void clkernel_abs(const int num, __global const float* in, __global float* out) {
+void clkernel_fabs(const int num, __global const float* in, __global float* out) {
const int i = get_global_id(0);
if (i >= num) return;
out[i] = fabs(in[i]);
@@ -462,7 +462,7 @@ void clkernel_crossentropy(const uint batchsize, const uint dim,
int truth_idx = t[gidx];
if (truth_idx <= 0) return;
- float prob_of_truth = p[gidx + truth_idx];
+ float prob_of_truth = p[gidx * dim + truth_idx];
loss[gidx] = -log(fmax(prob_of_truth, -FLT_MIN));
}
@@ -480,6 +480,21 @@ void clkernel_softmaxentropy(const uint batchsize, const uint dim,
}
+__kernel
+void clkernel_rowmax(const uint nrow, const uint ncol,
+ __global const float* in, __global float* out) {
+ const uint row_id = get_global_id(0);
+ if (row_id >= nrow) return;
+
+ float row_max_val = -FLT_MAX;
+ for (uint i = 0; i < ncol; i++) {
+ row_max_val = fmax(row_max_val, in[row_id * ncol + i]);
+ }
+
+ out[row_id] = row_max_val;
+}
+
+
// **************************************
// Matrix functions
// **************************************
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/src/core/tensor/tensor_math_opencl.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_opencl.h b/src/core/tensor/tensor_math_opencl.h
index c289a56..c387031 100644
--- a/src/core/tensor/tensor_math_opencl.h
+++ b/src/core/tensor/tensor_math_opencl.h
@@ -19,17 +19,27 @@
#ifndef SINGA_CORE_TENSOR_TENSOR_MATH_OPENCL_H_
#ifdef USE_OPENCL
-#include <limits>
-#include "singa/utils/opencl_utils.h"
#include "tensor_math.h"
+#include "singa/utils/opencl_utils.h"
-namespace singa {
+#include <viennacl/scalar.hpp>
+#include <viennacl/vector.hpp>
+#include <viennacl/matrix.hpp>
+
+#include <viennacl/linalg/inner_prod.hpp>
+#include <viennacl/linalg/norm_2.hpp>
+#include <viennacl/linalg/sum.hpp>
+#include <viennacl/linalg/scalar_operations.hpp>
+#include <viennacl/linalg/vector_operations.hpp>
+#include <viennacl/linalg/matrix_operations.hpp>
-// Some forward declarations of utility functions that only exist here.
-void Transpose(const size_t nrow, const size_t ncol, cl::Buffer& in, cl::Buffer& out, Context* ctx);
-void DiagVec_Left(const size_t size, cl::Buffer& in, cl::Buffer& out, Context* ctx);
-void DiagVec_Right(const size_t size, cl::Buffer& in, cl::Buffer& out, Context* ctx);
+#include <viennacl/ocl/kernel.hpp>
+
+using viennacl::ocl::get_context;
+using viennacl::ocl::enqueue;
+
+namespace singa {
// **************************************
// Element-wise functions
@@ -37,436 +47,250 @@ void DiagVec_Right(const size_t size, cl::Buffer& in, cl::Buffer& out, Context*
template<>
void Abs<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_abs";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
- kernel.setArg(0, (cl_int)num);
- kernel.setArg(1, inbuf);
- kernel.setArg(2, outbuf);
-
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+ auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_fabs");
+
+ viennacl::vector<float> v_in((const cl_mem)in->data(), num);
+ viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
+
+ v_out = v_in;
+ enqueue(kernel((cl_int)num, v_in, v_out));
}
template<>
void Add<float, lang::Opencl>(const size_t num, const Block* in, const float x, Block* out, Context* ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_add_scalar";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
- kernel.setArg(0, (cl_int)num);
- kernel.setArg(1, x);
- kernel.setArg(2, inbuf);
- kernel.setArg(3, outbuf);
-
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+
+ viennacl::vector<float> x_in = viennacl::scalar_vector<float>(num, x, ocl_ctx);
+ viennacl::vector<float> v_in((const cl_mem)in->data(), num);
+ viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
+
+ v_out = v_in + x_in;
}
template<>
void Add<float, lang::Opencl>(const size_t num, const Block* in1, const Block* in2, Block* out, Context* ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_add";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data()));
- cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data()));
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
- kernel.setArg(0, (cl_int)num);
- kernel.setArg(1, in1buf);
- kernel.setArg(2, in2buf);
- kernel.setArg(3, outbuf);
+ viennacl::vector<float> v_in1((const cl_mem)in1->data(), num);
+ viennacl::vector<float> v_in2((const cl_mem)in2->data(), num);
+ viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ v_out = v_in1 + v_in2;
}
template<>
-void Clamp<float, lang::Opencl>(const size_t num, const float low, const float high, const Block* in, Block* out, Context* ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_clamp";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
- kernel.setArg(0, (cl_int)num);
- kernel.setArg(1, low);
- kernel.setArg(2, high);
- kernel.setArg(3, inbuf);
- kernel.setArg(4, outbuf);
+void Clamp<float, lang::Opencl>(const size_t num, const float low, const float high,
+ const Block* in, Block* out, Context* ctx) {
+ auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+ auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_clamp");
+
+ viennacl::vector<float> v_in((const cl_mem)in->data(), num);
+ viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ enqueue(kernel((cl_int)num, low, high, v_in, v_out));
}
template<>
void Div<float, lang::Opencl>(const size_t num, const Block* in, const float x, Block* out, Context* ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_divide_scalar_matx";
- auto kernel = ctx->kernels->at(kname);
+ auto ocl_ctx = get_context(ctx->vcl_ctx_id);
- cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+ viennacl::vector<float> x_in = viennacl::scalar_vector<float>(num, x, ocl_ctx);
+ viennacl::vector<float> v_in((const cl_mem)in->data(), num);
+ viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
- kernel.setArg(0, (cl_int)num);
- kernel.setArg(1, inbuf);
- kernel.setArg(2, x);
- kernel.setArg(3, outbuf);
-
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ v_out = viennacl::linalg::element_div(v_in, x_in);
}
template<>
void Div<float, lang::Opencl>(const size_t num, const float x, const Block* in, Block* out, Context* ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_divide_scalar_xmat";
- auto kernel = ctx->kernels->at(kname);
+ auto ocl_ctx = get_context(ctx->vcl_ctx_id);
- cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
- kernel.setArg(0, (cl_int)num);
- kernel.setArg(1, x);
- kernel.setArg(2, inbuf);
- kernel.setArg(3, outbuf);
+ viennacl::vector<float> x_in = viennacl::scalar_vector<float>(num, x, ocl_ctx);
+ viennacl::vector<float> v_in((const cl_mem)in->data(), num);
+ viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ v_out = viennacl::linalg::element_div(x_in, v_in);
}
template<>
void Div<float, lang::Opencl>(const size_t num, const Block* in1, const Block* in2, Block* out, Context* ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_divide";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data()));
- cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data()));
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
- kernel.setArg(0, (cl_int)num);
- kernel.setArg(1, in1buf);
- kernel.setArg(2, in2buf);
- kernel.setArg(3, outbuf);
+ viennacl::vector<float> v_in1((const cl_mem)in1->data(), num);
+ viennacl::vector<float> v_in2((const cl_mem)in2->data(), num);
+ viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ v_out = viennacl::linalg::element_div(v_in1, v_in2);
}
template<>
void EltwiseMult<float, lang::Opencl>(const size_t num, const Block* in, const float x, Block* out, Context* ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_eltmult_scalar";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
- kernel.setArg(0, (cl_int)num);
- kernel.setArg(1, x);
- kernel.setArg(2, inbuf);
- kernel.setArg(3, outbuf);
+ auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+
+ viennacl::vector<float> x_in = viennacl::scalar_vector<float>(num, x, ocl_ctx);
+ viennacl::vector<float> v_in((const cl_mem)in->data(), num);
+ viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ v_out = viennacl::linalg::element_prod(v_in, x_in);
}
template<>
void EltwiseMult<float, lang::Opencl>(const size_t num, const Block* in1, const Block* in2, Block* out, Context* ctx) {
- cl_int status = CL_SUCCESS;
+ viennacl::vector<float> v_in1((const cl_mem)in1->data(), num);
+ viennacl::vector<float> v_in2((const cl_mem)in2->data(), num);
+ viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
- std::string kname = "clkernel_eltmult";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data()));
- cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data()));
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
- kernel.setArg(0, (cl_int)num);
- kernel.setArg(1, in1buf);
- kernel.setArg(2, in2buf);
- kernel.setArg(3, outbuf);
-
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ v_out = viennacl::linalg::element_prod(v_in1, v_in2);
}
template<>
void Exp<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_exp";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
- kernel.setArg(0, (cl_int)num);
- kernel.setArg(1, inbuf);
- kernel.setArg(2, outbuf);
-
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ viennacl::vector<float> v_in((const cl_mem)in->data(), num);
+ viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
+
+ v_out = viennacl::linalg::element_exp(v_in);
}
template<>
void LE<float, lang::Opencl>(const size_t num, const Block *in, const float x, Block *out, Context *ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_le";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
- kernel.setArg(0, (cl_int)num);
- kernel.setArg(1, inbuf);
- kernel.setArg(2, x);
- kernel.setArg(3, outbuf);
-
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+ auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_le");
+
+ viennacl::vector<float> in_buf((const cl_mem)in->data(), num);
+ viennacl::vector<float> out_buf(static_cast<cl_mem>(out->mutable_data()), num);
+
+ enqueue(kernel((cl_int)num, in_buf, x, out_buf));
}
template<>
void Log<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_log";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
- kernel.setArg(0, (cl_int)num);
- kernel.setArg(1, inbuf);
- kernel.setArg(2, outbuf);
-
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ viennacl::vector<float> v_in((const cl_mem)in->data(), num);
+ viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
+
+ v_out = viennacl::linalg::element_log(v_in);
}
template<>
void LT<float, lang::Opencl>(const size_t num, const Block *in, const float x, Block *out, Context *ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_lt";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
- kernel.setArg(0, (cl_int)num);
- kernel.setArg(1, inbuf);
- kernel.setArg(2, x);
- kernel.setArg(3, outbuf);
-
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+ auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_lt");
+
+ viennacl::vector<float> in_buf((const cl_mem)in->data(), num);
+ viennacl::vector<float> out_buf(static_cast<cl_mem>(out->mutable_data()), num);
+
+ enqueue(kernel((cl_int)num, in_buf, x, out_buf));
}
template<>
void GE<float, lang::Opencl>(const size_t num, const Block *in, const float x, Block *out, Context *ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_ge";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
- kernel.setArg(0, (cl_int)num);
- kernel.setArg(1, inbuf);
- kernel.setArg(2, x);
- kernel.setArg(3, outbuf);
-
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+ auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_ge");
+
+ viennacl::vector<float> in_buf((const cl_mem)in->data(), num);
+ viennacl::vector<float> out_buf(static_cast<cl_mem>(out->mutable_data()), num);
+
+ enqueue(kernel((cl_int)num, in_buf, x, out_buf));
}
template<>
void GT<float, lang::Opencl>(const size_t num, const Block *in, const float x, Block *out, Context *ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_gt";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
- kernel.setArg(0, (cl_int)num);
- kernel.setArg(1, inbuf);
- kernel.setArg(2, x);
- kernel.setArg(3, outbuf);
-
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+ auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_gt");
+
+ viennacl::vector<float> in_buf((const cl_mem)in->data(), num);
+ viennacl::vector<float> out_buf(static_cast<cl_mem>(out->mutable_data()), num);
+
+ enqueue(kernel((cl_int)num, in_buf, x, out_buf));
}
template<>
void Pow<float, lang::Opencl>(const size_t num, const Block* in, float x, Block* out, Context* ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_pow_scalar";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
- kernel.setArg(0, (cl_int)num);
- kernel.setArg(1, x);
- kernel.setArg(2, inbuf);
- kernel.setArg(3, outbuf);
+ auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+
+ viennacl::vector<float> x_in = viennacl::scalar_vector<float>(num, x, ocl_ctx);
+ viennacl::vector<float> v_in((const cl_mem)in->data(), num);
+ viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ v_out = viennacl::linalg::element_pow(v_in, x_in);
}
template<>
void Pow<float, lang::Opencl>(const size_t num, const Block* in1, const Block* in2, Block* out, Context* ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_pow";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data()));
- cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data()));
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
- kernel.setArg(0, (cl_int)num);
- kernel.setArg(1, in1buf);
- kernel.setArg(2, in2buf);
- kernel.setArg(3, outbuf);
+ viennacl::vector<float> v_in1((const cl_mem)in1->data(), num);
+ viennacl::vector<float> v_in2((const cl_mem)in2->data(), num);
+ viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ v_out = viennacl::linalg::element_pow(v_in1, v_in2);
}
template<>
void ReLU<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_relu";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
- kernel.setArg(0, (cl_int)num);
- kernel.setArg(1, inbuf);
- kernel.setArg(2, outbuf);
-
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+ auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_relu");
+
+ viennacl::vector<float> in_buf((const cl_mem)in->data(), num);
+ viennacl::vector<float> out_buf(static_cast<cl_mem>(out->mutable_data()), num);
+
+ enqueue(kernel((cl_int)num, in_buf, out_buf));
}
+
template<>
void Set<float, lang::Opencl>(const size_t num, const float x, Block* out, Context* ctx) {
- cl_int status = CL_SUCCESS;
+ auto ocl_ctx = get_context(ctx->vcl_ctx_id);
- std::string kname = "clkernel_set";
- auto kernel = ctx->kernels->at(kname);
+ viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
- kernel.setArg(0, (cl_int)num);
- kernel.setArg(1, x);
- kernel.setArg(2, outbuf);
-
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ v_out = viennacl::scalar_vector<float>(num, x, ocl_ctx);
}
template<>
void Sigmoid<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_sigmoid";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
- kernel.setArg(0, (cl_int)num);
- kernel.setArg(1, inbuf);
- kernel.setArg(2, outbuf);
-
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+
+ const viennacl::vector<float> zero = viennacl::zero_vector<float>(num, ocl_ctx);
+ const viennacl::vector<float> one = viennacl::scalar_vector<float>(num, 1.0f, ocl_ctx);
+
+ viennacl::vector<float> v_in((const cl_mem)in->data(), num);
+ viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
+
+ v_out = viennacl::linalg::element_div(one, viennacl::linalg::element_exp(zero - v_in) + one);
}
template<>
void Sign<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_sign";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
- kernel.setArg(0, (cl_int)num);
- kernel.setArg(1, inbuf);
- kernel.setArg(2, outbuf);
-
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+ auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_abs");
+
+ viennacl::vector<float> in_buf((const cl_mem)in->data(), num);
+ viennacl::vector<float> out_buf(static_cast<cl_mem>(out->mutable_data()), num);
+
+ enqueue(kernel(num, in_buf, out_buf));
}
template<>
void Sqrt<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_sqrt";
- auto kernel = ctx->kernels->at(kname);
+ viennacl::vector<float> v_in((const cl_mem)in->data(), num);
+ viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
- cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
- kernel.setArg(0, (cl_int)num);
- kernel.setArg(1, inbuf);
- kernel.setArg(2, outbuf);
-
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ v_out = viennacl::linalg::element_sqrt(v_in);
}
@@ -478,168 +302,85 @@ void Square<float, lang::Opencl>(const size_t num, const Block* in, Block* out,
template<>
void Sub<float, lang::Opencl>(const size_t num, const Block* in, const float x, Block* out, Context* ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_subtract_scalar";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
- kernel.setArg(0, (cl_int)num);
- kernel.setArg(1, inbuf);
- kernel.setArg(2, x);
- kernel.setArg(3, outbuf);
-
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ Add<float, lang::Opencl>(num, in, -x, out, ctx);
}
template<>
void Sub<float, lang::Opencl>(const size_t num, const Block* in1, const Block* in2, Block* out, Context* ctx) {
- cl_int status = CL_SUCCESS;
+ viennacl::vector<float> v_in1((const cl_mem)in1->data(), num);
+ viennacl::vector<float> v_in2((const cl_mem)in2->data(), num);
+ viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
- std::string kname = "clkernel_subtract";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data()));
- cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data()));
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
- kernel.setArg(0, (cl_int)num);
- kernel.setArg(1, in1buf);
- kernel.setArg(2, in2buf);
- kernel.setArg(3, outbuf);
-
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ v_out = v_in1 - v_in2;
}
template<>
void Sum<float, lang::Opencl>(const size_t num, const Block* in, float* out, Context* ctx) {
- cl_int status = CL_SUCCESS;
+ viennacl::vector<float> v_in((const cl_mem)in->data(), num);
- std::string kname = "clkernel_reduce";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
-
- size_t size = sizeof(float) * num;
- cl::Buffer outval(ctx->ocl_ctx, CL_MEM_WRITE_ONLY, size, nullptr, &status);
- OCL_CHECK(status, "Failed to create buffer!");
-
- kernel.setArg(0, (cl_int)num);
- kernel.setArg(1, inbuf);
- kernel.setArg(2, outval);
- kernel.setArg(3, cl::Local(size));
-
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
-
- float* temp = new float[num];
- status = ctx->ocl_cmdq.enqueueReadBuffer(outval, CL_TRUE, 0, size, temp);
- OCL_CHECK(status, "Failed to read from buffer!");
- out[0] = temp[0];
- delete temp;
+ out[0] = viennacl::linalg::sum(v_in);
}
template<>
void Tanh<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_tanh";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+ viennacl::vector<float> v_in((const cl_mem)in->data(), num);
+ viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
- kernel.setArg(0, (cl_int)num);
- kernel.setArg(1, inbuf);
- kernel.setArg(2, outbuf);
-
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ v_out = viennacl::linalg::element_tanh(v_in);
}
// **************************************
// Random functions
// **************************************
-/// Seed value required for generating distributions.
-static unsigned int seed[4] = {0, 32, 42, 888};
/// Number of generation rounds used in the current algorithm.
static cl_uint rounds = 8;
template<>
void Bernoulli<float, lang::Opencl>(const size_t num, const float p, Block* out, Context *ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "PRNG_threefry4x32_bernoulli";
- auto kernel = ctx->kernels->at(kname);
+ auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+ auto kernel = ocl_ctx.get_kernel("distribution.cl", "PRNG_threefry4x32_bernoulli");
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
- kernel.setArg(0, outbuf);
- kernel.setArg(1, seed);
- kernel.setArg(2, 0.0f); // inf
- kernel.setArg(3, 1.0f); // sup
- kernel.setArg(4, p); // threshold
- kernel.setArg(5, rounds);
- kernel.setArg(6, cl_uint(num) / 4);
+ viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num/4));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ viennacl::ocl::packed_cl_uint seed = {0, 32, 42, 888};
+
+ enqueue(kernel(v_out, seed, 0.0f, 1.0f, p, rounds, cl_uint(num / 4)));
}
template<>
void Gaussian<float, lang::Opencl>(const size_t num, const float mean, const float std, Block* out, Context *ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "PRNG_threefry4x32_gaussian";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
- kernel.setArg(0, outbuf);
- kernel.setArg(1, seed);
- kernel.setArg(2, mean); // E
- kernel.setArg(3, std); // V
- kernel.setArg(4, rounds);
- kernel.setArg(5, cl_uint(num) / 4);
-
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num/4));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+ auto kernel = ocl_ctx.get_kernel("distribution.cl", "PRNG_threefry4x32_gaussian");
+
+ viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
+
+ viennacl::ocl::packed_cl_uint seed = {0, 32, 42, 888};
+
+ enqueue(kernel(v_out, seed, mean, std, rounds, cl_uint(num/4)));
}
template<>
void Uniform<float, lang::Opencl>(const size_t num, const float low, const float high, Block* out, Context *ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "PRNG_threefry4x32_uniform";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+ auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+ auto kernel = ocl_ctx.get_kernel("distribution.cl", "PRNG_threefry4x32_uniform");
- status = kernel.setArg(0, outbuf); OCL_CHECK(status, "kernel arg 0");
- status = kernel.setArg(1, seed); OCL_CHECK(status, "kernel arg 1");
- status = kernel.setArg(2, low); OCL_CHECK(status, "kernel arg 2");
- status = kernel.setArg(3, high); OCL_CHECK(status, "kernel arg 3");
- status = kernel.setArg(4, rounds); OCL_CHECK(status, "kernel arg 4");
- status = kernel.setArg(5, cl_uint(num) / 4); OCL_CHECK(status, "kernel arg 5");
-
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num/4));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ viennacl::ocl::packed_cl_uint seed = {0, 32, 42, 888};
+
+ viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
+
+ enqueue(kernel(v_out, seed, low, high, rounds, cl_uint(num/4)));
}
// *********************************************************
// BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
// *********************************************************
-
+/*
template<>
void Amax<float, lang::Opencl>(const size_t num, const Block* in, size_t* out, Context* ctx) {
cl_int status = CL_SUCCESS;
@@ -699,7 +440,7 @@ void Amin<float, lang::Opencl>(const size_t num, const Block* in, size_t* out, C
delete temp;
}
-
+
template<>
void Asum<float, lang::Opencl>(const size_t num, const Block* in, float* out, Context* ctx) {
cl_int status = CL_SUCCESS;
@@ -727,256 +468,141 @@ void Asum<float, lang::Opencl>(const size_t num, const Block* in, float* out, Co
out[0] = temp[0];
delete temp;
}
-
-
+*/
+/// out = alpha * in + out
template<>
void Axpy<float, lang::Opencl>(const size_t num, const float alpha, const Block* in, Block* out, Context* ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_axpy";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
- kernel.setArg(0, (cl_int)num);
- kernel.setArg(1, alpha);
- kernel.setArg(2, inbuf);
- kernel.setArg(3, outbuf);
-
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ viennacl::vector<float> inbuf((const cl_mem)in->data(), num);
+ viennacl::vector<float> outbuf(static_cast<cl_mem>(out->mutable_data()), num);
+
+ outbuf += alpha * inbuf;
}
-
+/// out = ||in||_2^2, i.e, L2 norm.
template<>
void Nrm2<float, lang::Opencl>(const size_t num, const Block* in, float* out, Context* ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_nrm2";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
-
- size_t size = sizeof(float) * num;
- cl::Buffer outval(ctx->ocl_ctx, CL_MEM_WRITE_ONLY, size, nullptr, &status);
- OCL_CHECK(status, "Failed to create buffer!");
-
- kernel.setArg(0, (cl_int)num);
- kernel.setArg(1, inbuf);
- kernel.setArg(2, outval);
- kernel.setArg(3, cl::Local(sizeof(float) * (std::pow(2, num))));
-
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
-
- float* temp = new float[num];
- status = ctx->ocl_cmdq.enqueueReadBuffer(outval, CL_TRUE, 0, size, temp);
- OCL_CHECK(status, "Failed to read from buffer!");
- out[0] = temp[0];
- delete temp;
+ viennacl::vector<float> inbuf((const cl_mem)in->data(), num);
+
+ out[0] = viennacl::linalg::norm_2(inbuf);
}
template<>
void Scale<float, lang::Opencl>(const size_t num, const float x, Block* out, Context* ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_scale";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
- kernel.setArg(0, (cl_int)num);
- kernel.setArg(1, x);
- kernel.setArg(2, outbuf);
+ auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+
+ viennacl::vector<float> x_in = viennacl::scalar_vector<float>(num, x, ocl_ctx);
+ viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ v_out = viennacl::linalg::element_prod(v_out, x_in);
}
template<>
void Dot<float, lang::Opencl>(const size_t num, const Block *in1, const Block *in2, float *out, Context *ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_dot";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data()));
- cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data()));
-
- size_t size = sizeof(float) * num;
- cl::Buffer outval(ctx->ocl_ctx, CL_MEM_WRITE_ONLY, size, nullptr, &status);
- OCL_CHECK(status, "Failed to create buffer!");
-
- kernel.setArg(0, (cl_int)num);
- kernel.setArg(1, in1buf);
- kernel.setArg(2, in2buf);
- kernel.setArg(3, outval);
- kernel.setArg(4, cl::Local(size));
-
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
-
- float* temp = new float[num];
- status = ctx->ocl_cmdq.enqueueReadBuffer(outval, CL_TRUE, 0, size, temp);
- OCL_CHECK(status, "Failed to read from buffer!");
- out[0] = temp[0];
- delete temp;
+ viennacl::vector<float> in1_buf((const cl_mem)in1->data(), num);
+ viennacl::vector<float> in2_buf((const cl_mem)in2->data(), num);
+
+ out[0] = viennacl::linalg::inner_prod(in1_buf, in2_buf);
}
-
+/// out = alpha * A * v + beta * out.
template<>
void GEMV<float, lang::Opencl>(bool trans, const size_t m, const size_t n, const float alpha,
const Block *A, const Block *v, const float beta, Block* out, Context* ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_gemv";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer Abuf = *(static_cast<cl::Buffer*>(A->mutable_data()));
- cl::Buffer vbuf = *(static_cast<cl::Buffer*>(v->mutable_data()));
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
- kernel.setArg(0, (cl_int)m);
- kernel.setArg(1, (cl_int)n);
- kernel.setArg(2, alpha);
- kernel.setArg(3, Abuf);
- kernel.setArg(4, vbuf);
- kernel.setArg(5, beta);
- kernel.setArg(6, outbuf);
-
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(m, n));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+
+ viennacl::matrix<float> A_in((const cl_mem)A->data(), m, n);
+ viennacl::vector<float> v_in((const cl_mem)v->data(), trans ? m : n);
+ viennacl::vector<float> o_in(static_cast<cl_mem>(out->mutable_data()), trans ? n : m);
+
+ if (trans) viennacl::trans(A_in);
+
+ o_in *= beta;
+ o_in += alpha * viennacl::linalg::prod(A_in, v_in);
}
+/// multiply a matrix with a diagnoal matrix constructed using values from 'v'.
+/// if matrix_lef_side is true, do M*v; else do v*M
template<>
void DGMM<float, lang::Opencl>(bool side_right,
const size_t nrow, const size_t ncol,
const Block *M, const Block *v, Block *out, Context *ctx) {
- cl_int status = CL_SUCCESS;
-
- cl::Buffer Mbuf = *(static_cast<cl::Buffer*>(M->mutable_data()));
- cl::Buffer vbuf = *(static_cast<cl::Buffer*>(v->mutable_data()));
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
- std::string kname;
+ viennacl::matrix<float> M_buf((const cl_mem)M->data(), nrow, ncol);
+ viennacl::vector<float> v_buf((const cl_mem)v->data(), nrow);
+ viennacl::matrix<float> out_buf(static_cast<cl_mem>(out->mutable_data()), nrow, ncol);
+
+ auto diag = viennacl::diag(v_buf);
+
if (side_right) {
- DiagVec_Right(ncol, vbuf, vbuf, ctx);
- kname = "clkernel_dgmm_right";
+ out_buf = viennacl::linalg::prod(diag, M_buf);
} else {
- DiagVec_Left(nrow, vbuf, vbuf, ctx);
- kname = "clkernel_dgmm_left";
+ out_buf = viennacl::linalg::prod(M_buf, diag);
}
-
- auto kernel = ctx->kernels->at(kname);
-
- kernel.setArg(0, (cl_int)nrow);
- kernel.setArg(1, (cl_int)ncol);
- kernel.setArg(2, Mbuf);
- kernel.setArg(3, vbuf);
- kernel.setArg(4, outbuf);
- kernel.setArg(5, cl::Local(sizeof(float) * nrow * ncol));
-
- cl::NDRange global(nrow); // Only nrow because current implementation is 1 dimensional
-// cl::NDRange local();
-
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), global);
- OCL_CHECK(status, "Failed to enqueue kernel function!");
}
-
+/// C = alpha * A * B + beta * C.
template<>
void GEMM<float, lang::Opencl>(const bool transA, const bool transB,
const size_t nrowA, const size_t ncolB, const size_t ncolA,
const float alpha, const Block *A, const Block *B, const float beta,
Block *C, Context *ctx) {
- cl_int status = CL_SUCCESS;
- std::string kname = "clkernel_gemm";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer Abuf = *(static_cast<cl::Buffer*>(A->mutable_data()));
- cl::Buffer Bbuf = *(static_cast<cl::Buffer*>(B->mutable_data()));
- cl::Buffer Cbuf = *(static_cast<cl::Buffer*>(C->mutable_data()));
-
- // If matrix A needs to be transposed, do it.
- if (transA)
- Transpose(nrowA, ncolA, Abuf, Abuf, ctx);
-
- // If vector B needs to be transposed, do it.
- if (transB)
- Transpose(nrowA, ncolB, Bbuf, Bbuf, ctx);
-
- kernel.setArg(0, (cl_int)nrowA);
- kernel.setArg(1, (cl_int)ncolB);
- kernel.setArg(2, (cl_int)ncolA);
- kernel.setArg(3, alpha);
- kernel.setArg(4, Abuf);
- kernel.setArg(5, Bbuf);
- kernel.setArg(6, beta);
- kernel.setArg(7, Cbuf);
- kernel.setArg(8, cl::Local(sizeof(float) * nrowA * ncolB));
- kernel.setArg(9, cl::Local(sizeof(float) * nrowA * ncolB));
-
-// TODO: Try to make the work group size a power of 2 given an arbitrary matrix.
- cl::NDRange global(nrowA, ncolB);
- cl::NDRange local(nrowA, ncolB);
+ viennacl::matrix<float> A_buf((const cl_mem)A->data(), nrowA, ncolA);
+ viennacl::matrix<float> B_buf((const cl_mem)B->data(), ncolA, ncolB);
+ viennacl::matrix<float> C_buf(static_cast<cl_mem>(C->mutable_data()), nrowA, ncolB);
+
+ if (transA) viennacl::trans(A_buf);
+ if (transB) viennacl::trans(B_buf);
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), global, local);
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ C_buf *= beta;
+ C_buf += alpha * viennacl::linalg::prod(A_buf, B_buf);
}
+
template <>
void ComputeCrossEntropy<float, lang::Opencl>(const size_t batchsize, const size_t dim,
const Block *p, const Block *t, Block *loss,
Context *ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_crossentropy";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer pbuf = *(static_cast<cl::Buffer*>(p->mutable_data()));
- cl::Buffer tbuf = *(static_cast<cl::Buffer*>(t->mutable_data()));
- cl::Buffer lossbuf = *(static_cast<cl::Buffer*>(loss->mutable_data()));
-
- kernel.setArg(0, (cl_uint)batchsize);
- kernel.setArg(1, (cl_uint)dim);
- kernel.setArg(2, pbuf);
- kernel.setArg(3, tbuf);
- kernel.setArg(4, lossbuf);
-
- cl::NDRange global(batchsize);
-
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), global);
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+ auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_crossentropy");
+
+ viennacl::vector<float> p_buf((const cl_mem)p->data(), batchsize);
+ viennacl::vector<float> t_buf((const cl_mem)t->data(), batchsize);
+ viennacl::vector<float> loss_buf(static_cast<cl_mem>(loss->mutable_data()), batchsize);
+
+ enqueue(kernel((cl_uint)batchsize, (cl_uint)dim, p_buf, t_buf, loss_buf));
}
+
template <>
void SoftmaxCrossEntropyBwd<float, lang::Opencl>(const size_t batchsize, const size_t dim,
const Block *p, const Block *t, Block *grad,
Context *ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_softmaxentropy";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer pbuf = *(static_cast<cl::Buffer*>(p->mutable_data()));
- cl::Buffer tbuf = *(static_cast<cl::Buffer*>(t->mutable_data()));
- cl::Buffer gradbuf = *(static_cast<cl::Buffer*>(grad->mutable_data()));
-
- kernel.setArg(0, (cl_uint)batchsize);
- kernel.setArg(1, (cl_uint)dim);
- kernel.setArg(2, pbuf);
- kernel.setArg(3, tbuf);
- kernel.setArg(4, gradbuf);
+ auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+ auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_softmaxentropy");
+
+ viennacl::vector<float> p_buf((const cl_mem)p->data(), batchsize);
+ viennacl::vector<float> t_buf((const cl_mem)t->data(), batchsize);
+ viennacl::vector<float> grad_buf(static_cast<cl_mem>(grad->mutable_data()), batchsize);
+
+ enqueue(kernel((cl_uint)batchsize, (cl_uint)dim, p_buf, t_buf, grad_buf));
+}
- cl::NDRange global(batchsize);
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), global);
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+template<>
+void RowMax<float, lang::Opencl>(const size_t nrow, const size_t ncol,
+ const Block *in, Block *out, Context *ctx) {
+ auto ocl_ctx = get_context(ctx->vcl_ctx_id);
+ auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_rowmax");
+
+// kernel.global_work_size(0, nrow);
+
+ viennacl::matrix<float> in_buf((const cl_mem)in->data(), nrow, ncol);
+ viennacl::vector<float> outbuf(static_cast<cl_mem>(out->mutable_data()), nrow);
+
+ enqueue(kernel((cl_uint)nrow, (cl_uint)ncol, in_buf, outbuf));
}
// **************************************
@@ -985,129 +611,46 @@ void SoftmaxCrossEntropyBwd<float, lang::Opencl>(const size_t batchsize, const s
/*
template<>
void AddCol<float, lang::Opencl>(const size_t nrow, const size_t ncol, const Block* A, const Block* v, Block* out, Context* ctx) {
- std::string kname = "clkernel_addcol";
- auto kernel = ctx->kernels->at(kname);
- kernel.setArg(0, (cl_int)nrow);
- kernel.setArg(1, (cl_int)ncol);
- kernel.setArg(2, static_cast<const float*>(A->mutable_data()));
- kernel.setArg(3, static_cast<const float*>(v->mutable_data()));
- kernel.setArg(3, static_cast<float*>(out->mutable_data()));
- ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(nrow, ncol));
}
+
template<>
void AddRow<float, lang::Opencl>(const size_t nrow, const size_t ncol, const Block* A, const Block* v, Block* out, Context* ctx) {
- std::string kname = "clkernel_addrow";
- auto kernel = ctx->kernels->at(kname);
- kernel.setArg(0, (cl_int)nrow);
- kernel.setArg(1, (cl_int)ncol);
- kernel.setArg(2, static_cast<const float*>(A->mutable_data()));
- kernel.setArg(3, static_cast<const float*>(v->mutable_data()));
- kernel.setArg(3, static_cast<float*>(out->mutable_data()));
- ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(nrow, ncol));
}
+
template<>
void Outer<float, lang::Opencl>(const size_t m, const size_t n, const Block* lhs, const Block* rhs, Block* out, Context* ctx) {
- std::string kname = "clkernel_outerproduct";
- auto kernel = ctx->kernels->at(kname);
- kernel.setArg(0, (cl_int)m);
- kernel.setArg(1, (cl_int)n);
- kernel.setArg(2, static_cast<const float*>(lhs->data()));
- kernel.setArg(3, static_cast<const float*>(rhs->data()));
- kernel.setArg(4, static_cast<float*>(out->mutable_data()));
-
- ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(m, n));
+ viennacl::vector<float> lhs_in((const cl_mem)lhs->data(), m);
+ viennacl::vector<float> rhs_in((const cl_mem)rhs->data(), n);
+ viennacl::matrix<float> out_buf(static_cast<cl_mem>(out->mutable_data()), m, n);
+
+ out_buf = viennacl::linalg::outer_prod(lhs_in, rhs_in);
}
-template<>
-void SumColumns<float, lang::Opencl>(const size_t nrow, const size_t ncol, const Block* in, Block* out, Context* ctx) {
- std::string kname = "clkernel_sumcol";
- auto kernel = ctx->kernels->at(kname);
- kernel.setArg(0, (cl_int)nrow);
- kernel.setArg(1, (cl_int)ncol);
- kernel.setArg(2, static_cast<const float*>(in->mutable_data()));
- kernel.setArg(3, static_cast<float*>(out->mutable_data()));
- ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(nrow, ncol));
-}*/
-/*
template<>
-void SumRows<float, lang::Opencl>(const size_t nrow, const size_t ncol, const Block* in, Block* out, Context* ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_sumrow";
- auto kernel = ctx->kernels->at(kname);
-
- cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
- cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
- kernel.setArg(0, (cl_int)nrow);
- kernel.setArg(1, (cl_int)ncol);
- kernel.setArg(2, inbuf);
- kernel.setArg(3, outbuf);
- kernel.setArg(4, cl::Local(sizeof(float) * nrow * ncol));
-
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(nrow, ncol));
-}
-*/
-
-
-#define BLOCK_DIM 16
-
-void Transpose(const size_t nrow, const size_t ncol, cl::Buffer& in, cl::Buffer& out, Context* ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_transpose";
- auto kernel = ctx->kernels->at(kname);
-
- kernel.setArg(0, (cl_uint)nrow);
- kernel.setArg(1, (cl_uint)ncol);
- kernel.setArg(2, in);
- kernel.setArg(3, out);
- kernel.setArg(4, cl::Local((BLOCK_DIM + 1) * BLOCK_DIM));
-
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(nrow, ncol));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
-}
-
-#undef BLOCK_DIM
-
-
-/// This is a utility function that transforms a single-row vector into a diagonal matrix.
-/// For example, a vector of size n will become a matrix of size n*n where only the positions nx == ny will have values.
-void DiagVec_Left(const size_t size, cl::Buffer& in, cl::Buffer& out, Context* ctx) {
- cl_int status = CL_SUCCESS;
-
- std::string kname = "clkernel_diagvec_left";
- auto kernel = ctx->kernels->at(kname);
+void SumColumns<float, lang::Opencl>(const size_t nrow, const size_t ncol, const Block* in, Block* out, Context* ctx) {
+ viennacl::matrix<float> m_in((const cl_mem)in->data(), nrow, ncol);
+ viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), nrow);
- kernel.setArg(0, (cl_uint)size);
- kernel.setArg(1, in);
- kernel.setArg(2, out);
-
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(size));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+ v_out = viennacl::linalg::column_sum(m_in);
}
-void DiagVec_Right(const size_t size, cl::Buffer& in, cl::Buffer& out, Context* ctx) {
- cl_int status = CL_SUCCESS;
- std::string kname = "clkernel_diagvec_right";
- auto kernel = ctx->kernels->at(kname);
-
- kernel.setArg(0, (cl_uint)size);
- kernel.setArg(1, in);
- kernel.setArg(2, out);
-
- status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(size));
- OCL_CHECK(status, "Failed to enqueue kernel function!");
+template<>
+void SumRows<float, lang::Opencl>(const size_t nrow, const size_t ncol, const Block* in, Block* out, Context* ctx) {
+ viennacl::matrix<float> m_in((const cl_mem)in->data(), nrow, ncol);
+ viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), ncol);
+
+ v_out = viennacl::linalg::column_sum(m_in);
}
+*/
} // namespace singa
#endif // USE_OPENCL
-#endif // SINGA_CORE_TENSOR_TENSOR_MATH_OPENCL_H_
+#endif  // SINGA_CORE_TENSOR_TENSOR_MATH_OPENCL_H_
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/src/utils/opencl_utils.cc
----------------------------------------------------------------------
diff --git a/src/utils/opencl_utils.cc b/src/utils/opencl_utils.cc
deleted file mode 100644
index e4fe69b..0000000
--- a/src/utils/opencl_utils.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements. See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership. The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied. See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/utils/opencl_utils.h"
-
-#ifdef USE_OPENCL
-
-void PrintDeviceInfo(const cl::Device &dev) {
- cl_int status = CL_SUCCESS;
-
- LOG(INFO) << "\tDevice type: " << dev.getInfo<CL_DEVICE_TYPE>(&status);
- LOG(INFO) << "\tUnified memory: " << dev.getInfo<CL_DEVICE_HOST_UNIFIED_MEMORY>(&status);
- LOG(INFO) << "\tClock speed (MHz): " << dev.getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>(&status);
- LOG(INFO) << "\tECC memory: " << dev.getInfo<CL_DEVICE_ERROR_CORRECTION_SUPPORT>(&status);
- LOG(INFO) << "\tLittle endian: " << dev.getInfo<CL_DEVICE_ENDIAN_LITTLE>(&status);
- LOG(INFO) << "\tCompute units: " << dev.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(&status);
- LOG(INFO) << "\tMax work grp size: " << dev.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>(&status);
-//LOG(INFO) << "\tMax work item size: " << dev.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>(&status);
- LOG(INFO) << "\tMax item dimension: " << dev.getInfo<CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS>(&status);
- LOG(INFO) << "\tQueue properties: " << dev.getInfo<CL_DEVICE_QUEUE_PROPERTIES>(&status);
- LOG(INFO) << "\tExecution capabilities: " << dev.getInfo<CL_DEVICE_EXECUTION_CAPABILITIES>(&status);
- LOG(INFO) << "\tMax mem alloc size: " << dev.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>(&status);
- LOG(INFO) << "\tGlobal mem size: " << dev.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>(&status);
- LOG(INFO) << "\tLocal mem size: " << dev.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>(&status);
- LOG(INFO) << "\n";
-
- OCL_CHECK(status, "Failed to retrieve device information!");
-}
-
-
-void PrintPlatformInfo(const cl::Platform &p) {
- cl_int status = CL_SUCCESS;
-
- LOG(INFO) << "\tName: " << p.getInfo<CL_PLATFORM_NAME>(&status);
- LOG(INFO) << "\tProfile: " << p.getInfo<CL_PLATFORM_PROFILE>(&status);
- LOG(INFO) << "\tVersion: " << p.getInfo<CL_PLATFORM_VERSION>(&status);
- LOG(INFO) << "\tVendor: " << p.getInfo<CL_PLATFORM_VENDOR>(&status);
- LOG(INFO) << "\tExtensions: " << p.getInfo<CL_PLATFORM_EXTENSIONS>(&status);
- LOG(INFO) << "\n";
-
- OCL_CHECK(status, "Failed to retrieve platform information!");
-}
-
-
-#endif // USE_OPENCL