You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@singa.apache.org by wa...@apache.org on 2016/06/24 06:51:39 UTC
[6/6] incubator-singa git commit: Merge PR #165 for CnMeM
Merge PR #165 for CnMeM
Fixbugs from device type (Device* -> std::shared_ptr<Device>).
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/dd08f413
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/dd08f413
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/dd08f413
Branch: refs/heads/dev
Commit: dd08f413015878365fed32e579c1b7f4ecc81270
Parents: 5651383 9abd791
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Fri Jun 24 13:41:02 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Fri Jun 24 14:49:56 2016 +0800
----------------------------------------------------------------------
.gitignore | 1 +
CMakeLists.txt | 9 +-
cmake/Dependencies.cmake | 12 +
cmake/Templates/singa_config.h.in | 2 +
cmake/Utils.cmake | 15 +
include/singa/core/common.h | 32 +-
include/singa/core/device.h | 18 +-
include/singa/core/memory.h | 62 +-
include/singa/core/tensor.h | 464 +++++++-------
include/singa/io/decoder.h | 56 ++
include/singa/io/encoder.h | 61 ++
include/singa/io/reader.h | 99 +++
include/singa/io/writer.h | 112 ++++
include/singa/model/initializer.h | 105 ++++
include/singa/model/layer.h | 48 +-
include/singa/model/loss.h | 47 ++
include/singa/model/optimizer.h | 59 +-
include/singa/utils/channel.h | 85 +++
include/singa/utils/timer.h | 2 +-
src/CMakeLists.txt | 18 +
src/core/device/cpp_cpu.cc | 2 +-
src/core/device/cuda_gpu.cc | 88 ++-
src/core/device/device.cc | 24 +-
src/core/memory/memory.cc | 83 +--
src/core/tensor/math_kernel.cu | 682 +++++++++++---------
src/core/tensor/math_kernel.h | 98 +--
src/core/tensor/tensor.cc | 896 ++++++++++++++++-----------
src/core/tensor/tensor_math.h | 418 +++++++------
src/core/tensor/tensor_math_cpp.h | 629 ++++++++++++++-----
src/core/tensor/tensor_math_cuda.h | 429 ++++++++++---
src/io/binfile_reader.cc | 113 ++++
src/io/binfile_writer.cc | 136 ++++
src/io/jpg2proto_encoder.cc | 83 +++
src/io/proto2jpg_decoder.cc | 75 +++
src/model/layer/activation.cc | 27 +-
src/model/layer/activation.h | 7 +-
src/model/layer/batchnorm.cc | 11 +-
src/model/layer/batchnorm.h | 12 +-
src/model/layer/convolution.cc | 13 +-
src/model/layer/convolution.h | 7 +-
src/model/layer/cudnn_activation.cc | 33 +-
src/model/layer/cudnn_activation.h | 11 +-
src/model/layer/cudnn_batchnorm.cc | 132 ++--
src/model/layer/cudnn_batchnorm.h | 40 +-
src/model/layer/cudnn_convolution.cc | 114 ++--
src/model/layer/cudnn_convolution.h | 4 +-
src/model/layer/cudnn_dropout.cc | 52 +-
src/model/layer/cudnn_dropout.h | 4 +-
src/model/layer/cudnn_lrn.cc | 78 +--
src/model/layer/cudnn_lrn.h | 32 +-
src/model/layer/cudnn_pooling.cc | 48 +-
src/model/layer/cudnn_pooling.h | 4 +-
src/model/layer/cudnn_softmax.cc | 62 +-
src/model/layer/cudnn_softmax.h | 11 +-
src/model/layer/dense.cc | 7 +-
src/model/layer/dense.h | 6 +-
src/model/layer/dropout.cc | 5 +-
src/model/layer/dropout.h | 7 +-
src/model/layer/flatten.cc | 53 ++
src/model/layer/flatten.h | 56 ++
src/model/layer/lrn.cc | 5 +-
src/model/layer/lrn.h | 13 +-
src/model/layer/pooling.cc | 13 +-
src/model/layer/pooling.h | 8 +-
src/model/layer/prelu.cc | 145 +++++
src/model/layer/prelu.h | 66 ++
src/model/layer/softmax.cc | 34 +-
src/model/layer/softmax.h | 11 +-
src/model/loss/mse.cc | 41 ++
src/model/loss/mse.h | 66 --
src/model/loss/softmax_cross_entropy.cc | 53 ++
src/model/metric/accuracy.h | 5 +-
src/model/optimizer/adagrad.cc | 41 ++
src/model/optimizer/nesterov.cc | 49 ++
src/model/optimizer/optimizer.cc | 2 +-
src/model/optimizer/rmsprop.cc | 45 ++
src/model/optimizer/sgd.cc | 10 +-
src/proto/core.proto | 7 +-
src/proto/io.proto | 37 ++
src/proto/model.proto | 26 +-
src/python/device.py | 82 +++
src/python/example_layer.py | 25 +
src/python/layer.py | 78 ++-
src/python/swig/core_device.i | 60 ++
src/python/swig/core_tensor.i | 263 ++++++++
src/python/swig/model_layer.i | 83 +++
src/python/swig/singa.i | 27 +
src/python/tensor.py | 370 +++++++++++
src/utils/channel.cc | 104 ++++
test/CMakeLists.txt | 3 +-
test/python/example_test_device.py | 36 ++
test/python/example_test_tensor.py | 179 ++++++
test/python/unittest_python.py | 139 +++++
test/singa/test_activation.cc | 13 +-
test/singa/test_adagrad.cc | 96 +++
test/singa/test_binfile_rw.cc | 95 +++
test/singa/test_channel.cc | 39 ++
test/singa/test_cpp_cpu.cc | 16 +-
test/singa/test_cross_entropy.cc | 116 ++++
test/singa/test_cudnn_activation.cc | 36 +-
test/singa/test_cudnn_batchnorm.cc | 59 +-
test/singa/test_cudnn_convolution.cc | 105 ++--
test/singa/test_cudnn_dropout.cc | 35 +-
test/singa/test_cudnn_lrn.cc | 28 +-
test/singa/test_cudnn_pooling.cc | 36 +-
test/singa/test_cudnn_softmax.cc | 130 ++--
test/singa/test_decoder.cc | 84 +++
test/singa/test_dense.cc | 480 +++++++-------
test/singa/test_dropout.cc | 17 +-
test/singa/test_flatten.cc | 143 +++++
test/singa/test_initializer.cc | 148 +++++
test/singa/test_memory.cc | 129 ++--
test/singa/test_mse.cc | 12 +-
test/singa/test_nesterov.cc | 101 +++
test/singa/test_prelu.cc | 245 ++++++++
test/singa/test_rmsprop.cc | 105 ++++
test/singa/test_sgd.cc | 32 +-
test/singa/test_softmax.cc | 36 +-
test/singa/test_tensor.cc | 14 +-
test/singa/test_tensor_math.cc | 505 +++++++++++++--
120 files changed, 8172 insertions(+), 2708 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/CMakeLists.txt
----------------------------------------------------------------------
diff --cc CMakeLists.txt
index c34b6ce,87b3a5d..7a5caf3
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@@ -10,22 -10,23 +10,23 @@@ LIST(APPEND CMAKE_MODULE_PATH ${PROJECT
IF(UNIX OR APPLE)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wall")
ENDIF()
-
+ IF(CMAKE_BUILD_TYPE=Debug)
+ SET(NVCC_FLAG "${NVCC_FLAG} -g -G ")
+ ENDIF()
#message(STATUS "${CMAKE_CXX_FLAGS}")
-SET(SINGA_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/include;${PROJECT_BINARY_DIR}")
-#message(STATUS "include path: ${SINGA_INCLUDE_DIR}")
+SET(SINGA_INCLUDE_DIR
- #"${CMAKE_SOURCE_DIR}/include;${CMAKE_SOURCE_DIR}/lib/cnmem/lib;${CMAKE_SOURCE_DIR}/lib/cnmen/include;${PROJECT_BINARY_DIR}")
+ "${CMAKE_SOURCE_DIR}/include;${CMAKE_SOURCE_DIR}/lib/cnmem/include;${PROJECT_BINARY_DIR}")
- #message(STATUS "include path: ${SINGA_INCLUDE_DIR}")
INCLUDE_DIRECTORIES(${SINGA_INCLUDE_DIR})
- #OPTION(CPU_ONLY "use GPU libs" OFF)
OPTION(USE_CBLAS "Use CBlas libs" ON)
OPTION(USE_CUDA "Use Cuda libs" ON)
-OPTION(USE_CUDNN "Use Cudnn libs" ON)
+OPTION(USE_CUDNN "Use Cudnn libs" OFF)
OPTION(USE_OPENCV "Use opencv" OFF)
OPTION(USE_LMDB "Use LMDB libs" OFF)
+ OPTION(USE_PYTHON "Generate py wrappers" OFF)
INCLUDE("cmake/Dependencies.cmake")
+ INCLUDE("cmake/Utils.cmake")
ADD_DEFINITIONS(-DUSE_CMAKE)
CONFIGURE_FILE (
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/cmake/Dependencies.cmake
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/include/singa/core/common.h
----------------------------------------------------------------------
diff --cc include/singa/core/common.h
index e19022e,22a2b49..cb1bdca
--- a/include/singa/core/common.h
+++ b/include/singa/core/common.h
@@@ -20,7 -20,9 +20,9 @@@
#define SINGA_CORE_COMMON_H_
#include <random>
#include <chrono>
+#include "./singa/singa_config.h"
+ #include <atomic>
+ #include <memory>
-#include "./singa_config.h"
#include "singa/utils/logging.h"
#ifdef USE_CUDA
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/include/singa/core/device.h
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/include/singa/core/memory.h
----------------------------------------------------------------------
diff --cc include/singa/core/memory.h
index e4e1e63,db09043..c35f5d0
--- a/include/singa/core/memory.h
+++ b/include/singa/core/memory.h
@@@ -19,56 -19,10 +19,58 @@@
#ifndef SINGA_CORE_MEMORY_H_
#define SINGA_CORE_MEMORY_H_
+#include "cnmem.h"
++#include "singa/singa_config.h"
+#include <mutex>
+
namespace singa {
/// Manage device memory pool including garbage collection, memory opt.
class VirtualMemory {};
+class DeviceMemPool {
- public:
- virtual void InitPool() = 0;
- virtual void Malloc(void** ptr, const size_t size) = 0;
- virtual void Free(void* ptr) = 0;
- virtual ~DeviceMemPool(){};
++ public:
++ virtual void InitPool() = 0;
++ virtual void Malloc(void** ptr, const size_t size) = 0;
++ virtual void Free(void* ptr) = 0;
++ virtual ~DeviceMemPool(){};
+};
+
++#ifdef USE_CUDA
+class CnMemPool : public DeviceMemPool {
- public:
- int status = 1;
++ public:
++ int status = 1;
+
- void InitPool();
++ void InitPool();
+
- /// numDevices: total number of available GPU cards.
- /// initSize: all devices will be allocated with this size
- /// manager_flags: pool manager flag (one for all devices)
- /// flag = 0; default flag
- /// flag = 1: Prevent the manager from growing its memory consumption
- /// flag = 2; Prevent the manager from stealing memory.
- void InitPool(int numDevices, size_t initSize, unsigned flag);
++ /// numDevices: total number of available GPU cards.
++ /// initSize: all devices will be allocated with this size
++ /// manager_flags: pool manager flag (one for all devices)
++ /// flag = 0; default flag
++ /// flag = 1: Prevent the manager from growing its memory consumption
++ /// flag = 2; Prevent the manager from stealing memory.
++ void InitPool(int numDevices, size_t initSize, unsigned flag);
+
- void Malloc(void** ptr, const size_t size);
- void Free(void* ptr);
++ void Malloc(void** ptr, const size_t size);
++ void Free(void* ptr);
+
- // release all memory and set cnmem manager to unintialized
- ~CnMemPool();
++ // release all memory and set cnmem manager to unintialized
++ ~CnMemPool();
+
- private:
- // whether the (global) memory pool has been initialized
- static bool initialized;
- // lock on the initialized variable
- static std::mutex mtx;
++ private:
++ // whether the (global) memory pool has been initialized
++ static bool initialized;
++ // lock on the initialized variable
++ static std::mutex mtx;
+};
+
+class CudaMemPool : public DeviceMemPool {
- public:
- void InitPool(){};
- void Malloc(void** ptr, const size_t size);
- void Free(void* ptr);
- ~CudaMemPool(){};
++ public:
++ void InitPool(){};
++ void Malloc(void** ptr, const size_t size);
++ void Free(void* ptr);
++ ~CudaMemPool(){};
+};
-
++#endif
} // namespace singa
#endif // SINGA_CORE_MEMORY_H_
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --cc include/singa/core/tensor.h
index 8f73047,eb72bd3..a4f42db
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@@ -65,27 -54,28 +54,29 @@@ class Tensor
public:
~Tensor();
Tensor();
- explicit Tensor(Shape &&shape, const DataType dtype = kFloat32);
- explicit Tensor(const Shape &shape, const DataType dtype = kFloat32);
- Tensor(Shape &&shape, Device *dev, const DataType dtype = kFloat32);
- Tensor(const Shape &shape, Device *dev, const DataType dtype = kFloat32);
+ explicit Tensor(Shape &&shape, DataType dtype = kFloat32);
+ explicit Tensor(const Shape &shape, DataType dtype = kFloat32);
+ Tensor(Shape &&shape, std::shared_ptr<Device> dev, DataType dtype = kFloat32);
- Tensor(const Shape &shape, std::shared_ptr<Device> dev, DataType dtype = kFloat32);
++ Tensor(const Shape &shape, std::shared_ptr<Device> dev,
++ DataType dtype = kFloat32);
/// Copy Tensor to share the internal data. No deep copy.
Tensor(const Tensor &from);
/// Copy Tensor to share the internal data. No deep copy.
Tensor(Tensor &&from);
- /// For functions in xx_math.cc to access the blob.
- /// Users should not operate against Blob directly.
- /// blob_ is allocated in constructors.
- Blob *blob() const { return blob_; }
+ /// For functions in xx_math.cc to access the block.
+ /// Users should not operate against Block directly.
+ /// block_ is allocated in constructors.
+ Block *block() const { return block_; }
+ void SetBlock(Block* block);
- Device *device() const { return device_; }
+ std::shared_ptr<Device> device() const { return device_; }
- /// Return immutable Tensor values with given type.
- template <typename DType>
- DType data() const {
- return static_cast<DType>(blob()->data());
+ /// return immutable Tensor values with given type.
+ template <typename SType>
+ const SType* data() const {
+ return static_cast<const SType*>(block()->data());
}
/// data type, including kFloat16, kFloat32, kInt
@@@ -192,13 -179,22 +180,22 @@@
protected:
bool transpose_ = false;
DataType data_type_ = kFloat32;
- Device *device_ = nullptr;
+ std::shared_ptr<Device> device_ = nullptr;
- /// Note: blob_ is allocated in lazy manner to avoid frequent malloc/free.
- /// If you want to get an allocated Blob, use blob() instead of blob_.
- Blob *blob_ = nullptr;
- Shape shape_;
+ /// Note: block_ is allocated in lazy manner to avoid frequent malloc/free.
+ /// If you want to get an allocated Block, use block() instead of block_.
+ Block *block_ = nullptr;
+ Shape shape_ = {};
};
+ typedef Shape::iterator ShapeIter;
+ inline size_t Product(const Shape &shape, int start = 0, size_t len = 0) {
+ if (len == 0) len = shape.size();
+ CHECK_LE(len, shape.size());
+ size_t v = 1;
+ for (unsigned int i = start; i < len; i++) v *= shape[i];
+ return v;
+ }
+
inline void CheckDataTypeAndLang(const Tensor &in1, const Tensor &in2) {
CHECK_EQ(in1.data_type(), in2.data_type());
CHECK_EQ(in1.device()->lang(), in2.device()->lang());
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/include/singa/model/layer.h
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/include/singa/model/loss.h
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/include/singa/utils/timer.h
----------------------------------------------------------------------
diff --cc include/singa/utils/timer.h
index a54829d,a54829d..bdd6c5c
--- a/include/singa/utils/timer.h
+++ b/include/singa/utils/timer.h
@@@ -19,7 -19,7 +19,7 @@@ class Timer
/// Return the duration since last call to Tick() or since the creation of
/// Timer. The template arg must be from Second or Millisecond or Hour.
/// The returned value is the count of the time metric.
-- template <typename T>
++ template <typename T = Milliseconds>
int Elapsed() const {
static_assert(std::is_same<T, Seconds>::value ||
std::is_same<T, Milliseconds>::value ||
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/CMakeLists.txt
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/core/device/cpp_cpu.cc
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/core/device/cuda_gpu.cc
----------------------------------------------------------------------
diff --cc src/core/device/cuda_gpu.cc
index 4da292f,5d4e1ed..5879c58
--- a/src/core/device/cuda_gpu.cc
+++ b/src/core/device/cuda_gpu.cc
@@@ -32,8 -32,8 +32,7 @@@ const cudaMemcpyKind copyKind[] = {cuda
cudaMemcpyDeviceToDevice};
CudaGPU::~CudaGPU() {
-- if (ctx_.cublas_handle)
-- CUBLAS_CHECK(cublasDestroy(ctx_.cublas_handle));
++ if (ctx_.cublas_handle) CUBLAS_CHECK(cublasDestroy(ctx_.cublas_handle));
if (ctx_.curand_generator)
CURAND_CHECK(curandDestroyGenerator(ctx_.curand_generator));
#ifdef USE_CUDNN
@@@ -42,14 -42,13 +41,12 @@@
CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
}
#endif
- delete pool;
++ delete pool;
}
--CudaGPU::CudaGPU(int id, int num_executors,
-- string scheduler, string vm)
++CudaGPU::CudaGPU(int id, int num_executors, string scheduler, string vm)
: Device(id, num_executors, scheduler, vm) {
-- if (id == -1)
-- id = FindDevice(0);
++ if (id == -1) id = FindDevice(0);
lang_ = kCuda;
ctx_.stream = NULL; // use the default sync stream
// TODO(wangwei) create one handle for each steam?
@@@ -68,62 -67,20 +65,57 @@@
auto status = cudnnCreate(&ctx_.cudnn_handle);
CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
#endif // USE_CUDNN
-
- // initialize cnmem memory management as default
- pool = new CnMemPool();
- ((CnMemPool*)pool)->InitPool();
++
++ // initialize cnmem memory management as default
++ pool = new CnMemPool();
++ ((CnMemPool*)pool)->InitPool();
}
- CudaGPU::CudaGPU(const MemPoolConf& mem_conf,int id, int num_executors,
- string scheduler)
-void CudaGPU::SetRandSeed(unsigned seed) {
- CHECK(ctx_.curand_generator);
++CudaGPU::CudaGPU(const MemPoolConf& mem_conf, int id, int num_executors,
++ string scheduler)
+ : Device(id, num_executors, scheduler, "gc-only") {
- if (id == -1)
- id = FindDevice(0);
++ if (id == -1) id = FindDevice(0);
+ lang_ = kCuda;
+ ctx_.stream = NULL; // use the default sync stream
+ // TODO(wangwei) create one handle for each steam?
+ CUDA_CHECK(cudaSetDevice(FindDevice(0)));
+ // use curandCreateGeneratorHost for CudaHost device
CURAND_CHECK(
- curandSetPseudoRandomGeneratorSeed(ctx_.curand_generator, seed));
+ curandCreateGenerator(&ctx_.curand_generator, CURAND_RNG_PSEUDO_DEFAULT));
+ auto seed = std::chrono::system_clock::now().time_since_epoch().count();
+ SetRandSeed(seed);
+ // TODO(wangwei) if one generator per stream, then need diff offset per gen?
+ CURAND_CHECK(curandSetGeneratorOffset(ctx_.curand_generator, 0));
+ CUBLAS_CHECK(cublasCreate(&(ctx_.cublas_handle)));
+
+#ifdef USE_CUDNN
+ // TODO(wangwei) create one handle for each stream?
+ auto status = cudnnCreate(&ctx_.cudnn_handle);
+ CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
+#endif // USE_CUDNN
+
- // initialize memory management for cuda devices
- string memoryPoolType = mem_conf.type();
- if(memoryPoolType.compare("cnmem") == 0) {
- pool = new CnMemPool();
- int num_devices = mem_conf.num_devices();
- size_t alloc_size = mem_conf.alloc_size();
- unsigned flag = mem_conf.cnmemflag();
- ((CnMemPool*)pool)->InitPool(num_devices, alloc_size, flag);
- }
- else {
- pool = new CudaMemPool();
- }
++ // initialize memory management for cuda devices
++ string memoryPoolType = mem_conf.type();
++ if (memoryPoolType.compare("cnmem") == 0) {
++ pool = new CnMemPool();
++ int num_devices = mem_conf.num_devices();
++ size_t alloc_size = mem_conf.alloc_size();
++ unsigned flag = mem_conf.cnmemflag();
++ ((CnMemPool*)pool)->InitPool(num_devices, alloc_size, flag);
++ } else {
++ pool = new CudaMemPool();
++ }
}
-void CudaGPU::DoExec(function<void(Context*)>&& fn, int executor) {
- fn(&ctx_);
+void CudaGPU::SetRandSeed(unsigned seed) {
+ CHECK(ctx_.curand_generator);
- CURAND_CHECK(
- curandSetPseudoRandomGeneratorSeed(ctx_.curand_generator, seed));
++ CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(ctx_.curand_generator, seed));
}
- void CudaGPU::DoExec(function<void(Context*)>&& fn, int executor) {
- fn(&ctx_);
- }
++void CudaGPU::DoExec(function<void(Context*)>&& fn, int executor) { fn(&ctx_); }
+
void CudaGPU::CopyToFrom(void* dst, const void* src, size_t nBytes,
-- CopyDirection direction, Context* ctx) {
++ CopyDirection direction, Context* ctx) {
cudaMemcpy(dst, src, nBytes, copyKind[direction]);
// TODO(wangwei) use async copy
// cudaMemcpyAsync(dst, src, nBytes,cudaMemcpyDefault, ctx_.stream);
@@@ -133,22 -90,19 +125,21 @@@
void* CudaGPU::Malloc(int size) {
void* ptr = nullptr;
if (size > 0) {
- //CUDA_CHECK(cudaMalloc((void**)&ptr,size));
- pool->Malloc((void**)&ptr,size);
- CUDA_CHECK(cudaMalloc(&ptr, size));
++ // CUDA_CHECK(cudaMalloc((void**)&ptr,size));
++ pool->Malloc((void**)&ptr, size);
CUDA_CHECK(cudaMemset(ptr, 0, size));
}
return ptr;
}
-- /// Free cpu memory.
++/// Free cpu memory.
void CudaGPU::Free(void* ptr) {
- if (ptr != nullptr)
- CUDA_CHECK(cudaFree(ptr));
+ if (ptr != nullptr) {
- //CUDA_CHECK(cudaFree(ptr));
- pool->Free(ptr);
- }
++ // CUDA_CHECK(cudaFree(ptr));
++ pool->Free(ptr);
++ }
}
--
// ==========Following code is from Caffe src/caffe/common.cpp=================
void CudaGPU::DeviceQuery() {
@@@ -169,20 -123,20 +160,18 @@@
LOG(INFO) << "Warp size: " << prop.warpSize;
LOG(INFO) << "Maximum memory pitch: " << prop.memPitch;
LOG(INFO) << "Maximum threads per block: " << prop.maxThreadsPerBlock;
-- LOG(INFO) << "Maximum dimension of block: "
-- << prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", "
-- << prop.maxThreadsDim[2];
-- LOG(INFO) << "Maximum dimension of grid: "
-- << prop.maxGridSize[0] << ", " << prop.maxGridSize[1] << ", "
-- << prop.maxGridSize[2];
++ LOG(INFO) << "Maximum dimension of block: " << prop.maxThreadsDim[0]
++ << ", " << prop.maxThreadsDim[1] << ", " << prop.maxThreadsDim[2];
++ LOG(INFO) << "Maximum dimension of grid: " << prop.maxGridSize[0] << ", "
++ << prop.maxGridSize[1] << ", " << prop.maxGridSize[2];
LOG(INFO) << "Clock rate: " << prop.clockRate;
LOG(INFO) << "Total constant memory: " << prop.totalConstMem;
LOG(INFO) << "Texture alignment: " << prop.textureAlignment;
-- LOG(INFO) << "Concurrent copy and execution: "
-- << (prop.deviceOverlap ? "Yes" : "No");
++ LOG(INFO) << "Concurrent copy and execution: " << (prop.deviceOverlap ? "Yes"
++ : "No");
LOG(INFO) << "Number of multiprocessors: " << prop.multiProcessorCount;
LOG(INFO) << "Kernel execution timeout: "
-- << (prop.kernelExecTimeoutEnabled ? "Yes" : "No");
++ << (prop.kernelExecTimeoutEnabled ? "Yes" : "No");
return;
}
@@@ -203,6 -157,6 +192,5 @@@ int CudaGPU::FindDevice(const int start
return -1;
}
--
} // namespace singa
#endif // USE_CUDA
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/core/device/device.cc
----------------------------------------------------------------------
diff --cc src/core/device/device.cc
index 1889339,36381e4..6775e40
--- a/src/core/device/device.cc
+++ b/src/core/device/device.cc
@@@ -22,11 -22,11 +22,11 @@@ namespace singa
Device::Device(int id, int num_executors, string scheduler, string vm)
: id_(id), num_executors_(num_executors) {
// TODO(wangwei) create scheduler and vm.
- host_ = &defaultDevice;
+ host_ = defaultDevice;
}
- void Device::Exec(function<void(Context*)>&& fn, const vector<Blob*> read_blobs,
- const vector<Blob*> write_blobs, bool use_rand_generator) {
+ void Device::Exec(function<void(Context*)>&& fn, const vector<Block*> read_blocks,
+ const vector<Block*> write_blocks, bool use_rand_generator) {
// TODO(wangwei) execute operations scheduled by the scheduler.
DoExec(std::move(fn), 0);
}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/core/memory/memory.cc
----------------------------------------------------------------------
diff --cc src/core/memory/memory.cc
index 304c101,a1cf5db..7ac6792
--- a/src/core/memory/memory.cc
+++ b/src/core/memory/memory.cc
@@@ -16,71 -16,5 +16,74 @@@
* limitations under the License.
*/
--
#include "singa/core/memory.h"
+#include "singa/utils/logging.h"
++#include "singa/proto/core.pb.h"
+#include <iostream>
+
++#ifdef USE_CUDA
+namespace singa {
-
+bool singa::CnMemPool::initialized = false;
+std::mutex singa::CnMemPool::mtx;
-
+void CnMemPool::InitPool(int numDevices, size_t initSize, unsigned flag) {
- mtx.lock();
- if(!initialized) {
- CHECK_GE(numDevices, 1);
- cnmemDevice_t* settingPtr = new cnmemDevice_t[numDevices];
- for(int i = 0; i < numDevices; i++) {
- settingPtr[i].device = i;
- settingPtr[i].size = initSize;
- settingPtr[i].numStreams = 0;
- settingPtr[i].streams = NULL;
- settingPtr[i].streamSizes = 0;
- }
- cnmemStatus_t status = cnmemInit(numDevices, settingPtr, flag);
- CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS) << " " << cnmemGetErrorString(status);
- delete[] settingPtr;
- initialized = true;
- }
- mtx.unlock();
++ mtx.lock();
++ const size_t kNBytesPerMB = (1u << 20);
++ if (!initialized) {
++ CHECK_GE(numDevices, 1);
++ cnmemDevice_t* settingPtr = new cnmemDevice_t[numDevices];
++ for (int i = 0; i < numDevices; i++) {
++ settingPtr[i].device = i;
++ settingPtr[i].size = initSize * kNBytesPerMB;
++ settingPtr[i].numStreams = 0;
++ settingPtr[i].streams = NULL;
++ settingPtr[i].streamSizes = 0;
++ }
++ cnmemStatus_t status = cnmemInit(numDevices, settingPtr, flag);
++ CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS)
++ << " " << cnmemGetErrorString(status);
++ delete[] settingPtr;
++ initialized = true;
++ }
++ mtx.unlock();
+}
+
+void CnMemPool::InitPool() {
- int defaultNumDevices = 1;
- size_t defaultSize = 1000000U;
- InitPool(defaultNumDevices,defaultSize,cnmemManagerFlags_t::CNMEM_FLAGS_DEFAULT);
++ MemPoolConf conf;
++ InitPool(conf.num_devices(), conf.alloc_size(),
++ cnmemManagerFlags_t::CNMEM_FLAGS_DEFAULT);
+}
+
+CnMemPool::~CnMemPool() {
- mtx.lock();
- if(initialized) {
- cnmemStatus_t status = cnmemFinalize();
- CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS) << " " << cnmemGetErrorString(status);
- initialized = false;
- }
- mtx.unlock();
++ mtx.lock();
++ if (initialized) {
++ cnmemStatus_t status = cnmemFinalize();
++ CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS)
++ << " " << cnmemGetErrorString(status);
++ initialized = false;
++ }
++ mtx.unlock();
+}
+
-
+void CnMemPool::Malloc(void** ptr, const size_t size) {
- cnmemStatus_t status = cnmemMalloc(ptr,size,NULL);
- CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS) << " " << cnmemGetErrorString(status);
++ cnmemStatus_t status = cnmemMalloc(ptr, size, NULL);
++ CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS)
++ << " " << cnmemGetErrorString(status);
+}
+
+void CnMemPool::Free(void* ptr) {
- cnmemStatus_t status = cnmemFree(ptr,NULL);
- CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS) << " " << cnmemGetErrorString(status);
++ cnmemStatus_t status = cnmemFree(ptr, NULL);
++ CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS)
++ << " " << cnmemGetErrorString(status);
+}
+
+void CudaMemPool::Malloc(void** ptr, const size_t size) {
- cudaError_t status = cudaMalloc(ptr,size);
- CHECK_EQ(status, cudaError_t::cudaSuccess);
++ cudaError_t status = cudaMalloc(ptr, size);
++ CHECK_EQ(status, cudaError_t::cudaSuccess);
+}
+
+void CudaMemPool::Free(void* ptr) {
- cudaError_t status = cudaFree(ptr);
- CHECK_EQ(status, cudaError_t::cudaSuccess);
++ cudaError_t status = cudaFree(ptr);
++ CHECK_EQ(status, cudaError_t::cudaSuccess);
+}
-
+}
++#endif
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/core/tensor/math_kernel.cu
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/core/tensor/math_kernel.h
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --cc src/core/tensor/tensor.cc
index a5b43d8,9b3eeff..b852a54
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@@ -25,54 -25,66 +25,65 @@@
namespace singa {
Tensor::~Tensor() {
- if (blob_ != nullptr && blob_->DecRefCount() == 0)
- device_->FreeBlob(blob_);
- blob_ = nullptr;
- // LOG(ERROR) << "~";
+ if (block_ != nullptr && block_->DecRefCount() == 0)
+ device_->FreeBlock(block_);
+ block_ = nullptr;
}
-Tensor::Tensor() { device_ = &defaultDevice; }
+Tensor::Tensor() { device_ = defaultDevice; }
-Tensor::Tensor(const Shape &shape, const DataType dtype)
- : data_type_(dtype), device_(&defaultDevice), shape_(shape) {
- device_ = &defaultDevice;
+Tensor::Tensor(const Shape &shape, DataType dtype)
+ : data_type_(dtype), device_(defaultDevice), shape_(shape) {
+ device_ = defaultDevice;
- blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
+ block_ = device_->NewBlock(Product(shape_) * SizeOf(data_type_));
}
-Tensor::Tensor(Shape &&shape, const DataType dtype)
- : data_type_(dtype), device_(&defaultDevice), shape_(shape) {
- device_ = &defaultDevice;
+Tensor::Tensor(Shape &&shape, DataType dtype)
+ : data_type_(dtype), device_(defaultDevice), shape_(shape) {
+ device_ = defaultDevice;
- blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
+ block_ = device_->NewBlock(Product(shape_) * SizeOf(data_type_));
}
-Tensor::Tensor(const Shape &shape, Device *device, const DataType dtype)
+Tensor::Tensor(const Shape &shape, std::shared_ptr<Device> device, DataType dtype)
: data_type_(dtype), device_(device), shape_(shape) {
- blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
+ block_ = device_->NewBlock(Product(shape_) * SizeOf(data_type_));
}
-Tensor::Tensor(Shape &&shape, Device *device, const DataType dtype)
+Tensor::Tensor(Shape &&shape, std::shared_ptr<Device> device, DataType dtype)
: data_type_(dtype), device_(device), shape_(shape) {
- blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
- }
- Tensor::Tensor(const Tensor &t)
- : transpose_(t.transpose_), data_type_(t.data_type_), device_(t.device_),
- blob_(t.blob()), shape_(t.shape_) {
- blob_->IncRefCount();
- // LOG(ERROR) << "const&";
- }
-
- Tensor::Tensor(Tensor &&t)
- : transpose_(t.transpose_), data_type_(t.data_type_), device_(t.device_),
- shape_(std::move(t.shape_)) {
- blob_ = t.blob_;
- t.blob_ = nullptr;
- // LOG(ERROR) << "&&";
- }
-
- void Tensor::ResetLike(const Tensor &t) {
- if (blob_ == nullptr || device_ != t.device_ || MemSize() != t.MemSize()) {
- if (blob_ != nullptr && blob_->DecRefCount() == 0)
- device_->FreeBlob(blob_);
- shape_ = t.shape_;
- device_ = t.device_;
- data_type_ = t.data_type_;
- blob_ = device_->NewBlob(t.MemSize());
+ block_ = device_->NewBlock(Product(shape_) * SizeOf(data_type_));
+ }
+ Tensor::Tensor(const Tensor &in)
+ : transpose_(in.transpose_),
+ data_type_(in.data_type_),
+ device_(in.device_),
+ block_(in.block()),
+ shape_(in.shape_) {
+ block_->IncRefCount();
+ }
+
+ Tensor::Tensor(Tensor &&in)
+ : transpose_(in.transpose_),
+ data_type_(in.data_type_),
+ device_(in.device_),
+ shape_(std::move(in.shape_)) {
+ block_ = in.block_;
+ in.block_ = nullptr;
+ }
+
+ void Tensor::SetBlock(Block* block) {
+ LOG(WARNING) << "Pls avoid using this function, which may have side-effect.";
+ if (block_ != nullptr)
+ if (block_->DecRefCount())
+ device_->FreeBlock(block_);
+ block_ = block;
+ }
+
+ void Tensor::ResetLike(const Tensor &in) {
+ if (block_ == nullptr || device_ != in.device_ || MemSize() != in.MemSize()) {
+ if (block_ != nullptr && block_->DecRefCount() == 0)
+ device_->FreeBlock(block_);
+ shape_ = in.shape_;
+ device_ = in.device_;
+ data_type_ = in.data_type_;
+ block_ = device_->NewBlock(in.MemSize());
}
}
@@@ -228,13 -245,13 +244,13 @@@ void CopyDataToFrom(Tensor *dst, const
auto width = SizeOf(src.data_type());
CHECK_EQ(width, SizeOf(dst->data_type()));
size_t nBytes = num * width;
- dst_offset *= width;
- src_offset *= width;
- CHECK_GE(src.MemSize(), src_offset + nBytes);
- CHECK_GE(dst->MemSize(), dst_offset + nBytes);
+ auto d_offset = dst_offset * width;
+ auto s_offset = src_offset * width;
+ CHECK_GE(src.MemSize(), s_offset + nBytes);
+ CHECK_GE(dst->MemSize(), d_offset + nBytes);
- Device *src_dev = src.device(), *dst_dev = dst->device();
+ std::shared_ptr<Device> src_dev = src.device(), dst_dev = dst->device();
- Blob *from = src.blob(), *to = dst->blob();
+ Block *from = src.block(), *to = dst->block();
if (dst_dev->lang() != src_dev->lang()) {
// let the none cpp device conduct copy op
if (dst_dev->lang() == kCpp) {
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/batchnorm.cc
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/batchnorm.h
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_activation.cc
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_activation.h
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_batchnorm.cc
----------------------------------------------------------------------
diff --cc src/model/layer/cudnn_batchnorm.cc
index 8288a41,0e597fe..a1e9e50
--- a/src/model/layer/cudnn_batchnorm.cc
+++ b/src/model/layer/cudnn_batchnorm.cc
@@@ -30,7 -30,7 +30,7 @@@ CudnnBatchNorm::~CudnnBatchNorm()
}
}
--void CudnnBatchNorm::ToDevice(Device* device) {
++void CudnnBatchNorm::ToDevice(std::shared_ptr<Device> device) {
BatchNorm::ToDevice(device);
resultSaveMean_.ToDevice(device);
resultSaveVariance_.ToDevice(device);
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_batchnorm.h
----------------------------------------------------------------------
diff --cc src/model/layer/cudnn_batchnorm.h
index 8598b65,36dbbce..4f46452
--- a/src/model/layer/cudnn_batchnorm.h
+++ b/src/model/layer/cudnn_batchnorm.h
@@@ -29,31 -29,29 +29,29 @@@
namespace singa {
class CudnnBatchNorm : public BatchNorm {
public:
- ~CudnnBatchNorm();
- /// \copy doc Layer::layer_type()
- const std::string layer_type() const override {
- return "CudnnBatchNorm";
- }
+ ~CudnnBatchNorm();
+ /// \copy doc Layer::layer_type()
+ const std::string layer_type() const override { return "CudnnBatchNorm"; }
- void Setup(const LayerConf& conf) override;
+ void Setup(const Shape& in_sample, const LayerConf& conf) override;
- const Tensor Forward(int flag, const Tensor& input)
- override;
- const std::pair<Tensor, vector<Tensor>> Backward(
- int flag, const Tensor& grad) override;
+ const Tensor Forward(int flag, const Tensor& input) override;
+ const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+ const Tensor& grad) override;
- void ToDevice(Device* device) override;
++ void ToDevice(std::shared_ptr<Device> device) override;
- /// Init cudnn related data structures.
- void InitCudnn(const Shape& shape, DataType dtype);
- void ToDevice(Device* device) override;
+ private:
+ /// Init cudnn related data structures.
+ void InitCudnn(const Shape& shape, DataType dtype);
private:
- bool has_init_cudnn_ = false;
- cudnnBatchNormMode_t mode_;
- cudnnLRNDescriptor_t lrn_desc_;
- cudnnTensorDescriptor_t shape_desc_, param_desc_;
- Tensor resultSaveMean_, resultSaveVariance_;
-
- }; // class CudnnBatchNorm
+ bool has_init_cudnn_ = false;
+ cudnnBatchNormMode_t mode_;
+ cudnnLRNDescriptor_t lrn_desc_ = nullptr;
+ cudnnTensorDescriptor_t shape_desc_ = nullptr, param_desc_ = nullptr;
+ Tensor resultSaveMean_, resultSaveVariance_;
+
+ }; // class CudnnBatchNorm
} // namespace
#endif // USE_CUDNN
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_convolution.cc
----------------------------------------------------------------------
diff --cc src/model/layer/cudnn_convolution.cc
index b80c3bd,8cdfc07..d5ac2a3
--- a/src/model/layer/cudnn_convolution.cc
+++ b/src/model/layer/cudnn_convolution.cc
@@@ -46,7 -46,7 +46,7 @@@ void CudnnConvolution::Setup(const Shap
"limited_workspace, no_workspace and autotune";
}
--void CudnnConvolution::ToDevice(Device *device) {
++void CudnnConvolution::ToDevice(std::shared_ptr<Device> device) {
weight_.ToDevice(device);
bias_.ToDevice(device);
workspace_.ToDevice(device);
@@@ -55,7 -55,7 +55,7 @@@
void CudnnConvolution::InitCudnn(const Tensor &input) {
CHECK(!has_init_cudnn_);
DataType dtype = input.data_type();
-- Device *dev = input.device();
++ auto dev = input.device();
Context *ctx = dev->context(0);
size_t batchsize = input.shape(0);
CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
@@@ -161,7 -161,7 +161,7 @@@ const Tensor CudnnConvolution::Forward(
if (flag & kTrain) buf_.push(input); // buffer the input for backward
size_t batchsize = input.shape()[0];
DataType dtype = input.data_type();
-- Device *dev = input.device();
++ auto dev = input.device();
if (!has_init_cudnn_) InitCudnn(input);
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_convolution.h
----------------------------------------------------------------------
diff --cc src/model/layer/cudnn_convolution.h
index 152d797,6c15839..cd0471f
--- a/src/model/layer/cudnn_convolution.h
+++ b/src/model/layer/cudnn_convolution.h
@@@ -41,9 -41,9 +41,9 @@@ class CudnnConvolution : public Convolu
const Tensor &grad) override;
/// \copydoc Layer::Setup(const LayerConf&);
- void Setup(const LayerConf &conf) override;
+ void Setup(const Shape& in_sample, const LayerConf &conf) override;
-- void ToDevice(Device *device) override;
++ void ToDevice(std::shared_ptr<Device> device) override;
size_t workspace_byte_limit() { return workspace_byte_limit_; }
string prefer() { return prefer_; }
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_dropout.cc
----------------------------------------------------------------------
diff --cc src/model/layer/cudnn_dropout.cc
index 64a581b,877dd12..2e2e12b
--- a/src/model/layer/cudnn_dropout.cc
+++ b/src/model/layer/cudnn_dropout.cc
@@@ -34,8 -34,8 +34,8 @@@ CudnnDropout::~CudnnDropout()
if (y_desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(y_desc_));
}
--void CudnnDropout::InitCudnn(int size, DataType dtype, Device* dev,
-- Context* ctx) {
++void CudnnDropout::InitCudnn(int size, DataType dtype,
++ std::shared_ptr<Device> dev, Context* ctx) {
CHECK(!has_init_cudnn_);
CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
@@@ -65,13 -65,11 +65,11 @@@ const Tensor CudnnDropout::Forward(int
if (flag & kTrain) {
auto size = input.Size();
DataType dtype = input.data_type();
-- Device* dev = input.device();
++ auto dev = input.device();
if (!has_init_cudnn_) {
- input.device()->Exec(
- [size, dtype, this, dev](Context* ctx) {
- this->InitCudnn(size, dtype, dev, ctx);
- },
- {}, {this->state_.blob()});
+ input.device()->Exec([size, dtype, this, dev](Context* ctx) {
+ this->InitCudnn(size, dtype, dev, ctx);
+ }, {}, {this->state_.block()});
}
Tensor output;
output.ResetLike(input);
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_dropout.h
----------------------------------------------------------------------
diff --cc src/model/layer/cudnn_dropout.h
index da3d1d2,83572cf..6809653
--- a/src/model/layer/cudnn_dropout.h
+++ b/src/model/layer/cudnn_dropout.h
@@@ -42,8 -42,9 +42,10 @@@ class CudnnDropout : public Dropout
const std::pair<Tensor, vector<Tensor>> Backward(int flag,
const Tensor& grad) override;
+ private:
/// Init cudnn related data structures.
-- void InitCudnn(int size, DataType dtype, Device* dev, Context* ctx);
++ void InitCudnn(int size, DataType dtype, std::shared_ptr<Device> dev,
++ Context* ctx);
private:
bool has_init_cudnn_ = false;
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_lrn.h
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_pooling.cc
----------------------------------------------------------------------
diff --cc src/model/layer/cudnn_pooling.cc
index 842685d,9d288c0..6d7a5b1
--- a/src/model/layer/cudnn_pooling.cc
+++ b/src/model/layer/cudnn_pooling.cc
@@@ -82,7 -82,7 +82,7 @@@ const Tensor CudnnPooling::Forward(int
CHECK_EQ(input.nDim(), 4u);
size_t batchsize = input.shape(0);
DataType dtype = input.data_type();
-- Device *dev = input.device();
++ auto dev = input.device();
if (!has_init_cudnn_) InitCudnn(input);
Shape shape{batchsize, channels_, pooled_height_, pooled_width_};
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_pooling.h
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_softmax.cc
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/dense.cc
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/dense.h
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/dropout.cc
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/dropout.h
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/prelu.cc
----------------------------------------------------------------------
diff --cc src/model/layer/prelu.cc
index 0000000,83a56fa..6eb09d9
mode 000000,100644..100644
--- a/src/model/layer/prelu.cc
+++ b/src/model/layer/prelu.cc
@@@ -1,0 -1,145 +1,145 @@@
+ /**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ #include "singa/model/layer.h"
+ #include "./prelu.h"
+ namespace singa {
+
+ void PReLU::Setup(const Shape& in_sample, const LayerConf &conf) {
+ Layer::Setup(in_sample, conf);
+ out_sample_shape_ = in_sample;
+ channel_shared_ = conf.prelu_conf().channel_shared();
+ format_ = conf.prelu_conf().format();
+ // Push back params into param_values_
+ for (const auto &spec : conf.param()) param_specs_.push_back(spec);
+ param_values_.push_back(&a_);
+ }
+
+ const Tensor PReLU::Forward(int flag, const Tensor &input) {
+ Tensor output;
+ if (!channel_shared_) {
+ size_t n, c, h, w;
+ Tensor temp = (input <= 0.f);
+ if (temp.nDim() == 4) {
+ if (format_ == "NCHW") {
+ n = temp.shape(0);
+ c = temp.shape(1);
+ h = temp.shape(2);
+ w = temp.shape(3);
+ temp.Reshape(Shape{n * c, h * w});
+ Tensor temp_a(Shape{n, c}, input.device(), input.data_type());
+ Uniform(1.f, 1.f, &temp_a);
+ MultRow(a_, &temp_a);
+ temp_a.Reshape(Shape{n * c});
+ MultColumn(temp_a, &temp);
+ } else if (format_ == "NHWC") {
+ n = temp.shape(0);
+ h = temp.shape(1);
+ w = temp.shape(2);
+ c = temp.shape(3);
+ temp.Reshape(Shape{n * h * w, c});
+ MultRow(a_, &temp);
+ } else {
+ LOG(FATAL) << "Incorrect input format for prelu layer.";
+ }
+ } else {
+ LOG(FATAL) << "Incorrect input format for prelu layer.";
+ }
+ output = input * ((input > 0.f) + temp);
+ } else {
+ // share the first param of Tensor A along all channels
+ LOG(FATAL) << "Not implemented";
+ // TODO(wangwei) cannot access the data in this way. The data could be on GPU.
+ auto a = a_.data<float>()[0];
+ output = input * ((input > 0.f) + (input <= 0.f) * a);
+ }
+ if (flag & kTrain) buf_.push(input);
+ return output;
+ }
+
+ const std::pair<Tensor, vector<Tensor> > PReLU::Backward(int flag,
+ const Tensor &grad) {
+ vector<Tensor> param_grad;
+ CHECK(!buf_.empty());
+ Tensor input_grad, input = buf_.top();
+ buf_.pop();
+ Tensor da;
+ da.ResetLike(a_);
+ if (!channel_shared_) {
+ size_t n, c, h, w;
+ Tensor temp1 = (input <= 0.f);
+ if (temp1.nDim() == 4) {
+ if (format_ == "NCHW") {
+ n = temp1.shape(0);
+ c = temp1.shape(1);
+ h = temp1.shape(2);
+ w = temp1.shape(3);
+ temp1.Reshape(Shape{n * c, h * w});
+ Tensor temp_a(Shape{n, c}, grad.device(), grad.data_type());
+ Uniform(1.f, 1.f, &temp_a);
+ MultRow(a_, &temp_a);
+ temp_a.Reshape(Shape{n * c});
+ MultColumn(temp_a, &temp1);
+ temp1.Reshape(Shape{n, c, h, w});
+ } else if (format_ == "NHWC") {
+ n = temp1.shape(0);
+ h = temp1.shape(1);
+ w = temp1.shape(2);
+ c = temp1.shape(3);
+ temp1.Reshape(Shape{n * h * w, c});
+ MultRow(a_, &temp1);
+ temp1.Reshape(Shape{n, h, w, c});
+ } else {
+ LOG(FATAL) << "Incorrect input format for prelu layer.";
+ }
+ } else {
+ LOG(FATAL) << "Incorrect input format for prelu layer.";
+ }
+ input_grad = grad * input * ((input > 0.f) + temp1);
+ Tensor temp2 = grad * input * (input <= 0.f);
+ if (format_ == "NCHW") {
+ Tensor temp3(Shape{n * c}, grad.device(), grad.data_type());
+ temp2.Reshape(Shape{n * c, h * w});
+ SumColumns(temp2, &temp3);
+ temp3.Reshape(Shape{n, c});
+ SumRows(temp3, &da);
+ } else if (format_ == "NHWC") {
+ temp2.Reshape(Shape{n * h * w, c});
+ SumRows(temp2, &da);
+ }
+ } else {
+ // share the first param of Tensor A along all channels
+ LOG(FATAL) << "Not Implemented";
+ // TODO(wangwei) cannot access the data in this way. The data could be on GPU.
+ auto a = a_.data<float>()[0];
+ input_grad = grad * input * ((input > 0.f) + (input <= 0.f) * a);
+ Tensor temp = grad * input * (input <= 0.f);
+ float sum = Sum<float>(temp);
+ Uniform(1.f, 1.f, &da);
+ da *= sum;
+ }
+ param_grad.push_back(da);
+ return std::make_pair(input_grad, param_grad);
+ }
+
-void PReLU::ToDevice(Device *device) {
++void PReLU::ToDevice(std::shared_ptr<Device> device) {
+ Layer::ToDevice(device);
+ a_.ToDevice(device);
+ }
+
+ } // namespace singa
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/prelu.h
----------------------------------------------------------------------
diff --cc src/model/layer/prelu.h
index 0000000,ee571e1..70a9dcf
mode 000000,100644..100644
--- a/src/model/layer/prelu.h
+++ b/src/model/layer/prelu.h
@@@ -1,0 -1,66 +1,66 @@@
+ /**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ #ifndef SINGA_MODEL_LAYER_PRELU_H_
+ #define SINGA_MODEL_LAYER_PRELU_H_
+ #include <utility>
+ #include <string>
+ #include <vector>
+ #include "singa/model/layer.h"
-#include "singa_config.h"
++#include "singa/singa_config.h"
+
+ namespace singa {
+ class PReLU : public Layer {
+ public:
+ /// \copydoc Layer::layer_type()
+ const std::string layer_type() const override { return "PReLU"; }
+
+
+ /// \copydoc Layer::Setup(const LayerConf&);
+ void Setup(const Shape& in_sample, const LayerConf& conf) override;
+ const Shape GetOutputSampleShape() const override {
+ CHECK(out_sample_shape_.size()) << "You may haven't call Setup()";
+ return out_sample_shape_;
+ }
+
+ /// \copydoc Layer::Forward(int flag, const Tensor&)
+ const Tensor Forward(int flag, const Tensor &input) override;
+
+ /// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
+ const std::pair<Tensor, vector<Tensor> > Backward(
+ int flag, const Tensor &grad) override;
+
- void ToDevice(Device *device);
++ void ToDevice(std::shared_ptr<Device> device);
+
+ const bool Channel_shared() const { return channel_shared_; }
+ const Tensor A() const { return a_; }
+ const std::string Format() const { return format_; }
+
+ void Set_a(Tensor a) {
+ a_.ResetLike(a);
+ a_.CopyData(a);
+ }
+
+ protected:
+ bool channel_shared_;
+ std::string format_; // format_ has two valid value, i.e. NCHW, NHWC
+ Tensor a_; // shape of a_ is 2D, i.e. (channels, 1)
+ std::stack<Tensor> buf_;
+ Shape out_sample_shape_;
+ };
+ } // namespace singa
+ #endif // SINGA_MODEL_LAYER_PRELU_H_
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/proto/core.proto
----------------------------------------------------------------------
diff --cc src/proto/core.proto
index cf6e193,3031359..b853b30
--- a/src/proto/core.proto
+++ b/src/proto/core.proto
@@@ -44,16 -45,3 +45,16 @@@ enum CopyDirection
kDeviceToDevice = 3;
kNumDirection = 4;
}
+
+// configuration for device memory pool
+message MemPoolConf {
+ optional string type = 1 [default = "cnmem"];
+ optional uint32 num_devices = 2 [default = 1];
- // allocation size for each device
- optional uint32 alloc_size = 3 [default = 10000000];
++ // allocation size for each device, default is 256 MB
++ optional uint32 alloc_size = 3 [default = 256];
+ // memory manager flag for cnmem
+ // cnmemflag = 0: default flag
+ // cnmemflag = 1: prevent the manager from growing its memory consumption
+ // cnmemflag = 2: prevent the manager from stealing memory
+ optional uint32 cnmemflag = 4 [default = 0];
+}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_adagrad.cc
----------------------------------------------------------------------
diff --cc test/singa/test_adagrad.cc
index 0000000,642e929..c45dcef
mode 000000,100644..100644
--- a/test/singa/test_adagrad.cc
+++ b/test/singa/test_adagrad.cc
@@@ -1,0 -1,96 +1,96 @@@
+ /************************************************************
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *************************************************************/
+
+ #include "gtest/gtest.h"
+ #include "singa/model/optimizer.h"
-#include "singa_config.h"
++#include "singa/singa_config.h"
+ #include <cmath>
+
+ TEST(Adagrad, ApplyCPU) {
+ singa::Adagrad adagrad;
+ float lr = 0.1f;
+ const float v[4] = {0.1, 0.2, 0.3, 0.4};
+ const float g[4] = {0.01, 0.02, 0.03, 0.04};
+
+ singa::Tensor value(singa::Shape{4}), grad(singa::Shape{4});
+ value.CopyDataFromHostPtr(v, 4);
+ grad.CopyDataFromHostPtr(g, 4);
+
+ singa::OptimizerConf conf;
+ adagrad.Setup(conf);
+ adagrad.Apply(0, lr, "xx", grad, &value);
+
+ singa::Tensor v1 = value.Clone();
+ const float* newv1 = v1.data<float>();
+ float history[4];
+ for (int i = 0; i < 4; ++i) history[i] = g[i] * g[i];
+ for (int i = 0; i < 4; ++i)
+ EXPECT_NEAR(newv1[i], v[i] - lr * g[i] / sqrt(history[i] + conf.delta()),
+ 1e-5);
+
+ grad.CopyDataFromHostPtr(g, 4);
+ adagrad.Apply(1, lr, "xx", grad, &value);
+ singa::Tensor v2 = value.Clone();
+ const float* newv2 = v2.data<float>();
+ for (int i = 0; i < 4; ++i) history[i] += g[i] * g[i];
+
+ for (int i = 0; i < 4; ++i)
+ EXPECT_NEAR(newv2[i],
+ newv1[i] - lr * g[i] / sqrt(history[i] + conf.delta()), 1e-5);
+ }
+
+ #ifdef USE_CUDA
+ TEST(Adagrad, ApplyCUDA) {
+ singa::Adagrad adagrad;
+ float lr = 0.1f;
+ const float v[4] = {0.1, 0.2, 0.3, 0.4};
+ const float g[4] = {0.01, 0.02, 0.03, 0.04};
+
- singa::CudaGPU dev;
- singa::Tensor value(singa::Shape{4}, &dev), grad(singa::Shape{4}, &dev);
++ auto dev = std::make_shared<singa::CudaGPU>();
++ singa::Tensor value(singa::Shape{4}, dev), grad(singa::Shape{4}, dev);
+ value.CopyDataFromHostPtr(v, 4);
+ grad.CopyDataFromHostPtr(g, 4);
+
+ singa::OptimizerConf conf;
+ adagrad.Setup(conf);
+ adagrad.Apply(0, lr, "xx", grad, &value);
+
+ singa::Tensor v1 = value.Clone();
+ v1.ToHost();
+ const float* newv1 = v1.data<float>();
+ float history[4];
+ for (int i = 0; i < 4; ++i) history[i] = g[i] * g[i];
+ for (int i = 0; i < 4; ++i)
+ EXPECT_NEAR(newv1[i], v[i] - lr * g[i] / sqrt(history[i] + conf.delta()),
+ 1e-5);
+
+ grad.CopyDataFromHostPtr(g, 4);
+ adagrad.Apply(1, lr, "xx", grad, &value);
+ singa::Tensor v2 = value.Clone();
+ v2.ToHost();
+ const float* newv2 = v2.data<float>();
+ for (int i = 0; i < 4; ++i) history[i] += g[i] * g[i];
+
+ for (int i = 0; i < 4; ++i)
+ EXPECT_FLOAT_EQ(newv2[i],
+ newv1[i] - lr * g[i] / sqrt(history[i] + conf.delta()));
+ }
+ #endif
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_cross_entropy.cc
----------------------------------------------------------------------
diff --cc test/singa/test_cross_entropy.cc
index 0000000,ce60f7c..d73591f
mode 000000,100644..100644
--- a/test/singa/test_cross_entropy.cc
+++ b/test/singa/test_cross_entropy.cc
@@@ -1,0 -1,116 +1,116 @@@
+ /************************************************************
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *************************************************************/
+
+ #include "gtest/gtest.h"
+ #include "singa/core/tensor.h"
+ #include "singa/core/device.h"
+ #include "singa/model/loss.h"
-#include "singa_config.h"
++#include "singa/singa_config.h"
+
+ using singa::Tensor;
+ class TestSoftmaxCrossEntropy : public ::testing::Test {
+ protected:
+ virtual void SetUp() {
+ p.Reshape(singa::Shape{2, 4});
+ t.Reshape(singa::Shape{2, 1});
+ }
+ const float pdat[8] = {0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1};
+ const int tdat[2] = {0, 2};
+
+ singa::Tensor p, t;
+ };
+
+ TEST_F(TestSoftmaxCrossEntropy, CppForward) {
+ p.CopyDataFromHostPtr(pdat, 8);
+ t.AsType(singa::kInt);
+ t.CopyDataFromHostPtr(tdat, 2);
+
+ singa::SoftmaxCrossEntropy cross_entropy;
+ const Tensor& loss = cross_entropy.Forward(p, t);
+ auto ldat = loss.data<float>();
+
+ const float result_test = -log(0.25);
+ EXPECT_FLOAT_EQ(ldat[0], result_test);
+ EXPECT_FLOAT_EQ(ldat[1], result_test);
+ }
+
+ TEST_F(TestSoftmaxCrossEntropy, CppBackward) {
+ p.CopyDataFromHostPtr(pdat, 8);
+ t.AsType(singa::kInt);
+ t.CopyDataFromHostPtr(tdat, 2);
+
+ singa::SoftmaxCrossEntropy cross_entropy;
+ cross_entropy.Forward(p, t);
+ const Tensor& grad = cross_entropy.Backward();
+
+ auto gdat = grad.data<float>();
+ EXPECT_FLOAT_EQ(gdat[0], -0.75);
+ EXPECT_FLOAT_EQ(gdat[1], 0.25);
+ EXPECT_FLOAT_EQ(gdat[2], 0.25);
+ EXPECT_FLOAT_EQ(gdat[3], 0.25);
+ EXPECT_FLOAT_EQ(gdat[4], 0.25);
+ EXPECT_FLOAT_EQ(gdat[5], 0.25);
+ EXPECT_FLOAT_EQ(gdat[6], -0.75);
+ EXPECT_FLOAT_EQ(gdat[7], 0.25);
+ }
+
+ #ifdef USE_CUDA
+
+ TEST_F(TestSoftmaxCrossEntropy, CudaForward) {
+ singa::SoftmaxCrossEntropy cross_entropy;
- singa::CudaGPU dev;
- p.ToDevice(&dev);
- t.ToDevice(&dev);
++ auto dev = std::make_shared<singa::CudaGPU>();
++ p.ToDevice(dev);
++ t.ToDevice(dev);
+ p.CopyDataFromHostPtr(pdat, 8);
+ t.CopyDataFromHostPtr(tdat, 2);
+
+ Tensor loss = cross_entropy.Forward(p, t);
+ loss.ToHost();
+ auto ldat = loss.data<float>();
+
+ const float result_test = -log(0.25);
+ EXPECT_FLOAT_EQ(ldat[0], result_test);
+ EXPECT_FLOAT_EQ(ldat[1], result_test);
+ }
+
+ TEST_F(TestSoftmaxCrossEntropy, CudaBackward) {
+ singa::SoftmaxCrossEntropy cross_entropy;
- singa::CudaGPU dev;
- p.ToDevice(&dev);
- t.ToDevice(&dev);
++ auto dev = std::make_shared<singa::CudaGPU>();
++ p.ToDevice(dev);
++ t.ToDevice(dev);
+ p.CopyDataFromHostPtr(pdat, 8);
+ t.CopyDataFromHostPtr(tdat, 2);
+
+ cross_entropy.Forward(p, t);
+ Tensor grad = cross_entropy.Backward();
+
+ grad.ToHost();
+ auto gdat = grad.data<float>();
+ EXPECT_FLOAT_EQ(gdat[0], -0.75);
+ EXPECT_FLOAT_EQ(gdat[1], 0.25);
+ EXPECT_FLOAT_EQ(gdat[2], 0.25);
+ EXPECT_FLOAT_EQ(gdat[3], 0.25);
+ EXPECT_FLOAT_EQ(gdat[4], 0.25);
+ EXPECT_FLOAT_EQ(gdat[5], 0.25);
+ EXPECT_FLOAT_EQ(gdat[6], -0.75);
+ EXPECT_FLOAT_EQ(gdat[7], 0.25);
+ }
+ #endif // USE_CUDA
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_cudnn_activation.cc
----------------------------------------------------------------------
diff --cc test/singa/test_cudnn_activation.cc
index bed7715,940c6b9..1a619e7
--- a/test/singa/test_cudnn_activation.cc
+++ b/test/singa/test_cudnn_activation.cc
@@@ -46,8 -46,8 +46,8 @@@ TEST(TCudnnActivation, Setup)
TEST(TCudnnActivation, Forward) {
const float x[] = {1.0f, 2.0f, 3.0f, -2.0f, -3.0f, -4.0};
size_t n = sizeof(x) / sizeof(float);
-- singa::CudaGPU cuda(0, 1);
-- singa::Tensor in(singa::Shape{n}, &cuda);
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor in(singa::Shape{n}, cuda);
in.CopyDataFromHostPtr<float>(x, n);
float neg_slope = 0.5f;
@@@ -66,9 -65,9 +65,8 @@@
singa::Tensor out = acti.Forward(singa::kTrain, in);
EXPECT_EQ(n, out.Size());
-- singa::CppCPU host(0, 1);
-- out.ToDevice(&host);
- const float* yptr = out.data<const float*>();
++ out.ToHost();
+ const float* yptr = out.data<float>();
float* y = new float[n];
if (acti.Mode() == "SIGMOID") {
for (size_t i = 0; i < n; i++) y[i] = 1.f / (1.f + exp(-x[i]));
@@@ -87,8 -86,8 +85,8 @@@
TEST(TCudnnActivation, Backward) {
const float x[] = {2.0f, 3.0f, 3.0f, 7.f, 0.0f, 5.0, 1.5, 2.5, -2.5, 1.5};
size_t n = sizeof(x) / sizeof(float);
-- singa::CudaGPU cuda(0, 1);
-- singa::Tensor in(singa::Shape{n}, &cuda);
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor in(singa::Shape{n}, cuda);
in.CopyDataFromHostPtr<float>(x, n);
float neg_slope = 0.5f;
std::string types[] = {"SIGMOID", "TANH", "RELU"};
@@@ -101,22 -100,21 +99,20 @@@
singa::ReLUConf* reluconf = conf.mutable_relu_conf();
reluconf->set_negative_slope(neg_slope);
}
- acti.Setup(conf);
- acti.InitCudnn(n, singa::kFloat32);
+ acti.Setup(Shape{n}, conf);
singa::Tensor out = acti.Forward(singa::kTrain, in);
EXPECT_EQ(n, out.Size());
-- singa::CppCPU host(0, 1);
-- out.ToDevice(&host);
- const float* yptr = out.data<const float*>();
++ out.ToHost();
+ const float* yptr = out.data<float>();
const float grad[] = {2.0f, 1.0f, 2.0f, 0.0f, -2.0f,
-1.0, 1.5, 2.5, -1.5, -2.5};
-- singa::Tensor out_diff(singa::Shape{n}, &cuda);
++ singa::Tensor out_diff(singa::Shape{n}, cuda);
out_diff.CopyDataFromHostPtr<float>(grad, n);
const auto ret = acti.Backward(singa::kTrain, out_diff);
singa::Tensor in_diff = ret.first;
-- in_diff.ToDevice(&host);
- const float* xptr = in_diff.data<const float*>();
++ in_diff.ToHost();
+ const float* xptr = in_diff.data<float>();
float* dx = new float[n];
if (acti.Mode() == "SIGMOID") {
for (size_t i = 0; i < n; i++) dx[i] = grad[i] * yptr[i] * (1. - yptr[i]);
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_cudnn_batchnorm.cc
----------------------------------------------------------------------
diff --cc test/singa/test_cudnn_batchnorm.cc
index d38fdaa,b3b6477..7067b16
--- a/test/singa/test_cudnn_batchnorm.cc
+++ b/test/singa/test_cudnn_batchnorm.cc
@@@ -56,34 -53,31 +53,30 @@@ TEST(CudnnBatchNorm, Forward)
0.150676, 0.153442, -0.0929899, -0.148675,
-0.112459, -0.106284, -0.103074, -0.0668811
};
-- singa::CudaGPU cuda(0, 1);
-- singa::Tensor in(singa::Shape{1,2,4,4}, &cuda);
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor in(singa::Shape{1,2,4,4}, cuda);
in.CopyDataFromHostPtr(x, 1*2*4*4);
const float alpha_[] = {1, 1};
-- singa::Tensor alpha(singa::Shape{1,2,1,1}, &cuda);
++ singa::Tensor alpha(singa::Shape{1,2,1,1}, cuda);
alpha.CopyDataFromHostPtr(alpha_, 1*2*1*1);
const float beta_[] = {0, 0};
-- singa::Tensor beta(singa::Shape{1,2,1,1}, &cuda);
++ singa::Tensor beta(singa::Shape{1,2,1,1}, cuda);
beta.CopyDataFromHostPtr(beta_, 1*2*1*1);
singa::LayerConf conf;
singa::BatchNormConf *batchnorm_conf = conf.mutable_batchnorm_conf();
batchnorm_conf->set_factor(0.9);
- batchnorm_conf->set_channels(2);
- batchnorm_conf->set_height(4);
- batchnorm_conf->set_width(4);
- batchnorm.Setup(conf);
+ batchnorm.Setup(Shape{2, 4, 4}, conf);
-- batchnorm.ToDevice(&cuda);
++ batchnorm.ToDevice(cuda);
batchnorm.set_bnScale(alpha);
batchnorm.set_bnBias(beta);
batchnorm.set_runningMean(beta);
batchnorm.set_runningVariance(beta);
singa::Tensor out = batchnorm.Forward(singa::kTrain, in);
-- singa::CppCPU host(0, 1);
out.ToHost();
- const float *outptr = out.data<const float *>();
+ const float *outptr = out.data<float>();
const auto & shape = out.shape();
EXPECT_EQ(4u, shape.size());
EXPECT_EQ(1u, shape[0]);
@@@ -136,8 -130,8 +129,8 @@@ TEST(CudnnBatchNorm, Backward)
0.150676, 0.153442, -0.0929899, -0.148675,
-0.112459, -0.106284, -0.103074, -0.0668811
};
-- singa::CudaGPU cuda(0, 1);
-- singa::Tensor x_tensor(singa::Shape{1,2,4,4}, &cuda);
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor x_tensor(singa::Shape{1,2,4,4}, cuda);
x_tensor.CopyDataFromHostPtr(x, 1*2*4*4);
singa::LayerConf conf;
@@@ -159,35 -150,35 +149,34 @@@
0.00468428, 0.00735506, -0.00682525, 0.00342023
};
-- singa::Tensor dy_tensor(singa::Shape{1,2,4,4}, &cuda);
++ singa::Tensor dy_tensor(singa::Shape{1,2,4,4}, cuda);
dy_tensor.CopyDataFromHostPtr(dy, 1*2*4*4);
const float alpha_[] = {1, 1};
-- singa::Tensor alpha(singa::Shape{1,2,1,1}, &cuda);
++ singa::Tensor alpha(singa::Shape{1,2,1,1}, cuda);
alpha.CopyDataFromHostPtr(alpha_, 1*2*1*1);
const float beta_[] = {0, 0};
-- singa::Tensor beta(singa::Shape{1,2,1,1}, &cuda);
++ singa::Tensor beta(singa::Shape{1,2,1,1}, cuda);
beta.CopyDataFromHostPtr(beta_, 1*2*1*1);
const float mean_[] = {0.0123405, -0.0622333};
-- singa::Tensor mean(singa::Shape{1,2,1,1}, &cuda);
++ singa::Tensor mean(singa::Shape{1,2,1,1}, cuda);
mean.CopyDataFromHostPtr(mean_, 1*2*1*1);
const float var_[] = {15.9948, 8.68198};
-- singa::Tensor var(singa::Shape{1,2,1,1}, &cuda);
++ singa::Tensor var(singa::Shape{1,2,1,1}, cuda);
var.CopyDataFromHostPtr(var_, 1*2*1*1);
-- batchnorm.ToDevice(&cuda);
++ batchnorm.ToDevice(cuda);
batchnorm.set_bnScale(alpha);
batchnorm.set_bnBias(beta);
batchnorm.set_runningMean(beta);
batchnorm.set_runningVariance(beta);
batchnorm.Forward(singa::kTrain, x_tensor);
const auto ret = batchnorm.Backward(singa::kTrain, dy_tensor);
-- singa::CppCPU host(0, 1);
singa::Tensor dx = ret.first;
-- dx.ToDevice(&host);
- const float *dxptr = dx.data<const float *>();
++ dx.ToHost();
+ const float *dxptr = dx.data<float>();
const auto & shape = dx.shape();
EXPECT_EQ(4u, shape.size());
EXPECT_EQ(1u, shape[0]);
@@@ -228,8 -219,8 +217,8 @@@
EXPECT_NEAR(0.0217477, dxptr[31], 1e-4f);
singa::Tensor dbnScale = ret.second.at(0);
-- dbnScale.ToDevice(&host);
- const float *dbnScaleptr = dbnScale.data<const float *>();
++ dbnScale.ToHost();
+ const float *dbnScaleptr = dbnScale.data<float>();
const auto & dbnScaleShape = dbnScale.shape();
EXPECT_EQ(4u, dbnScaleShape.size());
EXPECT_EQ(1u, dbnScaleShape[0]);
@@@ -241,8 -232,8 +230,8 @@@
EXPECT_NEAR(-0.00219431f, dbnScaleptr[1], 1e-4f);
singa::Tensor dbnBias = ret.second.at(1);
-- dbnBias.ToDevice(&host);
- const float *dbnBiasptr = dbnBias.data<const float *>();
++ dbnBias.ToHost();
+ const float *dbnBiasptr = dbnBias.data<float>();
const auto & dbnBiasShape = dbnBias.shape();
EXPECT_EQ(4u, dbnBiasShape.size());
EXPECT_EQ(1u, dbnBiasShape[0]);
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_cudnn_convolution.cc
----------------------------------------------------------------------
diff --cc test/singa/test_cudnn_convolution.cc
index 2a17da2,44077b7..3b84645
--- a/test/singa/test_cudnn_convolution.cc
+++ b/test/singa/test_cudnn_convolution.cc
@@@ -65,18 -63,18 +63,18 @@@ TEST(CudnnConvolution, Forward)
const size_t batchsize = 1, c = 1, h = 3, w = 3;
const float x[batchsize * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
6.0f, 7.0f, 8.0f, 9.0f};
-- singa::CudaGPU cuda(0, 1);
-- singa::Tensor in(singa::Shape{batchsize, c, h, w}, &cuda);
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor in(singa::Shape{batchsize, c, h, w}, cuda);
in.CopyDataFromHostPtr(x, batchsize * c * h * w);
// Set weight and bias manually
const size_t num_filters = 1;
const float we[num_filters * batchsize * h * w] = {
1.0f, 1.0f, 0.0f, 0.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f};
-- singa::Tensor weight(singa::Shape{num_filters, batchsize * h * w}, &cuda);
++ singa::Tensor weight(singa::Shape{num_filters, batchsize * h * w}, cuda);
weight.CopyDataFromHostPtr(we, batchsize * h * w);
const float b[num_filters] = {1.0f};
-- singa::Tensor bias(singa::Shape{num_filters}, &cuda);
++ singa::Tensor bias(singa::Shape{num_filters}, cuda);
bias.CopyDataFromHostPtr(b, num_filters);
CudnnConvolution conv;
conv.set_weight(weight);
@@@ -102,9 -97,9 +97,8 @@@
// Parameter "flag" does not influence convolution
singa::Tensor out1 = conv.Forward(singa::kTrain, in);
-- singa::CppCPU host(0, 1);
-- out1.ToDevice(&host);
- const float *outptr1 = out1.data<const float *>();
++ out1.ToHost();
+ const float *outptr1 = out1.data<float>();
// Input: 3*3; kernel: 3*3; stride: 2*2; padding: 1*1.
EXPECT_EQ(4u, out1.Size());
@@@ -119,8 -114,8 +113,8 @@@ TEST(CudnnConvolution, Backward)
const size_t batchsize = 1, c = 1, src_h = 3, src_w = 3;
const float x[batchsize * c * src_h * src_w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
6.0f, 7.0f, 8.0f, 9.0f};
-- singa::CudaGPU cuda(0, 1);
-- singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, &cuda);
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, cuda);
in.CopyDataFromHostPtr(x, batchsize * c * src_h * src_w);
// Set weight_ and bias_ manually
@@@ -128,10 -123,10 +122,10 @@@
const float we[num_filters * batchsize * src_h * src_w] = {
1.0f, 1.0f, 0.0f, 0.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f};
singa::Tensor weight(singa::Shape{num_filters, batchsize * src_h * src_w},
-- &cuda);
++ cuda);
weight.CopyDataFromHostPtr(we, batchsize * src_h * src_w);
const float b[num_filters] = {1.0f};
-- singa::Tensor bias(singa::Shape{num_filters}, &cuda);
++ singa::Tensor bias(singa::Shape{num_filters}, cuda);
bias.CopyDataFromHostPtr(b, num_filters);
CudnnConvolution conv;
conv.set_weight(weight);
@@@ -162,14 -154,14 +153,13 @@@
const float dy[batchsize * num_filters * grad_h * grad_w] = {0.1f, 0.2f, 0.3f,
0.4f};
singa::Tensor grad(singa::Shape{batchsize, num_filters, grad_h, grad_w},
-- &cuda);
++ cuda);
grad.CopyDataFromHostPtr(dy, batchsize * num_filters * grad_h * grad_w);
const auto ret = conv.Backward(singa::kTrain, grad);
-- singa::CppCPU host(0, 1);
singa::Tensor in_grad = ret.first;
-- in_grad.ToDevice(&host);
- const float *dx = in_grad.data<const float *>();
++ in_grad.ToHost();
+ const float *dx = in_grad.data<float>();
const float *wptr = we;
EXPECT_EQ(9u, in_grad.Size());
EXPECT_EQ(dy[0] * wptr[4], dx[0]);
@@@ -186,12 -178,12 +176,12 @@@
singa::Tensor dw = ret.second[0];
singa::Tensor db = ret.second[1];
-- dw.ToDevice(&host);
-- db.ToDevice(&host);
- const float *dbptr = db.data<const float *>();
++ dw.ToHost();
++ db.ToHost();
+ const float *dbptr = db.data<float>();
EXPECT_EQ(dy[0] + dy[1] + dy[2] + dy[3], dbptr[0]);
- const float *dwptr = dw.data<const float *>();
+ const float *dwptr = dw.data<float>();
EXPECT_EQ(9u, dw.Size());
EXPECT_EQ(dy[3] * x[4], dwptr[0]);
EXPECT_EQ(dy[3] * x[5] + dy[2] * x[3], dwptr[1]);
@@@ -246,18 -235,18 +233,19 @@@ TEST(CudnnConvolution_AT, Forward)
const size_t batchsize = 1, c = 1, h = 3, w = 3;
const float x[batchsize * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
6.0f, 7.0f, 8.0f, 9.0f};
-- singa::CudaGPU cuda(0, 1);
-- singa::Tensor in(singa::Shape{batchsize, c, h, w}, &cuda);
++
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor in(singa::Shape{batchsize, c, h, w}, cuda);
in.CopyDataFromHostPtr(x, batchsize * c * h * w);
// Set weight and bias manually
const size_t num_filters = 1;
const float we[num_filters * batchsize * h * w] = {
1.0f, 1.0f, 0.0f, 0.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f};
-- singa::Tensor weight(singa::Shape{num_filters, batchsize * h * w}, &cuda);
++ singa::Tensor weight(singa::Shape{num_filters, batchsize * h * w}, cuda);
weight.CopyDataFromHostPtr(we, batchsize * h * w);
const float b[num_filters] = {1.0f};
-- singa::Tensor bias(singa::Shape{num_filters}, &cuda);
++ singa::Tensor bias(singa::Shape{num_filters}, cuda);
bias.CopyDataFromHostPtr(b, num_filters);
CudnnConvolution conv;
conv.set_weight(weight);
@@@ -283,9 -269,9 +268,8 @@@
// Parameter "flag" does not influence convolution
singa::Tensor out1 = conv.Forward(singa::kTrain, in);
-- singa::CppCPU host(0, 1);
-- out1.ToDevice(&host);
- const float *outptr1 = out1.data<const float *>();
++ out1.ToHost();
+ const float *outptr1 = out1.data<float>();
// Input: 3*3; kernel: 3*3; stride: 2*2; padding: 1*1.
EXPECT_EQ(4u, out1.Size());
@@@ -300,8 -286,8 +284,9 @@@ TEST(CudnnConvolution_AT, Backward)
const size_t batchsize = 1, c = 1, src_h = 3, src_w = 3;
const float x[batchsize * c * src_h * src_w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
6.0f, 7.0f, 8.0f, 9.0f};
-- singa::CudaGPU cuda(0, 1);
-- singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, &cuda);
++
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, cuda);
in.CopyDataFromHostPtr(x, batchsize * c * src_h * src_w);
// Set weight_ and bias_ manually
@@@ -309,10 -295,10 +294,10 @@@
const float we[num_filters * batchsize * src_h * src_w] = {
1.0f, 1.0f, 0.0f, 0.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f};
singa::Tensor weight(singa::Shape{num_filters, batchsize * src_h * src_w},
-- &cuda);
++ cuda);
weight.CopyDataFromHostPtr(we, batchsize * src_h * src_w);
const float b[num_filters] = {1.0f};
-- singa::Tensor bias(singa::Shape{num_filters}, &cuda);
++ singa::Tensor bias(singa::Shape{num_filters}, cuda);
bias.CopyDataFromHostPtr(b, num_filters);
CudnnConvolution conv;
conv.set_weight(weight);
@@@ -343,14 -326,14 +325,13 @@@
const float dy[batchsize * num_filters * grad_h * grad_w] = {0.1f, 0.2f, 0.3f,
0.4f};
singa::Tensor grad(singa::Shape{batchsize, num_filters, grad_h, grad_w},
-- &cuda);
++ cuda);
grad.CopyDataFromHostPtr(dy, batchsize * num_filters * grad_h * grad_w);
const auto ret = conv.Backward(singa::kTrain, grad);
-- singa::CppCPU host(0, 1);
singa::Tensor in_grad = ret.first;
-- in_grad.ToDevice(&host);
- const float *dx = in_grad.data<const float *>();
++ in_grad.ToHost();
+ const float *dx = in_grad.data<float>();
const float *wptr = we;
EXPECT_EQ(9u, in_grad.Size());
EXPECT_EQ(dy[0] * wptr[4], dx[0]);
@@@ -367,12 -350,12 +348,12 @@@
singa::Tensor dw = ret.second[0];
singa::Tensor db = ret.second[1];
-- dw.ToDevice(&host);
-- db.ToDevice(&host);
- const float *dbptr = db.data<const float *>();
++ dw.ToHost();
++ db.ToHost();
+ const float *dbptr = db.data<float>();
EXPECT_EQ(dy[0] + dy[1] + dy[2] + dy[3], dbptr[0]);
- const float *dwptr = dw.data<const float *>();
+ const float *dwptr = dw.data<float>();
EXPECT_EQ(9u, dw.Size());
EXPECT_EQ(dy[3] * x[4], dwptr[0]);
EXPECT_EQ(dy[3] * x[5] + dy[2] * x[3], dwptr[1]);
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_cudnn_dropout.cc
----------------------------------------------------------------------
diff --cc test/singa/test_cudnn_dropout.cc
index 32572d0,419dd0c..d06a254
--- a/test/singa/test_cudnn_dropout.cc
+++ b/test/singa/test_cudnn_dropout.cc
@@@ -48,8 -49,8 +49,8 @@@ TEST(CudnnDropout, Setup)
TEST(CudnnDropout, Forward) {
const float x[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
size_t n = sizeof(x) / sizeof(float);
-- singa::CudaGPU cuda(0, 1);
-- singa::Tensor in(singa::Shape{n}, &cuda);
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor in(singa::Shape{n}, cuda);
in.CopyDataFromHostPtr(x, n);
float pdrop = 0.5;
@@@ -67,9 -68,9 +68,8 @@@
for (size_t i = 0; i < n; i++)
EXPECT_FLOAT_EQ(0, GetBitValue(mptr, i) * (GetBitValue(mptr, i) - 1));
-- singa::CppCPU host(0, 1);
-- out1.ToDevice(&host);
- const float* outptr1 = out1.data<const float*>();
++ out1.ToHost();
+ const float* outptr1 = out1.data<float>();
EXPECT_EQ(n, out1.Size());
float scale = 1.0f / (1.0f - pdrop);
// the output value should be 0 or the same as the input
@@@ -78,9 -79,9 +78,9 @@@
EXPECT_EQ(0.f, outptr1[7] * (outptr1[7] - scale * x[7]));
singa::Tensor out2 = drop.Forward(singa::kEval, in);
-- out2.ToDevice(&host);
++ out2.ToHost();
EXPECT_EQ(n, out2.Size());
- const float* outptr2 = out2.data<const float*>();
+ const float* outptr2 = out2.data<float>();
// the output value should be the same as the input
EXPECT_EQ(x[0], outptr2[0]);
EXPECT_EQ(x[1], outptr2[1]);
@@@ -90,8 -91,8 +90,8 @@@
TEST(CudnnDropout, Backward) {
const float x[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
size_t n = sizeof(x) / sizeof(float);
-- singa::CudaGPU cuda(0, 1);
-- singa::Tensor in(singa::Shape{n}, &cuda);
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor in(singa::Shape{n}, cuda);
in.CopyDataFromHostPtr(x, n);
float pdrop = 0.5;
@@@ -105,14 -106,14 +105,13 @@@
singa::Tensor out1 = drop.Forward(singa::kTrain, in);
const float dy[] = {4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 1.0f, 2.0f, 3.0f};
-- singa::Tensor grad(singa::Shape{n}, &cuda);
++ singa::Tensor grad(singa::Shape{n}, cuda);
grad.CopyDataFromHostPtr(dy, n);
const auto ret = drop.Backward(singa::kTrain, grad);
-- singa::CppCPU host(0, 1);
singa::Tensor in_grad = ret.first;
-- in_grad.ToDevice(&host);
- const float* dx = in_grad.data<const float*>();
++ in_grad.ToHost();
+ const float* dx = in_grad.data<float>();
singa::Tensor mask(drop.mask().shape(), drop.mask().data_type());
mask.CopyData(drop.mask());
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_cudnn_lrn.cc
----------------------------------------------------------------------
diff --cc test/singa/test_cudnn_lrn.cc
index 390c588,f7ec046..4ee0c54
--- a/test/singa/test_cudnn_lrn.cc
+++ b/test/singa/test_cudnn_lrn.cc
@@@ -58,8 -58,8 +58,8 @@@ TEST(CudnnLRN, Forward)
0.0597329, -0.0530868, 0.0124246, 0.108429,
0.0451175, 0.0247055, 0.0304345, 0.0179575
};
-- singa::CudaGPU cuda(0, 1);
-- singa::Tensor in(singa::Shape{1,2,4,4}, &cuda);
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor in(singa::Shape{1,2,4,4}, cuda);
in.CopyDataFromHostPtr(x, 1*2*4*4);
singa::LayerConf conf;
@@@ -68,12 -68,12 +68,11 @@@
lrn_conf->set_local_size(3);
lrn_conf->set_alpha(0.1);
lrn_conf->set_beta(0.75);
- lrn.Setup(conf);
+ lrn.Setup(Shape{2, 4, 4}, conf);
singa::Tensor out = lrn.Forward(singa::kTrain, in);
-- singa::CppCPU host(0, 1);
-- out.ToDevice(&host);
- const float *outptr = out.data<const float *>();
++ out.ToHost();
+ const float *outptr = out.data<float>();
const auto & shape = out.shape();
EXPECT_EQ(4u, shape.size());
EXPECT_EQ(1u, shape[0]);
@@@ -128,8 -128,8 +127,8 @@@ TEST(CudnnLRN, Backward)
0.0597329, -0.0530868, 0.0124246, 0.108429,
0.0451175, 0.0247055, 0.0304345, 0.0179575
};
-- singa::CudaGPU cuda(0, 1);
-- singa::Tensor x_tensor(singa::Shape{1,2,4,4}, &cuda);
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor x_tensor(singa::Shape{1,2,4,4}, cuda);
x_tensor.CopyDataFromHostPtr(x, 1*2*4*4);
const float dy[] = {
@@@ -143,7 -143,7 +142,7 @@@
0.177807, 0.000892812, -0.00113197, 0.00327798
};
-- singa::Tensor dy_tensor(singa::Shape{1,2,4,4}, &cuda);
++ singa::Tensor dy_tensor(singa::Shape{1,2,4,4}, cuda);
dy_tensor.CopyDataFromHostPtr(dy, 1*2*4*4);
singa::LayerConf conf;
@@@ -156,10 -156,10 +155,9 @@@
lrn.Forward(singa::kTrain, x_tensor);
const auto ret = lrn.Backward(singa::kTrain, dy_tensor);
-- singa::CppCPU host(0, 1);
singa::Tensor dx = ret.first;
-- dx.ToDevice(&host);
- const float *dxptr = dx.data<const float *>();
++ dx.ToHost();
+ const float *dxptr = dx.data<float>();
const auto & shape = dx.shape();
EXPECT_EQ(4u, shape.size());
EXPECT_EQ(1u, shape[0]);
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_cudnn_pooling.cc
----------------------------------------------------------------------
diff --cc test/singa/test_cudnn_pooling.cc
index e66f212,2a98ab4..79051a3
--- a/test/singa/test_cudnn_pooling.cc
+++ b/test/singa/test_cudnn_pooling.cc
@@@ -58,8 -56,8 +56,8 @@@ TEST(CudnnPooling, Forward)
const size_t batchsize = 1, c = 1, h = 3, w = 3;
const float x[batchsize * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
6.0f, 7.0f, 8.0f, 9.0f};
-- singa::CudaGPU cuda(0, 1);
-- singa::Tensor in(singa::Shape{batchsize, c, h, w}, &cuda);
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor in(singa::Shape{batchsize, c, h, w}, cuda);
in.CopyDataFromHostPtr(x, batchsize * c * h * w);
CudnnPooling pool;
@@@ -79,9 -74,9 +74,8 @@@
// Parameter "flag" does not influence pooling
singa::Tensor out1 = pool.Forward(singa::kTrain, in);
-- singa::CppCPU host(0, 1);
-- out1.ToDevice(&host);
- const float *outptr1 = out1.data<const float *>();
++ out1.ToHost();
+ const float *outptr1 = out1.data<float>();
// Input: 3*3; kernel: 2*2; stride: 1*1; no padding.
EXPECT_EQ(4u, out1.Size());
EXPECT_EQ(5.0f, outptr1[0]);
@@@ -95,8 -90,8 +89,8 @@@ TEST(CudnnPooling, Backward)
const size_t batchsize = 1, c = 1, src_h = 3, src_w = 3;
const float x[batchsize * src_h * src_w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
6.0f, 7.0f, 8.0f, 9.0f};
-- singa::CudaGPU cuda(0, 1);
-- singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, &cuda);
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, cuda);
in.CopyDataFromHostPtr(x, batchsize * c * src_h * src_w);
CudnnPooling pool;
@@@ -119,14 -111,14 +110,13 @@@
// grad
const size_t grad_h = 2, grad_w = 2;
const float dy[batchsize * c * grad_h * grad_w] = {0.1f, 0.2f, 0.3f, 0.4f};
-- singa::Tensor grad(singa::Shape{batchsize, c, grad_h, grad_w}, &cuda);
++ singa::Tensor grad(singa::Shape{batchsize, c, grad_h, grad_w}, cuda);
grad.CopyDataFromHostPtr(dy, batchsize * c * grad_h * grad_w);
const auto ret = pool.Backward(singa::kTrain, grad);
-- singa::CppCPU host(0, 1);
singa::Tensor in_grad = ret.first;
-- in_grad.ToDevice(&host);
- const float *dx = in_grad.data<const float *>();
++ in_grad.ToHost();
+ const float *dx = in_grad.data<float>();
EXPECT_EQ(9u, in_grad.Size());
EXPECT_EQ(0.0f, dx[0]);
EXPECT_EQ(0.0f, dx[1]);