You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@singa.apache.org by wa...@apache.org on 2016/06/24 06:51:34 UTC
[1/6] incubator-singa git commit: SINGA-175 Add memory management
APIs and implement a subclass using CNMeM
Repository: incubator-singa
Updated Branches:
refs/heads/dev 9abd7910d -> dd08f4130
SINGA-175 Add memory management APIs and implement a subclass using CNMeM
Add CNMeM as a submodule in lib/
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/683b3a76
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/683b3a76
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/683b3a76
Branch: refs/heads/dev
Commit: 683b3a76ec1bf3d44f32cefdd86123efa8b9c188
Parents: 01aaf49
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Fri Jun 10 15:02:13 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Fri Jun 10 15:02:13 2016 +0800
----------------------------------------------------------------------
.gitmodules | 3 +++
lib/cnmem | 1 +
2 files changed, 4 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/683b3a76/.gitmodules
----------------------------------------------------------------------
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..cd0a9d2
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "lib/cnmem"]
+ path = lib/cnmem
+ url = https://github.com/NVIDIA/cnmem.git
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/683b3a76/lib/cnmem
----------------------------------------------------------------------
diff --git a/lib/cnmem b/lib/cnmem
new file mode 160000
index 0000000..28a182d
--- /dev/null
+++ b/lib/cnmem
@@ -0,0 +1 @@
+Subproject commit 28a182d49529da49f4ac4e3941cec3edf16b3540
[3/6] incubator-singa git commit: SINGA-175 Add memory management
APIs and implement a subclass using CNMeM
Posted by wa...@apache.org.
SINGA-175 Add memory management APIs and implement a subclass using CNMeM
Add base memory pool class.
Implement two subclasses, CnMemPool and CudaMemPool.
Add test for the memory pools.
TODO: replace Device* with std::shared_ptr<Device> to avoid memory errors, because
the order in which devices and tensors are destructed is not fixed (a device may be
freed before the tensors that still use it).
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/077d13e8
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/077d13e8
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/077d13e8
Branch: refs/heads/dev
Commit: 077d13e8052aa92679909b619966481a383a651f
Parents: ce3e6dc
Author: liyuchenmike@gmail.com <li...@gmail.com>
Authored: Wed Jun 22 20:26:41 2016 +0800
Committer: liyuchenmike@gmail.com <li...@gmail.com>
Committed: Wed Jun 22 20:26:41 2016 +0800
----------------------------------------------------------------------
CMakeLists.txt | 6 +-
include/singa/core/device.h | 5 ++
include/singa/core/memory.h | 46 +++++++++++++++
include/singa/model/loss.h | 2 +-
src/core/device/cuda_gpu.cc | 59 +++++++++++++++++--
src/core/memory/memory.cc | 69 ++++++++++++++++++++++
src/proto/core.proto | 13 +++++
test/singa/test_memory.cc | 111 ++++++++++++++++++++++++++++++++++++
test/singa/test_mse.cc | 13 ++++-
test/singa/test_tensor_math.cc | 4 ++
10 files changed, 319 insertions(+), 9 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/077d13e8/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f6240d2..c34b6ce 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,14 +13,15 @@ ENDIF()
#message(STATUS "${CMAKE_CXX_FLAGS}")
SET(SINGA_INCLUDE_DIR
- "${CMAKE_SOURCE_DIR}/include;${CMAKE_SOURCE_DIR}/lib/cnmem/lib;${PROJECT_BINARY_DIR}")
+ #"${CMAKE_SOURCE_DIR}/include;${CMAKE_SOURCE_DIR}/lib/cnmem/lib;${CMAKE_SOURCE_DIR}/lib/cnmen/include;${PROJECT_BINARY_DIR}")
+ "${CMAKE_SOURCE_DIR}/include;${CMAKE_SOURCE_DIR}/lib/cnmem/include;${PROJECT_BINARY_DIR}")
#message(STATUS "include path: ${SINGA_INCLUDE_DIR}")
INCLUDE_DIRECTORIES(${SINGA_INCLUDE_DIR})
#OPTION(CPU_ONLY "use GPU libs" OFF)
OPTION(USE_CBLAS "Use CBlas libs" ON)
OPTION(USE_CUDA "Use Cuda libs" ON)
-OPTION(USE_CUDNN "Use Cudnn libs" ON)
+OPTION(USE_CUDNN "Use Cudnn libs" OFF)
OPTION(USE_OPENCV "Use opencv" OFF)
OPTION(USE_LMDB "Use LMDB libs" OFF)
@@ -38,5 +39,6 @@ SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
ADD_SUBDIRECTORY(lib/cnmem)
+LIST(APPEND SINGA_LINKER_LIBS cnmem)
ADD_SUBDIRECTORY(src)
ADD_SUBDIRECTORY(test)
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/077d13e8/include/singa/core/device.h
----------------------------------------------------------------------
diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index 8c95dc7..fc98a23 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -147,6 +147,8 @@ class CudaGPU : public Device {
~CudaGPU();
CudaGPU(int id = 0, int num_executors = 1, string scheduler = "sync",
string vm = "gc-only");
+ CudaGPU(const MemPoolConf& mem_conf,
+ int id = 0, int num_executors = 1, string scheduler = "sync");
void SetRandSeed(unsigned seed) override;
static void DeviceQuery();
@@ -180,6 +182,9 @@ class CudaGPU : public Device {
/// Free cpu memory.
void Free(void* ptr) override;
+
+ private:
+ DeviceMemPool* pool;
};
/// CudaCPU which uses cudaMallocHost to allocate pinned memory for host.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/077d13e8/include/singa/core/memory.h
----------------------------------------------------------------------
diff --git a/include/singa/core/memory.h b/include/singa/core/memory.h
index db09043..e4e1e63 100644
--- a/include/singa/core/memory.h
+++ b/include/singa/core/memory.h
@@ -19,10 +19,56 @@
#ifndef SINGA_CORE_MEMORY_H_
#define SINGA_CORE_MEMORY_H_
+#include "cnmem.h"
+#include <mutex>
+
namespace singa {
/// Manage device memory pool including garbage collection, memory opt.
class VirtualMemory {};
+class DeviceMemPool {
+ public:
+ virtual void InitPool() = 0;
+ virtual void Malloc(void** ptr, const size_t size) = 0;
+ virtual void Free(void* ptr) = 0;
+ virtual ~DeviceMemPool(){};
+};
+
+class CnMemPool : public DeviceMemPool {
+ public:
+ int status = 1;
+
+ void InitPool();
+
+ /// numDevices: total number of available GPU cards.
+ /// initSize: all devices will be allocated with this size
+ /// manager_flags: pool manager flag (one for all devices)
+ /// flag = 0; default flag
+ /// flag = 1: Prevent the manager from growing its memory consumption
+ /// flag = 2; Prevent the manager from stealing memory.
+ void InitPool(int numDevices, size_t initSize, unsigned flag);
+
+ void Malloc(void** ptr, const size_t size);
+ void Free(void* ptr);
+
+ // release all memory and set cnmem manager to unintialized
+ ~CnMemPool();
+
+ private:
+ // whether the (global) memory pool has been initialized
+ static bool initialized;
+ // lock on the initialized variable
+ static std::mutex mtx;
+};
+
+class CudaMemPool : public DeviceMemPool {
+ public:
+ void InitPool(){};
+ void Malloc(void** ptr, const size_t size);
+ void Free(void* ptr);
+ ~CudaMemPool(){};
+};
+
} // namespace singa
#endif // SINGA_CORE_MEMORY_H_
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/077d13e8/include/singa/model/loss.h
----------------------------------------------------------------------
diff --git a/include/singa/model/loss.h b/include/singa/model/loss.h
index 6a23067..dcf0da4 100644
--- a/include/singa/model/loss.h
+++ b/include/singa/model/loss.h
@@ -35,7 +35,7 @@ class Loss {
loss.ParseFromString(conf);
Setup(loss);
}
-
+ virtual ~Loss(){};
/// Set meta fields from user configurations.
virtual void Setup(const LossConf& conf) {}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/077d13e8/src/core/device/cuda_gpu.cc
----------------------------------------------------------------------
diff --git a/src/core/device/cuda_gpu.cc b/src/core/device/cuda_gpu.cc
index a47f6fe..d9a0985 100644
--- a/src/core/device/cuda_gpu.cc
+++ b/src/core/device/cuda_gpu.cc
@@ -22,7 +22,7 @@
#include <cuda_runtime.h>
#include <curand.h>
#include <chrono>
-
+#include <iostream>
#include "singa/core/device.h"
#include "singa/utils/cuda_utils.h"
namespace singa {
@@ -42,6 +42,8 @@ CudaGPU::~CudaGPU() {
CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
}
#endif
+ delete pool;
+ LOG(INFO) << "device has been deleted";
}
CudaGPU::CudaGPU(int id, int num_executors,
@@ -67,6 +69,48 @@ CudaGPU::CudaGPU(int id, int num_executors,
auto status = cudnnCreate(&ctx_.cudnn_handle);
CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
#endif // USE_CUDNN
+
+ // initialize cnmem memory management as default
+ pool = new CnMemPool();
+ ((CnMemPool*)pool)->InitPool();
+}
+
+CudaGPU::CudaGPU(const MemPoolConf& mem_conf,int id, int num_executors,
+ string scheduler)
+ : Device(id, num_executors, scheduler, "gc-only") {
+ if (id == -1)
+ id = FindDevice(0);
+ lang_ = kCuda;
+ ctx_.stream = NULL; // use the default sync stream
+ // TODO(wangwei) create one handle for each steam?
+ CUDA_CHECK(cudaSetDevice(FindDevice(0)));
+ // use curandCreateGeneratorHost for CudaHost device
+ CURAND_CHECK(
+ curandCreateGenerator(&ctx_.curand_generator, CURAND_RNG_PSEUDO_DEFAULT));
+ auto seed = std::chrono::system_clock::now().time_since_epoch().count();
+ SetRandSeed(seed);
+ // TODO(wangwei) if one generator per stream, then need diff offset per gen?
+ CURAND_CHECK(curandSetGeneratorOffset(ctx_.curand_generator, 0));
+ CUBLAS_CHECK(cublasCreate(&(ctx_.cublas_handle)));
+
+#ifdef USE_CUDNN
+ // TODO(wangwei) create one handle for each stream?
+ auto status = cudnnCreate(&ctx_.cudnn_handle);
+ CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
+#endif // USE_CUDNN
+
+ // initialize memory management for cuda devices
+ string memoryPoolType = mem_conf.type();
+ if(memoryPoolType.compare("cnmem") == 0) {
+ pool = new CnMemPool();
+ int num_devices = mem_conf.num_devices();
+ size_t alloc_size = mem_conf.alloc_size();
+ unsigned flag = mem_conf.cnmemflag();
+ ((CnMemPool*)pool)->InitPool(num_devices, alloc_size, flag);
+ }
+ else {
+ pool = new CudaMemPool();
+ }
}
void CudaGPU::SetRandSeed(unsigned seed) {
@@ -90,7 +134,8 @@ void CudaGPU::CopyToFrom(void* dst, const void* src, size_t nBytes,
void* CudaGPU::Malloc(int size) {
void* ptr = nullptr;
if (size > 0) {
- CUDA_CHECK(cudaMalloc(&ptr, size));
+ //CUDA_CHECK(cudaMalloc((void**)&ptr,size));
+ pool->Malloc((void**)&ptr,size);
CUDA_CHECK(cudaMemset(ptr, 0, size));
}
return ptr;
@@ -98,8 +143,14 @@ void* CudaGPU::Malloc(int size) {
/// Free cpu memory.
void CudaGPU::Free(void* ptr) {
- if (ptr != nullptr)
- CUDA_CHECK(cudaFree(ptr));
+ LOG(INFO) << "Cuda free is called";
+ LOG(INFO) << "pool pointer" << pool << "\n";
+ LOG(INFO) << "pool status:" << ((CnMemPool*)pool)->status;
+ if (ptr != nullptr) {
+ //CUDA_CHECK(cudaFree(ptr));
+ pool->Free(ptr);
+ }
+ LOG(INFO) << "free memory is successed";
}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/077d13e8/src/core/memory/memory.cc
----------------------------------------------------------------------
diff --git a/src/core/memory/memory.cc b/src/core/memory/memory.cc
index a1cf5db..c5878a6 100644
--- a/src/core/memory/memory.cc
+++ b/src/core/memory/memory.cc
@@ -18,3 +18,72 @@
#include "singa/core/memory.h"
+#include "singa/utils/logging.h"
+#include <iostream>
+
+namespace singa {
+
+bool singa::CnMemPool::initialized = false;
+std::mutex singa::CnMemPool::mtx;
+
+void CnMemPool::InitPool(int numDevices, size_t initSize, unsigned flag) {
+ mtx.lock();
+ if(!initialized) {
+ CHECK_GE(numDevices, 1);
+ cnmemDevice_t* settingPtr = new cnmemDevice_t[numDevices];
+ for(int i = 0; i < numDevices; i++) {
+ settingPtr[i].device = i;
+ settingPtr[i].size = initSize;
+ settingPtr[i].numStreams = 0;
+ settingPtr[i].streams = NULL;
+ settingPtr[i].streamSizes = 0;
+ }
+ cnmemStatus_t status = cnmemInit(numDevices, settingPtr, flag);
+ CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS) << " " << cnmemGetErrorString(status);
+ delete[] settingPtr;
+ initialized = true;
+ }
+ mtx.unlock();
+}
+
+void CnMemPool::InitPool() {
+ int defaultNumDevices = 1;
+ size_t defaultSize = 1000000U;
+ InitPool(defaultNumDevices,defaultSize,cnmemManagerFlags_t::CNMEM_FLAGS_DEFAULT);
+}
+
+CnMemPool::~CnMemPool() {
+ mtx.lock();
+ if(initialized) {
+ cnmemStatus_t status = cnmemFinalize();
+ CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS) << " " << cnmemGetErrorString(status);
+ initialized = false;
+ }
+ mtx.unlock();
+ LOG(INFO) << "cnmem has been freed";
+}
+
+
+void CnMemPool::Malloc(void** ptr, const size_t size) {
+ cnmemStatus_t status = cnmemMalloc(ptr,size,NULL);
+ CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS) << " " << cnmemGetErrorString(status);
+}
+
+void CnMemPool::Free(void* ptr) {
+ LOG(INFO) << "cnmem free is called !!!!!!!!!!!";
+ cnmemStatus_t status = cnmemFree(ptr,NULL);
+ CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS) << " " << cnmemGetErrorString(status);
+ LOG(INFO) << "cnmem free is terminated";
+}
+
+void CudaMemPool::Malloc(void** ptr, const size_t size) {
+ cudaError_t status = cudaMalloc(ptr,size);
+ CHECK_EQ(status, cudaError_t::cudaSuccess);
+}
+
+void CudaMemPool::Free(void* ptr) {
+ cudaError_t status = cudaFree(ptr);
+ CHECK_EQ(status, cudaError_t::cudaSuccess);
+}
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/077d13e8/src/proto/core.proto
----------------------------------------------------------------------
diff --git a/src/proto/core.proto b/src/proto/core.proto
index 88d7f12..cf6e193 100644
--- a/src/proto/core.proto
+++ b/src/proto/core.proto
@@ -44,3 +44,16 @@ enum CopyDirection {
kDeviceToDevice = 3;
kNumDirection = 4;
}
+
+// configuration for device memory pool
+message MemPoolConf {
+ optional string type = 1 [default = "cnmem"];
+ optional uint32 num_devices = 2 [default = 1];
+ // allocation size for each device
+ optional uint32 alloc_size = 3 [default = 10000000];
+ // memory manager flag for cnmem
+ // cnmemflag = 0: default flag
+ // cnmemflag = 1: prevent the manager from growing its memory consumption
+ // cnmemflag = 2: prevent the manager from stealing memory
+ optional uint32 cnmemflag = 4 [default = 0];
+}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/077d13e8/test/singa/test_memory.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_memory.cc b/test/singa/test_memory.cc
new file mode 100644
index 0000000..f5e464d
--- /dev/null
+++ b/test/singa/test_memory.cc
@@ -0,0 +1,111 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied. See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include "singa/utils/logging.h"
+#include "singa/core/memory.h"
+#include "singa/singa_config.h"
+#include <sys/time.h>
+
+#ifdef USE_CUDA
+TEST(CnmemPool, PoolInit) {
+ singa::CnMemPool pool;
+ pool.InitPool();
+}
+
+TEST(CnmemPool, PoolInitAll) {
+ singa::CnMemPool pool;
+ int nDevices;
+ cudaGetDeviceCount(&nDevices);
+ CHECK_GE(nDevices,1);
+ pool.InitPool(nDevices,1000000U,0);
+}
+
+TEST(CnmemPool, UsePool) {
+ singa::CnMemPool pool;
+ pool.InitPool();
+ int numOfTests = 10;
+ int numOfWriteVsRead = 3;
+ int allocSize = 1000000U;
+ for(int i = 0; i < numOfTests; i++) {
+ int** memPtrs = new int*[numOfWriteVsRead];
+ for(int j = 0; j < numOfWriteVsRead; j++) {
+ pool.Malloc((void**)(&memPtrs[j]), allocSize);
+ }
+ pool.Free(memPtrs[0]);
+ delete[] memPtrs;
+ }
+}
+
+TEST(CudaMemPool, UsePool) {
+ singa::CudaMemPool pool;
+ int numOfTests = 10;
+ int numOfWriteVsRead = 3;
+ int allocSize = 1000000U;
+ for(int i = 0; i < numOfTests; i++) {
+ int** memPtrs = new int*[numOfWriteVsRead];
+ for(int j = 0; j < numOfWriteVsRead; j++) {
+ pool.Malloc((void**)(&memPtrs[j]), allocSize);
+ }
+ pool.Free(memPtrs[0]);
+ delete[] memPtrs;
+ }
+}
+
+TEST(MemPool, CompareCudaCnmem) {
+ singa::CudaMemPool cudaPool;
+ singa::CnMemPool cnPool;
+ cnPool.InitPool();
+
+ int numOfTests = 10000;
+ int allocSize = 1000000U;
+ struct timeval start,end;
+ double t1,t2;
+
+ singa::DeviceMemPool* pool = NULL;
+ pool = &cnPool;
+
+ gettimeofday(&start,NULL);
+ for(int i = 0; i < numOfTests; i++) {
+ int* memPtrs = NULL;
+ pool->Malloc((void**)&memPtrs, allocSize);
+ pool->Free(memPtrs);
+ }
+ gettimeofday(&end,NULL);
+
+ t1 = start.tv_sec * 1000 + start.tv_usec/1000;
+ t2 = end.tv_sec * 1000 + end.tv_usec/1000;
+ LOG(INFO) << "cnmem time: " << t2-t1 << " ms" << std::endl;
+
+ pool = &cudaPool;
+ gettimeofday(&start,NULL);
+ for(int i = 0; i < numOfTests; i++) {
+ int* memPtrs = NULL;
+ pool->Malloc((void**)&memPtrs, allocSize);
+ pool->Free(memPtrs);
+ }
+ gettimeofday(&end,NULL);
+
+ t1 = start.tv_sec * 1000 + start.tv_usec/1000;
+ t2 = end.tv_sec * 1000 + end.tv_usec/1000;
+ LOG(INFO) << "cuda time: " << t2-t1 << " ms" << std::endl;
+}
+#endif // USE_CUDA
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/077d13e8/test/singa/test_mse.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_mse.cc b/test/singa/test_mse.cc
index 2c02273..7c6066e 100644
--- a/test/singa/test_mse.cc
+++ b/test/singa/test_mse.cc
@@ -68,11 +68,11 @@ TEST_F(TestMSE, CppBackward) {
#endif
#ifdef USE_CUDA
TEST_F(TestMSE, CudaForward) {
- singa::MSE mse;
+ singa::MSE* mse = new singa::MSE();
singa::CudaGPU dev;
p.ToDevice(&dev);
t.ToDevice(&dev);
- Tensor loss = mse.Forward(p, t);
+ Tensor loss = mse->Forward(p, t);
loss.ToHost();
auto ldat = loss.data<const float*>();
@@ -85,6 +85,12 @@ TEST_F(TestMSE, CudaForward) {
}
EXPECT_FLOAT_EQ(ldat[i], 0.5 * l);
}
+ LOG(INFO) << "Before delete pxxxxxxxxxxxxxxxxxxxxxxxx";
+ p.ToHost();
+ LOG(INFO) << "Before delete tyyyyyyyyyyyyyyyyyyyyyyy";
+ t.ToHost();
+ LOG(INFO) << "terminate-xxxxxxxxxxxxxxxxxx-";
+ delete mse;
}
TEST_F(TestMSE, CudaBackward) {
singa::MSE mse;
@@ -98,5 +104,8 @@ TEST_F(TestMSE, CudaBackward) {
for (size_t i = 0; i < grad.Size(); i++)
EXPECT_FLOAT_EQ(gdat[i], (1.0f / p.shape().at(0)) * (pdat[i] - tdat[i]));
+ p.ToHost();
+ t.ToHost();
+
}
#endif
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/077d13e8/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor_math.cc b/test/singa/test_tensor_math.cc
index 170b96c..b18e465 100644
--- a/test/singa/test_tensor_math.cc
+++ b/test/singa/test_tensor_math.cc
@@ -302,6 +302,8 @@ TEST_F(TestTensorMath, MultCuda) {
EXPECT_FLOAT_EQ(oPtr[i * 4 + j], x[i]);
}
}
+ d.ToHost();
+ p.ToHost();
}
TEST_F(TestTensorMath, AddColumnCuda) {
@@ -479,6 +481,7 @@ TEST_F(TestTensorMath, SumRowsCuda) {
}
EXPECT_FLOAT_EQ(tptr[i], tmp);
}
+ d.ToHost();
}
TEST_F(TestTensorMath, SumColumnCuda) {
singa::CudaGPU dev;
@@ -495,5 +498,6 @@ TEST_F(TestTensorMath, SumColumnCuda) {
}
EXPECT_FLOAT_EQ(tptr[i], tmp);
}
+ d.ToHost();
}
#endif
[4/6] incubator-singa git commit: changed all device pointer to
shared pointer
Posted by wa...@apache.org.
Changed all Device pointers to shared pointers (std::shared_ptr<Device>)
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/5651383f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/5651383f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/5651383f
Branch: refs/heads/dev
Commit: 5651383f5dbe0ab17eeda70f491d837a24bcb4ab
Parents: 077d13e
Author: liyuchenmike@gmail.com <li...@gmail.com>
Authored: Wed Jun 22 21:06:38 2016 +0800
Committer: liyuchenmike@gmail.com <li...@gmail.com>
Committed: Wed Jun 22 21:06:38 2016 +0800
----------------------------------------------------------------------
include/singa/core/device.h | 7 ++--
include/singa/core/tensor.h | 10 ++---
include/singa/model/layer.h | 2 +-
src/core/device/cpp_cpu.cc | 2 +-
src/core/device/cuda_gpu.cc | 5 ---
src/core/device/device.cc | 2 +-
src/core/memory/memory.cc | 3 --
src/core/tensor/tensor.cc | 19 +++++-----
src/model/layer/batchnorm.cc | 2 +-
src/model/layer/batchnorm.h | 2 +-
src/model/layer/dense.cc | 2 +-
src/model/layer/dense.h | 2 +-
src/model/layer/dropout.cc | 2 +-
src/model/layer/dropout.h | 2 +-
test/singa/test_dense.cc | 33 +++++++----------
test/singa/test_memory.cc | 6 +--
test/singa/test_mse.cc | 17 ++++-----
test/singa/test_sgd.cc | 8 ++--
test/singa/test_tensor.cc | 6 +--
test/singa/test_tensor_math.cc | 74 ++++++++++++++++++-------------------
20 files changed, 94 insertions(+), 112 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/include/singa/core/device.h
----------------------------------------------------------------------
diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index fc98a23..d2b5b12 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -23,6 +23,7 @@
#include <vector>
#include <string>
#include <functional>
+#include <memory>
#include "singa/singa_config.h"
#include "singa/core/common.h"
#include "singa/core/memory.h"
@@ -75,7 +76,7 @@ class Device {
return lang_;
}
- Device* host() const { return host_;}
+ std::shared_ptr<Device> host() const { return host_;}
Context* context(int k) {
return &ctx_;
@@ -107,7 +108,7 @@ class Device {
// SafeQueue<Operation> op_queue_;
// SafeQueue<Operation> op_log_;
/// The host device
- Device* host_;
+ std::shared_ptr<Device> host_;
// TODO(wangwei) define multiple contexts, one per executor
Context ctx_;
};
@@ -134,7 +135,7 @@ class CppCPU : public Device {
};
/// a singleton CppDevice as the host for all devices.
-extern CppCPU defaultDevice;
+extern std::shared_ptr<Device> defaultDevice;
// Implement Device using OpenCL libs.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index bb8d7f8..8f73047 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -67,8 +67,8 @@ class Tensor {
Tensor();
explicit Tensor(Shape &&shape, DataType dtype = kFloat32);
explicit Tensor(const Shape &shape, DataType dtype = kFloat32);
- Tensor(Shape &&shape, Device *dev, DataType dtype = kFloat32);
- Tensor(const Shape &shape, Device *dev, DataType dtype = kFloat32);
+ Tensor(Shape &&shape, std::shared_ptr<Device> dev, DataType dtype = kFloat32);
+ Tensor(const Shape &shape, std::shared_ptr<Device> dev, DataType dtype = kFloat32);
/// Copy Tensor to share the internal data. No deep copy.
Tensor(const Tensor &from);
@@ -80,7 +80,7 @@ class Tensor {
/// blob_ is allocated in constructors.
Blob *blob() const { return blob_; }
- Device *device() const { return device_; }
+ std::shared_ptr<Device> device() const { return device_; }
/// Return immutable Tensor values with given type.
template <typename DType>
@@ -125,7 +125,7 @@ class Tensor {
/// Reset the device.
/// If the target device is a diff device, then do deep data copy.
- void ToDevice(Device *dev);
+ void ToDevice(std::shared_ptr<Device> dev);
/// Equivalent to ToDevice(host_dev).
void ToHost();
@@ -192,7 +192,7 @@ class Tensor {
protected:
bool transpose_ = false;
DataType data_type_ = kFloat32;
- Device *device_ = nullptr;
+ std::shared_ptr<Device> device_ = nullptr;
/// Note: blob_ is allocated in lazy manner to avoid frequent malloc/free.
/// If you want to get an allocated Blob, use blob() instead of blob_.
Blob *blob_ = nullptr;
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/include/singa/model/layer.h
----------------------------------------------------------------------
diff --git a/include/singa/model/layer.h b/include/singa/model/layer.h
index 82c8edc..ee2b42b 100644
--- a/include/singa/model/layer.h
+++ b/include/singa/model/layer.h
@@ -125,7 +125,7 @@ class Layer {
/// Move the layer (including its parameters and other internal Tensor) onto
/// the given device
- virtual void ToDevice(Device* device) {
+ virtual void ToDevice(std::shared_ptr<Device> device) {
//for (auto p : param_values_) p->ToDevice(device);
}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/core/device/cpp_cpu.cc
----------------------------------------------------------------------
diff --git a/src/core/device/cpp_cpu.cc b/src/core/device/cpp_cpu.cc
index 44f614a..6884e35 100644
--- a/src/core/device/cpp_cpu.cc
+++ b/src/core/device/cpp_cpu.cc
@@ -17,7 +17,7 @@
*/
#include "singa/core/device.h"
namespace singa {
-CppCPU defaultDevice(-1, 1);
+std::shared_ptr<Device> defaultDevice=std::make_shared<CppCPU>(-1, 1);
CppCPU::CppCPU(int id, int num_executors, string scheduler,
string vm) : Device(id, num_executors, scheduler, vm) {
lang_ = kCpp;
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/core/device/cuda_gpu.cc
----------------------------------------------------------------------
diff --git a/src/core/device/cuda_gpu.cc b/src/core/device/cuda_gpu.cc
index d9a0985..4da292f 100644
--- a/src/core/device/cuda_gpu.cc
+++ b/src/core/device/cuda_gpu.cc
@@ -43,7 +43,6 @@ CudaGPU::~CudaGPU() {
}
#endif
delete pool;
- LOG(INFO) << "device has been deleted";
}
CudaGPU::CudaGPU(int id, int num_executors,
@@ -143,14 +142,10 @@ void* CudaGPU::Malloc(int size) {
/// Free cpu memory.
void CudaGPU::Free(void* ptr) {
- LOG(INFO) << "Cuda free is called";
- LOG(INFO) << "pool pointer" << pool << "\n";
- LOG(INFO) << "pool status:" << ((CnMemPool*)pool)->status;
if (ptr != nullptr) {
//CUDA_CHECK(cudaFree(ptr));
pool->Free(ptr);
}
- LOG(INFO) << "free memory is successed";
}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/core/device/device.cc
----------------------------------------------------------------------
diff --git a/src/core/device/device.cc b/src/core/device/device.cc
index 1d3c446..1889339 100644
--- a/src/core/device/device.cc
+++ b/src/core/device/device.cc
@@ -22,7 +22,7 @@ namespace singa {
Device::Device(int id, int num_executors, string scheduler, string vm)
: id_(id), num_executors_(num_executors) {
// TODO(wangwei) create scheduler and vm.
- host_ = &defaultDevice;
+ host_ = defaultDevice;
}
void Device::Exec(function<void(Context*)>&& fn, const vector<Blob*> read_blobs,
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/core/memory/memory.cc
----------------------------------------------------------------------
diff --git a/src/core/memory/memory.cc b/src/core/memory/memory.cc
index c5878a6..304c101 100644
--- a/src/core/memory/memory.cc
+++ b/src/core/memory/memory.cc
@@ -60,7 +60,6 @@ CnMemPool::~CnMemPool() {
initialized = false;
}
mtx.unlock();
- LOG(INFO) << "cnmem has been freed";
}
@@ -70,10 +69,8 @@ void CnMemPool::Malloc(void** ptr, const size_t size) {
}
void CnMemPool::Free(void* ptr) {
- LOG(INFO) << "cnmem free is called !!!!!!!!!!!";
cnmemStatus_t status = cnmemFree(ptr,NULL);
CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS) << " " << cnmemGetErrorString(status);
- LOG(INFO) << "cnmem free is terminated";
}
void CudaMemPool::Malloc(void** ptr, const size_t size) {
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index 5ae375c..a5b43d8 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -25,29 +25,28 @@
namespace singa {
Tensor::~Tensor() {
- // LOG(ERROR) << "~";
if (blob_ != nullptr && blob_->DecRefCount() == 0)
device_->FreeBlob(blob_);
blob_ = nullptr;
}
-Tensor::Tensor() { device_ = &defaultDevice; }
+Tensor::Tensor() { device_ = defaultDevice; }
Tensor::Tensor(const Shape &shape, DataType dtype)
- : data_type_(dtype), device_(&defaultDevice), shape_(shape) {
- device_ = &defaultDevice;
+ : data_type_(dtype), device_(defaultDevice), shape_(shape) {
+ device_ = defaultDevice;
blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
}
Tensor::Tensor(Shape &&shape, DataType dtype)
- : data_type_(dtype), device_(&defaultDevice), shape_(shape) {
- device_ = &defaultDevice;
+ : data_type_(dtype), device_(defaultDevice), shape_(shape) {
+ device_ = defaultDevice;
blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
}
-Tensor::Tensor(const Shape &shape, Device *device, DataType dtype)
+Tensor::Tensor(const Shape &shape, std::shared_ptr<Device> device, DataType dtype)
: data_type_(dtype), device_(device), shape_(shape) {
blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
}
-Tensor::Tensor(Shape &&shape, Device *device, DataType dtype)
+Tensor::Tensor(Shape &&shape, std::shared_ptr<Device> device, DataType dtype)
: data_type_(dtype), device_(device), shape_(shape) {
blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
}
@@ -104,7 +103,7 @@ void Tensor::AsType(DataType type) {
}
}
-void Tensor::ToDevice(Device *dst) {
+void Tensor::ToDevice(std::shared_ptr<Device> dst) {
// TODO(wangwei) the comparison is very strict. May compare against device ID?
if (device_ != dst) {
Tensor tmp(shape_, dst, data_type_);
@@ -234,7 +233,7 @@ void CopyDataToFrom(Tensor *dst, const Tensor &src, size_t num,
CHECK_GE(src.MemSize(), src_offset + nBytes);
CHECK_GE(dst->MemSize(), dst_offset + nBytes);
- Device *src_dev = src.device(), *dst_dev = dst->device();
+ std::shared_ptr<Device> src_dev = src.device(), dst_dev = dst->device();
Blob *from = src.blob(), *to = dst->blob();
if (dst_dev->lang() != src_dev->lang()) {
// let the none cpp device conduct copy op
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/model/layer/batchnorm.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/batchnorm.cc b/src/model/layer/batchnorm.cc
index bcd0870..1e6c39b 100644
--- a/src/model/layer/batchnorm.cc
+++ b/src/model/layer/batchnorm.cc
@@ -44,7 +44,7 @@ void BatchNorm::Setup(const LayerConf& conf) {
param_values_.push_back(&runningVariance_);
}
-void BatchNorm::ToDevice(Device* device) {
+void BatchNorm::ToDevice(std::shared_ptr<Device> device) {
bnScale_.ToDevice(device);
bnBias_.ToDevice(device);
dbnScale_.ToDevice(device);
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/model/layer/batchnorm.h
----------------------------------------------------------------------
diff --git a/src/model/layer/batchnorm.h b/src/model/layer/batchnorm.h
index 0255179..83f143d 100644
--- a/src/model/layer/batchnorm.h
+++ b/src/model/layer/batchnorm.h
@@ -67,7 +67,7 @@ class BatchNorm : public Layer {
runningVariance_.ResetLike(x);
runningVariance_.CopyData(x);
}
- virtual void ToDevice(Device* device) override;
+ virtual void ToDevice(std::shared_ptr<Device> device) override;
protected:
float factor_;
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/model/layer/dense.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/dense.cc b/src/model/layer/dense.cc
index b349787..d47c1db 100644
--- a/src/model/layer/dense.cc
+++ b/src/model/layer/dense.cc
@@ -79,7 +79,7 @@ const std::pair<Tensor, vector<Tensor>> Dense::Backward(int flag,
return std::make_pair(dx, param_grad);
}
-void Dense::ToDevice(Device *device) {
+void Dense::ToDevice(std::shared_ptr<Device> device) {
weight_.ToDevice(device);
bias_.ToDevice(device);
}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/model/layer/dense.h
----------------------------------------------------------------------
diff --git a/src/model/layer/dense.h b/src/model/layer/dense.h
index a5a6f66..49cb986 100644
--- a/src/model/layer/dense.h
+++ b/src/model/layer/dense.h
@@ -40,7 +40,7 @@ class Dense : public Layer {
const std::pair<Tensor, vector<Tensor>> Backward(int flag,
const Tensor& grad) override;
- void ToDevice(Device* device) override;
+ void ToDevice(std::shared_ptr<Device> device) override;
size_t num_output() const { return hdim_; }
size_t num_input() const { return vdim_; }
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/model/layer/dropout.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/dropout.cc b/src/model/layer/dropout.cc
index c2c97be..695008e 100644
--- a/src/model/layer/dropout.cc
+++ b/src/model/layer/dropout.cc
@@ -52,7 +52,7 @@ const std::pair<Tensor, vector<Tensor>> Dropout::Backward(int flag,
return std::make_pair(input_grad, param_grad);
}
-void Dropout::ToDevice(Device* device) {
+void Dropout::ToDevice(std::shared_ptr<Device> device) {
Layer::ToDevice(device);
mask_.ToDevice(device);
}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/model/layer/dropout.h
----------------------------------------------------------------------
diff --git a/src/model/layer/dropout.h b/src/model/layer/dropout.h
index 5efaf6a..d5da79c 100644
--- a/src/model/layer/dropout.h
+++ b/src/model/layer/dropout.h
@@ -43,7 +43,7 @@ class Dropout : public Layer {
const std::pair<Tensor, vector<Tensor>> Backward(int flag,
const Tensor& grad) override;
- void ToDevice(Device* device) override;
+ void ToDevice(std::shared_ptr<Device> device) override;
float dropout_ratio() const {
return dropout_ratio_;
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/test/singa/test_dense.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_dense.cc b/test/singa/test_dense.cc
index 052d0e8..7ed4d33 100644
--- a/test/singa/test_dense.cc
+++ b/test/singa/test_dense.cc
@@ -66,7 +66,6 @@ TEST(Dense, ForwardCpp) {
dense.set_bias(bias);
singa::Tensor out1 = dense.Forward(singa::kTrain, in);
- singa::CppCPU host(0, 1);
const float *outptr1 = out1.data<const float *>();
EXPECT_EQ(9u, out1.Size());
for (int i = 0; i < 3; i++)
@@ -76,7 +75,6 @@ TEST(Dense, ForwardCpp) {
outptr1[i * 3 + j]);
}
#endif // USE_CBLAS
-#ifdef USE_CUDA
TEST(Dense, BackwardCpp) {
Dense dense;
@@ -89,7 +87,6 @@ TEST(Dense, BackwardCpp) {
const size_t batchsize = 3, vdim = 2, hdim = 3;
const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
- singa::CudaGPU cuda(0, 1);
singa::Tensor in(singa::Shape{batchsize, vdim});
in.CopyDataFromHostPtr(x, batchsize * vdim);
@@ -114,7 +111,6 @@ TEST(Dense, BackwardCpp) {
grad.CopyDataFromHostPtr(dy, batchsize * hdim);
const auto ret = dense.Backward(singa::kTrain, grad);
- singa::CppCPU host(0, 1);
singa::Tensor in_grad = ret.first;
singa::Tensor dweight = ret.second.at(0);
singa::Tensor dbias = ret.second.at(1);
@@ -139,7 +135,6 @@ TEST(Dense, BackwardCpp) {
for (int i = 0; i < 3; i++)
EXPECT_FLOAT_EQ((dy[0 * 3 + i] + dy[1 * 3 + i] + dy[2 * 3 + i]), dbiasx[i]);
}
-#endif
#ifdef USE_CUDA
TEST(Dense, ForwardCuda) {
@@ -154,25 +149,24 @@ TEST(Dense, ForwardCuda) {
const size_t batchsize = 3, vdim = 2, hdim = 3;
const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
- singa::CudaGPU cuda(0, 1);
- singa::Tensor in(singa::Shape{batchsize, vdim}, &cuda);
+ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
+ singa::Tensor in(singa::Shape{batchsize, vdim}, cuda);
in.CopyDataFromHostPtr(x, batchsize * vdim);
// set weight
const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
- singa::Tensor weight(singa::Shape{hdim, vdim}, &cuda);
+ singa::Tensor weight(singa::Shape{hdim, vdim}, cuda);
weight.CopyDataFromHostPtr(we, hdim * vdim);
const float bia[hdim] = {1.0f, 1.0f, 1.0f};
- singa::Tensor bias(singa::Shape{hdim}, &cuda);
+ singa::Tensor bias(singa::Shape{hdim}, cuda);
bias.CopyDataFromHostPtr(bia, hdim);
dense.set_weight(weight);
dense.set_bias(bias);
singa::Tensor out1 = dense.Forward(singa::kTrain, in);
- singa::CppCPU host(0, 1);
- out1.ToDevice(&host);
+ out1.ToHost();
const float *outptr1 = out1.data<const float *>();
EXPECT_EQ(9u, out1.Size());
for (int i = 0; i < 3; i++)
@@ -193,17 +187,17 @@ TEST(Dense, BackwardCuda) {
const size_t batchsize = 3, vdim = 2, hdim = 3;
const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
- singa::CudaGPU cuda(0, 1);
- singa::Tensor in(singa::Shape{batchsize, vdim}, &cuda);
+ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
+ singa::Tensor in(singa::Shape{batchsize, vdim}, cuda);
in.CopyDataFromHostPtr(x, batchsize * vdim);
// set weight
const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
- singa::Tensor weight(singa::Shape{hdim, vdim}, &cuda);
+ singa::Tensor weight(singa::Shape{hdim, vdim}, cuda);
weight.CopyDataFromHostPtr(we, hdim * vdim);
const float bia[hdim] = {1.0f, 1.0f, 1.0f};
- singa::Tensor bias(singa::Shape{hdim}, &cuda);
+ singa::Tensor bias(singa::Shape{hdim}, cuda);
bias.CopyDataFromHostPtr(bia, hdim);
dense.set_weight(weight);
@@ -214,15 +208,14 @@ TEST(Dense, BackwardCuda) {
// grad
const float dy[batchsize * hdim] = {1.0f, 1.0f, 1.0f, 2.0f, 2.0f,
2.0f, 3.0f, 3.0f, 3.0f};
- singa::Tensor grad(singa::Shape{batchsize, hdim}, &cuda);
+ singa::Tensor grad(singa::Shape{batchsize, hdim}, cuda);
grad.CopyDataFromHostPtr(dy, batchsize * hdim);
const auto ret = dense.Backward(singa::kTrain, grad);
- singa::CppCPU host(0, 1);
singa::Tensor in_grad = ret.first;
singa::Tensor dweight = ret.second.at(0);
singa::Tensor dbias = ret.second.at(1);
- in_grad.ToDevice(&host);
+ in_grad.ToHost();
const float *dx = in_grad.data<const float *>();
EXPECT_EQ(6u, in_grad.Size());
for (int i = 0; i < 3; i++)
@@ -231,7 +224,7 @@ TEST(Dense, BackwardCuda) {
(dy[i * 3 + 0] * we[0 * 2 + j] + dy[i * 3 + 1] * we[1 * 2 + j] +
dy[i * 3 + 2] * we[2 * 2 + j]),
dx[i * 2 + j]);
- dweight.ToDevice(&host);
+ dweight.ToHost();
const float *dweightx = dweight.data<const float *>();
EXPECT_EQ(6u, dweight.Size());
for (int i = 0; i < 3; i++)
@@ -240,7 +233,7 @@ TEST(Dense, BackwardCuda) {
(dy[0 * 3 + i] * x[0 * 2 + j] + dy[1 * 3 + i] * x[1 * 2 + j] +
dy[2 * 3 + i] * x[2 * 2 + j]),
dweightx[i * 2 + j]);
- dbias.ToDevice(&host);
+ dbias.ToHost();
const float *dbiasx = dbias.data<const float *>();
EXPECT_EQ(3u, dbias.Size());
for (int i = 0; i < 3; i++)
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/test/singa/test_memory.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_memory.cc b/test/singa/test_memory.cc
index f5e464d..90fc99a 100644
--- a/test/singa/test_memory.cc
+++ b/test/singa/test_memory.cc
@@ -75,7 +75,7 @@ TEST(MemPool, CompareCudaCnmem) {
singa::CnMemPool cnPool;
cnPool.InitPool();
- int numOfTests = 10000;
+ int numOfTests = 5000;
int allocSize = 1000000U;
struct timeval start,end;
double t1,t2;
@@ -93,7 +93,7 @@ TEST(MemPool, CompareCudaCnmem) {
t1 = start.tv_sec * 1000 + start.tv_usec/1000;
t2 = end.tv_sec * 1000 + end.tv_usec/1000;
- LOG(INFO) << "cnmem time: " << t2-t1 << " ms" << std::endl;
+ LOG(INFO) << "cnmem memory time: " << t2-t1 << " ms" << std::endl;
pool = &cudaPool;
gettimeofday(&start,NULL);
@@ -106,6 +106,6 @@ TEST(MemPool, CompareCudaCnmem) {
t1 = start.tv_sec * 1000 + start.tv_usec/1000;
t2 = end.tv_sec * 1000 + end.tv_usec/1000;
- LOG(INFO) << "cuda time: " << t2-t1 << " ms" << std::endl;
+ LOG(INFO) << "cuda memory time: " << t2-t1 << " ms" << std::endl;
}
#endif // USE_CUDA
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/test/singa/test_mse.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_mse.cc b/test/singa/test_mse.cc
index 7c6066e..d2c5125 100644
--- a/test/singa/test_mse.cc
+++ b/test/singa/test_mse.cc
@@ -69,9 +69,9 @@ TEST_F(TestMSE, CppBackward) {
#ifdef USE_CUDA
TEST_F(TestMSE, CudaForward) {
singa::MSE* mse = new singa::MSE();
- singa::CudaGPU dev;
- p.ToDevice(&dev);
- t.ToDevice(&dev);
+ auto dev = std::make_shared<singa::CudaGPU>();
+ p.ToDevice(dev);
+ t.ToDevice(dev);
Tensor loss = mse->Forward(p, t);
loss.ToHost();
@@ -85,18 +85,15 @@ TEST_F(TestMSE, CudaForward) {
}
EXPECT_FLOAT_EQ(ldat[i], 0.5 * l);
}
- LOG(INFO) << "Before delete pxxxxxxxxxxxxxxxxxxxxxxxx";
p.ToHost();
- LOG(INFO) << "Before delete tyyyyyyyyyyyyyyyyyyyyyyy";
t.ToHost();
- LOG(INFO) << "terminate-xxxxxxxxxxxxxxxxxx-";
- delete mse;
}
+
TEST_F(TestMSE, CudaBackward) {
singa::MSE mse;
- singa::CudaGPU dev;
- p.ToDevice(&dev);
- t.ToDevice(&dev);
+ auto dev = std::make_shared<singa::CudaGPU>();
+ p.ToDevice(dev);
+ t.ToDevice(dev);
mse.Forward(p, t);
Tensor grad = mse.Backward();
grad.ToHost();
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/test/singa/test_sgd.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_sgd.cc b/test/singa/test_sgd.cc
index 71ab15e..3b04ab6 100644
--- a/test/singa/test_sgd.cc
+++ b/test/singa/test_sgd.cc
@@ -88,8 +88,8 @@ TEST(SGD, ApplyWithoutMomentumCuda) {
const float v[4] = {0.1, 0.2, 0.3, 0.4};
const float g[4] = {0.1, 0.1, 0.1, 0.1};
- singa::CudaGPU dev;
- singa::Tensor value(singa::Shape{4}, &dev), grad(singa::Shape{4}, &dev);
+ auto dev = std::make_shared<singa::CudaGPU>();
+ singa::Tensor value(singa::Shape{4}, dev), grad(singa::Shape{4}, dev);
value.CopyDataFromHostPtr(v, 4);
grad.CopyDataFromHostPtr(g, 4);
@@ -124,8 +124,8 @@ TEST(SGD, ApplyWithMomentumCuda) {
const float v[4] = {0.1, 0.2, 0.3, 0.4};
const float g[4] = {0.01, 0.02, 0.03, 0.04};
- singa::CudaGPU dev;
- singa::Tensor value(singa::Shape{4}, &dev), grad(singa::Shape{4}, &dev);
+ auto dev = std::make_shared<singa::CudaGPU>();
+ singa::Tensor value(singa::Shape{4}, dev), grad(singa::Shape{4}, dev);
value.CopyDataFromHostPtr(v, 4);
grad.CopyDataFromHostPtr(g, 4);
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/test/singa/test_tensor.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor.cc b/test/singa/test_tensor.cc
index bd039ad..c351174 100644
--- a/test/singa/test_tensor.cc
+++ b/test/singa/test_tensor.cc
@@ -59,10 +59,10 @@ TEST(TensorClass, AsType) {
TEST(TensorClass, ToDevice) {
Tensor t(Shape{2,3});
- EXPECT_EQ(static_cast<Device*>(&singa::defaultDevice), t.device());
- singa::CppCPU *dev = new singa::CppCPU(0, 1);
+ EXPECT_EQ(singa::defaultDevice, t.device());
+ auto dev = std::make_shared<singa::CppCPU>(0, 1);
t.ToDevice(dev);
- EXPECT_NE(static_cast<Device*>(&singa::defaultDevice), t.device());
+ EXPECT_NE(singa::defaultDevice, t.device());
}
TEST(TensorClass, CopyDataFromHostPtr) {
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor_math.cc b/test/singa/test_tensor_math.cc
index b18e465..0f998c0 100644
--- a/test/singa/test_tensor_math.cc
+++ b/test/singa/test_tensor_math.cc
@@ -255,10 +255,10 @@ TEST_F(TestTensorMath, SumColumnsCpp) {
#ifdef USE_CUDA
TEST_F(TestTensorMath, MultCuda) {
const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
- singa::CudaGPU dev;
- Tensor t(Shape{2, 2}, &dev);
+ auto dev = std::make_shared<singa::CudaGPU>();
+ Tensor t(Shape{2, 2}, dev);
t.CopyDataFromHostPtr(x, 4);
- d.ToDevice(&dev);
+ d.ToDevice(dev);
d.CopyDataFromHostPtr(dat1, 6);
Tensor C = Mult(d, t);
C.ToHost();
@@ -274,7 +274,7 @@ TEST_F(TestTensorMath, MultCuda) {
}
const float y[8] = {1.0f, 2.0f, 3.0f, 4.0f, 1.1f, 2.1f, 3.1f, 4.1f};
- Tensor s(Shape{4, 2}, &dev);
+ Tensor s(Shape{4, 2}, dev);
s.CopyDataFromHostPtr(y, 8);
Tensor D = Mult(d, s.T());
D.ToHost();
@@ -288,11 +288,11 @@ TEST_F(TestTensorMath, MultCuda) {
EXPECT_FLOAT_EQ(DPtr[i * 4 + j], tmp);
}
}
- Tensor p(Shape{4, 1}, &dev);
+ Tensor p(Shape{4, 1}, dev);
p.CopyDataFromHostPtr(x, 4);
- Tensor q(Shape{1, 4}, &dev);
+ Tensor q(Shape{1, 4}, dev);
q.SetValue(1.0f);
- Tensor o(Shape{4, 4}, &dev);
+ Tensor o(Shape{4, 4}, dev);
Mult(p, q, &o);
o.ToHost();
@@ -308,11 +308,11 @@ TEST_F(TestTensorMath, MultCuda) {
TEST_F(TestTensorMath, AddColumnCuda) {
const float x[3] = {1.0f, 2.0f, 3.0f};
- singa::CudaGPU dev;
- Tensor t(Shape{3}, &dev);
+ auto dev = std::make_shared<singa::CudaGPU>();
+ Tensor t(Shape{3}, dev);
t.CopyDataFromHostPtr(x, 3);
d.CopyDataFromHostPtr(dat1, 6);
- d.ToDevice(&dev);
+ d.ToDevice(dev);
AddColumn(t, &d);
d.ToHost();
const float *xptr = d.data<const float *>();
@@ -326,11 +326,11 @@ TEST_F(TestTensorMath, AddColumnCuda) {
TEST_F(TestTensorMath, SubColumnCuda) {
const float x[3] = {1.0f, 2.0f, 3.0f};
- singa::CudaGPU dev;
- Tensor t(Shape{3}, &dev);
+ auto dev = std::make_shared<singa::CudaGPU>();
+ Tensor t(Shape{3}, dev);
t.CopyDataFromHostPtr(x, 3);
d.CopyDataFromHostPtr(dat1, 6);
- d.ToDevice(&dev);
+ d.ToDevice(dev);
SubColumn(t, &d);
d.ToHost();
const float *xptr = d.data<const float *>();
@@ -357,11 +357,11 @@ TEST_F(TestTensorMath, MultColumnCpp) {
#ifdef USE_CUDA
TEST_F(TestTensorMath, MultColumnCuda) {
const float x[3] = {1.0f, 2.0f, 3.0f};
- singa::CudaGPU dev;
- Tensor t(Shape{3}, &dev);
+ auto dev = std::make_shared<singa::CudaGPU>();
+ Tensor t(Shape{3}, dev);
t.CopyDataFromHostPtr(x, 3);
d.CopyDataFromHostPtr(dat1, 6);
- d.ToDevice(&dev);
+ d.ToDevice(dev);
MultColumn(t, &d);
d.ToHost();
const float *xptr = d.data<const float *>();
@@ -373,11 +373,11 @@ TEST_F(TestTensorMath, MultColumnCuda) {
}
TEST_F(TestTensorMath, DivColumnCuda) {
const float x[3] = {1.0f, 2.0f, 3.0f};
- singa::CudaGPU dev;
- Tensor t(Shape{3}, &dev);
+ auto dev = std::make_shared<singa::CudaGPU>();
+ Tensor t(Shape{3}, dev);
t.CopyDataFromHostPtr(x, 3);
d.CopyDataFromHostPtr(dat1, 6);
- d.ToDevice(&dev);
+ d.ToDevice(dev);
DivColumn(t, &d);
d.ToHost();
const float *xptr = d.data<const float *>();
@@ -389,11 +389,11 @@ TEST_F(TestTensorMath, DivColumnCuda) {
}
TEST_F(TestTensorMath, AddRowCuda) {
const float x[2] = {1.1f, 2.1f};
- singa::CudaGPU dev;
- Tensor t(Shape{2}, &dev);
+ auto dev = std::make_shared<singa::CudaGPU>();
+ Tensor t(Shape{2}, dev);
t.CopyDataFromHostPtr(x, 2);
d.CopyDataFromHostPtr(dat1, 6);
- d.ToDevice(&dev);
+ d.ToDevice(dev);
AddRow(t, &d);
d.ToHost();
const float *xptr = d.data<const float *>();
@@ -405,11 +405,11 @@ TEST_F(TestTensorMath, AddRowCuda) {
}
TEST_F(TestTensorMath, SubRowCuda) {
const float x[2] = {1.1f, 2.1f};
- singa::CudaGPU dev;
- Tensor t(Shape{2}, &dev);
+ auto dev = std::make_shared<singa::CudaGPU>();
+ Tensor t(Shape{2}, dev);
t.CopyDataFromHostPtr(x, 2);
d.CopyDataFromHostPtr(dat1, 6);
- d.ToDevice(&dev);
+ d.ToDevice(dev);
SubRow(t, &d);
d.ToHost();
const float *xptr = d.data<const float *>();
@@ -421,11 +421,11 @@ TEST_F(TestTensorMath, SubRowCuda) {
}
TEST_F(TestTensorMath, MultRowCuda) {
const float x[2] = {1.1f, 2.1f};
- singa::CudaGPU dev;
- Tensor t(Shape{2}, &dev);
+ auto dev = std::make_shared<singa::CudaGPU>();
+ Tensor t(Shape{2}, dev);
t.CopyDataFromHostPtr(x, 2);
d.CopyDataFromHostPtr(dat1, 6);
- d.ToDevice(&dev);
+ d.ToDevice(dev);
MultRow(t, &d);
d.ToHost();
const float *xptr = d.data<const float *>();
@@ -452,11 +452,11 @@ TEST_F(TestTensorMath, DivRowCpp) {
#ifdef USE_CUDA
TEST_F(TestTensorMath, DivRowCuda) {
const float x[2] = {1.1f, 2.1f};
- singa::CudaGPU dev;
- Tensor t(Shape{2}, &dev);
+ auto dev = std::make_shared<singa::CudaGPU>();
+ Tensor t(Shape{2}, dev);
t.CopyDataFromHostPtr(x, 2);
d.CopyDataFromHostPtr(dat1, 6);
- d.ToDevice(&dev);
+ d.ToDevice(dev);
DivRow(t, &d);
d.ToHost();
const float *xptr = d.data<const float *>();
@@ -467,10 +467,10 @@ TEST_F(TestTensorMath, DivRowCuda) {
}
}
TEST_F(TestTensorMath, SumRowsCuda) {
- singa::CudaGPU dev;
- Tensor t(Shape{2}, &dev);
+ auto dev = std::make_shared<singa::CudaGPU>();
+ Tensor t(Shape{2}, dev);
d.CopyDataFromHostPtr(dat1, 6);
- d.ToDevice(&dev);
+ d.ToDevice(dev);
SumRows(d, &t);
t.ToHost();
const float *tptr = t.data<const float *>();
@@ -484,10 +484,10 @@ TEST_F(TestTensorMath, SumRowsCuda) {
d.ToHost();
}
TEST_F(TestTensorMath, SumColumnCuda) {
- singa::CudaGPU dev;
- Tensor t(Shape{3}, &dev);
+ auto dev = std::make_shared<singa::CudaGPU>();
+ Tensor t(Shape{3}, dev);
d.CopyDataFromHostPtr(dat1, 6);
- d.ToDevice(&dev);
+ d.ToDevice(dev);
SumColumns(d, &t);
t.ToHost();
const float *tptr = t.data<const float *>();
[2/6] incubator-singa git commit: SINGA-197 Add CNMem as a submodule
in lib/
Posted by wa...@apache.org.
SINGA-197 Add CNMem as a submodule in lib/
Compile the CNMem library before building SINGA. A "libcnmem.so" file is generated in BUILD_PATH/lib.
The build is now compatible with older versions of CMake.
The CUDA sources are not compiled when USE_CUDA is OFF.
Move singa_config.h to BUILD_PATH/include/singa.
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/ce3e6dc1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/ce3e6dc1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/ce3e6dc1
Branch: refs/heads/dev
Commit: ce3e6dc102964ec6ed782cbdd6cb6ca30b41a4db
Parents: 683b3a7
Author: xiezl <xi...@comp.nus.edu.sg>
Authored: Tue Jun 14 15:45:45 2016 +0800
Committer: xiezl <xi...@comp.nus.edu.sg>
Committed: Tue Jun 14 15:45:45 2016 +0800
----------------------------------------------------------------------
CMakeLists.txt | 6 ++++--
cmake/Cuda.cmake | 1 -
cmake/Dependencies.cmake | 4 ++--
cmake/Protobuf.cmake | 28 ++++++++++++++++++++++++++++
include/singa/core/common.h | 2 +-
include/singa/core/device.h | 2 +-
include/singa/utils/cuda_utils.h | 2 +-
src/CMakeLists.txt | 19 +++++++++++++------
src/core/device/cuda_gpu.cc | 2 +-
src/core/tensor/math_kernel.cu | 2 +-
src/core/tensor/math_kernel.h | 2 +-
src/core/tensor/tensor_math_cuda.h | 2 +-
src/model/layer/cudnn_activation.cc | 2 +-
src/model/layer/cudnn_activation.h | 2 +-
src/model/layer/cudnn_batchnorm.h | 2 +-
src/model/layer/cudnn_convolution.h | 2 +-
src/model/layer/cudnn_dropout.h | 2 +-
src/model/layer/cudnn_lrn.h | 2 +-
src/model/layer/cudnn_pooling.h | 2 +-
src/model/layer/cudnn_softmax.cc | 2 +-
src/model/layer/cudnn_utils.h | 2 +-
test/singa/test_cudnn_activation.cc | 2 +-
test/singa/test_cudnn_softmax.cc | 2 +-
test/singa/test_dense.cc | 2 +-
test/singa/test_mse.cc | 2 +-
test/singa/test_sgd.cc | 2 +-
26 files changed, 68 insertions(+), 32 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index fbe3adc..f6240d2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,7 +12,8 @@ IF(UNIX OR APPLE)
ENDIF()
#message(STATUS "${CMAKE_CXX_FLAGS}")
-SET(SINGA_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/include;${PROJECT_BINARY_DIR}")
+SET(SINGA_INCLUDE_DIR
+ "${CMAKE_SOURCE_DIR}/include;${CMAKE_SOURCE_DIR}/lib/cnmem/lib;${PROJECT_BINARY_DIR}")
#message(STATUS "include path: ${SINGA_INCLUDE_DIR}")
INCLUDE_DIRECTORIES(${SINGA_INCLUDE_DIR})
@@ -28,7 +29,7 @@ ADD_DEFINITIONS(-DUSE_CMAKE)
CONFIGURE_FILE (
"${PROJECT_SOURCE_DIR}/cmake/Templates/singa_config.h.in"
- "${PROJECT_BINARY_DIR}/singa_config.h")
+ "${PROJECT_BINARY_DIR}/include/singa/singa_config.h")
#set(SINGA_CONFIGURE_SRC "${PROJECT_BINARY_DIR}/singa_config.h")
#LIST(APPEND SRCS ${SINGA_CONFIGURE_SRCS} ${PROJECT_BINARY_DIR}/singa_config.h)
@@ -36,5 +37,6 @@ CONFIGURE_FILE (
SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
+ADD_SUBDIRECTORY(lib/cnmem)
ADD_SUBDIRECTORY(src)
ADD_SUBDIRECTORY(test)
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/cmake/Cuda.cmake
----------------------------------------------------------------------
diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake
index a74c82b..e7af7c9 100644
--- a/cmake/Cuda.cmake
+++ b/cmake/Cuda.cmake
@@ -10,7 +10,6 @@ MESSAGE(STATUS "Found cuda_v${CUDA_VERSION}")
#ADD_DEFINITIONS(-DUSE_CUDA)
#message(STATUS "linking: ${CUDA_CUDART_LIBRARY} ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}")
-
IF(USE_CUDNN)
#include(cmake/Modules/Cudnn.cmake)
FIND_PACKAGE(CUDNN REQUIRED)
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/cmake/Dependencies.cmake
----------------------------------------------------------------------
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index d3f0b00..fe178d6 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -6,6 +6,7 @@ FIND_PACKAGE( Protobuf REQUIRED )
INCLUDE_DIRECTORIES(SYSTEM ${PROTOBUF_INCLUDE_DIR})
MESSAGE(STATUS "proto libs " ${PROTOBUF_LIBRARIES})
LIST(APPEND singa_linker_libs ${PROTOBUF_LIBRARIES})
+INCLUDE("cmake/Protobuf.cmake")
IF(USE_LMDB)
FIND_PACKAGE(LMDB REQUIRED)
@@ -14,10 +15,9 @@ IF(USE_LMDB)
MESSAGE(STATUS "FOUND lmdb at ${LMDB_INCLUDE_DIR}")
ENDIF()
-IF(NOT CPU_ONLY)
+IF(USE_CUDA)
INCLUDE("cmake/Cuda.cmake")
ELSE()
- SET(USE_CUDA FALSE)
SET(USE_CUDNN FALSE)
ENDIF()
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/cmake/Protobuf.cmake
----------------------------------------------------------------------
diff --git a/cmake/Protobuf.cmake b/cmake/Protobuf.cmake
new file mode 100644
index 0000000..c72b2c0
--- /dev/null
+++ b/cmake/Protobuf.cmake
@@ -0,0 +1,28 @@
+# copy from cmake source code
+function(PROTOBUF_GENERATE_PYTHON OUTPUT)
+ if(NOT ARGN)
+ message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called
+ without any proto files")
+ return()
+ endif(NOT ARGN)
+
+ set(${OUTPUT})
+ foreach(FIL ${ARGN})
+ get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
+ get_filename_component(FIL_WE ${FIL} NAME_WE)
+ get_filename_component(PATH ${FIL} PATH)
+
+ list(APPEND ${OUTPUT} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py")
+
+ add_custom_command(
+ OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py"
+ COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
+ ARGS --python_out ${CMAKE_CURRENT_BINARY_DIR}
+ --proto_path ${PATH} ${ABS_FIL}
+ DEPENDS ${ABS_FIL}
+ COMMENT "Running Python protocol buffer compiler on ${FIL}" VERBATIM)
+ endforeach()
+
+ set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE)
+ set(${OUTPUT} ${${OUTPUT}} PARENT_SCOPE)
+endfunction()
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/include/singa/core/common.h
----------------------------------------------------------------------
diff --git a/include/singa/core/common.h b/include/singa/core/common.h
index e6f4c90..e19022e 100644
--- a/include/singa/core/common.h
+++ b/include/singa/core/common.h
@@ -20,7 +20,7 @@
#define SINGA_CORE_COMMON_H_
#include <random>
#include <chrono>
-#include "./singa_config.h"
+#include "./singa/singa_config.h"
#include "singa/utils/logging.h"
#ifdef USE_CUDA
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/include/singa/core/device.h
----------------------------------------------------------------------
diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index 56eda70..8c95dc7 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -23,7 +23,7 @@
#include <vector>
#include <string>
#include <functional>
-#include "singa_config.h"
+#include "singa/singa_config.h"
#include "singa/core/common.h"
#include "singa/core/memory.h"
#include "singa/core/scheduler.h"
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/include/singa/utils/cuda_utils.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/cuda_utils.h b/include/singa/utils/cuda_utils.h
index 17eb683..24f3eb9 100644
--- a/include/singa/utils/cuda_utils.h
+++ b/include/singa/utils/cuda_utils.h
@@ -2,7 +2,7 @@
#ifndef SINGA_UTILS_CUDA_UTILS_H_
#define SINGA_UTILS_CUDA_UTILS_H_
-#include "singa_config.h"
+#include "singa/singa_config.h"
#ifdef USE_CUDA
#include <cublas_v2.h>
#include <cuda.h>
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/src/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 952f7ee..4949236 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -31,15 +31,22 @@ AUX_SOURCE_DIRECTORY(core/device core_source)
AUX_SOURCE_DIRECTORY(core/memory core_source)
AUX_SOURCE_DIRECTORY(core/scheduler core_source)
AUX_SOURCE_DIRECTORY(core/tensor core_source)
-FILE(GLOB_RECURSE cuda_source core "*.cu")
-SET(FLAGS_BACKUP ${CMAKE_CXX_FLAGS})
-SET(CMAKE_CXX_FLAGS "")
-CUDA_COMPILE(cuda_objs SHARED ${cuda_source} OPTIONS "-Xcompiler -fPIC")
+IF (USE_CUDA)
+ FILE(GLOB_RECURSE cuda_source core "*.cu")
+ SET(FLAGS_BACKUP ${CMAKE_CXX_FLAGS})
+ SET(CMAKE_CXX_FLAGS "")
+ IF (CMAKE_BUILD_TYPE MATCHES DEBUG)
+ CUDA_COMPILE(cuda_objs SHARED ${cuda_source}
+ OPTIONS "-Xcompiler -fPIC -G -g")
+ ELSE (CMAKE_BUILD_TYPE MATCHES DEBUG)
+ CUDA_COMPILE(cuda_objs SHARED ${cuda_source} OPTIONS "-Xcompiler -fPIC")
+ ENDIF (CMAKE_BUILD_TYPE MATCHES DEBUG)
+ include_directories("${CMAKE_CURRENT_SOURCE_DIR}/core/tensor")
+ SET(CMAKE_CXX_FLAGS ${FLAGS_BACKUP})
+ENDIF (USE_CUDA)
#message(STATUS "FLAGS ${CMAKE_CXX_FLAGS}")
#message(STATUS "CORE ${cuda_source}")
#message(STATUS "OBJ ${cuda_objs}")
-include_directories("${CMAKE_CURRENT_SOURCE_DIR}/core/tensor")
-SET(CMAKE_CXX_FLAGS ${FLAGS_BACKUP})
ADD_LIBRARY(singa_core SHARED ${core_source} ${cuda_objs})
TARGET_LINK_LIBRARIES(singa_core ${SINGA_LINKER_LIBS})
LIST(APPEND SINGA_LINKER_LIBS singa_core)
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/src/core/device/cuda_gpu.cc
----------------------------------------------------------------------
diff --git a/src/core/device/cuda_gpu.cc b/src/core/device/cuda_gpu.cc
index 5d4e1ed..a47f6fe 100644
--- a/src/core/device/cuda_gpu.cc
+++ b/src/core/device/cuda_gpu.cc
@@ -15,7 +15,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-#include "singa_config.h"
+#include "singa/singa_config.h"
#ifdef USE_CUDA
#include <cublas_v2.h>
#include <cuda.h>
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/src/core/tensor/math_kernel.cu
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.cu b/src/core/tensor/math_kernel.cu
index aed6add..cc84e5c 100644
--- a/src/core/tensor/math_kernel.cu
+++ b/src/core/tensor/math_kernel.cu
@@ -19,7 +19,7 @@
*
*************************************************************/
-#include "singa_config.h"
+#include "singa/singa_config.h"
#ifdef USE_CUDA
#include <cmath>
#include <algorithm>
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/src/core/tensor/math_kernel.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.h b/src/core/tensor/math_kernel.h
index 5c906a9..4f13a5b 100644
--- a/src/core/tensor/math_kernel.h
+++ b/src/core/tensor/math_kernel.h
@@ -22,7 +22,7 @@
#define SRC_CORE_TENSOR__MATH_KERNEL_H_
-#include "singa_config.h"
+#include "singa/singa_config.h"
#ifdef USE_CUDA
/// TODO(wangwei) Clean the function APIs as commented in tensor_math.h
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index 4a2ba66..6693644 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -18,7 +18,7 @@
#ifndef SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_
#define SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_
-#include "singa_config.h"
+#include "singa/singa_config.h"
#ifdef USE_CUDA
#include "./tensor_math.h"
#include "./math_kernel.h"
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/src/model/layer/cudnn_activation.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_activation.cc b/src/model/layer/cudnn_activation.cc
index 8ecbbc7..b924494 100644
--- a/src/model/layer/cudnn_activation.cc
+++ b/src/model/layer/cudnn_activation.cc
@@ -15,7 +15,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-#include "singa_config.h"
+#include "singa/singa_config.h"
#ifdef USE_CUDNN
#include "./cudnn_activation.h"
#include <cudnn.h>
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/src/model/layer/cudnn_activation.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_activation.h b/src/model/layer/cudnn_activation.h
index b572db7..1483e48 100644
--- a/src/model/layer/cudnn_activation.h
+++ b/src/model/layer/cudnn_activation.h
@@ -18,7 +18,7 @@
#ifndef SINGA_MODEL_LAYER_CUDNN_ACTIVATION_H_
#define SINGA_MODEL_LAYER_CUDNN_ACTIVATION_H_
-#include "singa_config.h"
+#include "singa/singa_config.h"
#ifdef USE_CUDNN
#include <cudnn.h>
#include <utility>
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/src/model/layer/cudnn_batchnorm.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_batchnorm.h b/src/model/layer/cudnn_batchnorm.h
index 83258d2..8598b65 100644
--- a/src/model/layer/cudnn_batchnorm.h
+++ b/src/model/layer/cudnn_batchnorm.h
@@ -20,7 +20,7 @@
************************************************************/
#ifndef SINGA_MODEL_LAYER_CUDNN_BATCHNORM_H
#define SINGA_MODEL_LAYER_CUDNN_BATCHNORM_H
-#include "singa_config.h"
+#include "singa/singa_config.h"
#ifdef USE_CUDNN
#include "batchnorm.h"
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/src/model/layer/cudnn_convolution.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_convolution.h b/src/model/layer/cudnn_convolution.h
index b86c576..152d797 100644
--- a/src/model/layer/cudnn_convolution.h
+++ b/src/model/layer/cudnn_convolution.h
@@ -18,7 +18,7 @@
#ifndef SRC_MODEL_LAYER_CUDNN_CONVOLUTION_H_
#define SRC_MODEL_LAYER_CUDNN_CONVOLUTION_H_
-#include "singa_config.h"
+#include "singa/singa_config.h"
#ifdef USE_CUDNN
#include <string>
#include <utility>
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/src/model/layer/cudnn_dropout.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_dropout.h b/src/model/layer/cudnn_dropout.h
index 7cb185b..da3d1d2 100644
--- a/src/model/layer/cudnn_dropout.h
+++ b/src/model/layer/cudnn_dropout.h
@@ -18,7 +18,7 @@
#ifndef SRC_MODEL_LAYER_CUDNN_DROPOUT_H_
#define SRC_MODEL_LAYER_CUDNN_DROPOUT_H_
-#include "singa_config.h"
+#include "singa/singa_config.h"
#ifdef USE_CUDNN
// cudnn dropout is added in cudnn 5
#if CUDNN_VERSION_MAJOR >= 5
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/src/model/layer/cudnn_lrn.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_lrn.h b/src/model/layer/cudnn_lrn.h
index 0f650fe..cd3bcf1 100644
--- a/src/model/layer/cudnn_lrn.h
+++ b/src/model/layer/cudnn_lrn.h
@@ -20,7 +20,7 @@
************************************************************/
#ifndef SINGA_MODEL_LAYER_CUDNN_LRN_H_
#define SINGA_MODEL_LAYER_CUDNN_LRN_H_
-#include "singa_config.h"
+#include "singa/singa_config.h"
#ifdef USE_CUDNN
#include "lrn.h"
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/src/model/layer/cudnn_pooling.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_pooling.h b/src/model/layer/cudnn_pooling.h
index 1a38cd5..a5936ad 100644
--- a/src/model/layer/cudnn_pooling.h
+++ b/src/model/layer/cudnn_pooling.h
@@ -18,7 +18,7 @@
#ifndef SRC_MODEL_LAYER_CUDNN_POOLING_H_
#define SRC_MODEL_LAYER_CUDNN_POOLING_H_
-#include "singa_config.h"
+#include "singa/singa_config.h"
#ifdef USE_CUDNN
#include <cudnn.h>
#include <string>
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/src/model/layer/cudnn_softmax.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_softmax.cc b/src/model/layer/cudnn_softmax.cc
index 85b0c3d..d557062 100644
--- a/src/model/layer/cudnn_softmax.cc
+++ b/src/model/layer/cudnn_softmax.cc
@@ -15,7 +15,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-#include "singa_config.h"
+#include "singa/singa_config.h"
#include "./cudnn_softmax.h"
#ifdef USE_CUDNN
#include <cudnn.h>
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/src/model/layer/cudnn_utils.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_utils.h b/src/model/layer/cudnn_utils.h
index 039a1ac..19c72ec 100644
--- a/src/model/layer/cudnn_utils.h
+++ b/src/model/layer/cudnn_utils.h
@@ -18,7 +18,7 @@
#ifndef SRC_MODEL_LAYER_CUDNN_UTILS_H_
#define SRC_MODEL_LAYER_CUDNN_UTILS_H_
-#include "singa_config.h"
+#include "singa/singa_config.h"
#ifdef USE_CUDNN
#include <cudnn.h>
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/test/singa/test_cudnn_activation.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cudnn_activation.cc b/test/singa/test_cudnn_activation.cc
index 892b80b..bed7715 100644
--- a/test/singa/test_cudnn_activation.cc
+++ b/test/singa/test_cudnn_activation.cc
@@ -18,7 +18,7 @@
* under the License.
*
*************************************************************/
-#include "singa_config.h"
+#include "singa/singa_config.h"
#ifdef USE_CUDNN
#include "singa/proto/core.pb.h"
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/test/singa/test_cudnn_softmax.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cudnn_softmax.cc b/test/singa/test_cudnn_softmax.cc
index 05783e2..e11be87 100644
--- a/test/singa/test_cudnn_softmax.cc
+++ b/test/singa/test_cudnn_softmax.cc
@@ -18,7 +18,7 @@
* under the License.
*
*************************************************************/
-#include "singa_config.h"
+#include "singa/singa_config.h"
#ifdef USE_CUDNN
#include "../src/model/layer/cudnn_softmax.h"
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/test/singa/test_dense.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_dense.cc b/test/singa/test_dense.cc
index 5050d7e..052d0e8 100644
--- a/test/singa/test_dense.cc
+++ b/test/singa/test_dense.cc
@@ -20,7 +20,7 @@
*************************************************************/
#include "../src/model/layer/dense.h"
#include "gtest/gtest.h"
-#include "singa_config.h"
+#include "singa/singa_config.h"
using singa::Dense;
TEST(Dense, Setup) {
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/test/singa/test_mse.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_mse.cc b/test/singa/test_mse.cc
index ccaab7a..2c02273 100644
--- a/test/singa/test_mse.cc
+++ b/test/singa/test_mse.cc
@@ -23,7 +23,7 @@
#include "singa/core/tensor.h"
#include "singa/core/device.h"
#include "../src/model/loss/mse.h"
-#include "singa_config.h"
+#include "singa/singa_config.h"
using singa::Tensor;
class TestMSE : public ::testing::Test {
protected:
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/test/singa/test_sgd.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_sgd.cc b/test/singa/test_sgd.cc
index c0b6e2b..71ab15e 100644
--- a/test/singa/test_sgd.cc
+++ b/test/singa/test_sgd.cc
@@ -21,7 +21,7 @@
#include "gtest/gtest.h"
#include "singa/model/optimizer.h"
-#include "singa_config.h"
+#include "singa/singa_config.h"
TEST(SGD, ApplyWithoutMomentum) {
singa::SGD sgd;
[5/6] incubator-singa git commit: Merge PR #165 for CnMeM
Posted by wa...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_cudnn_softmax.cc
----------------------------------------------------------------------
diff --cc test/singa/test_cudnn_softmax.cc
index e11be87,53ecb2b..d715b33
--- a/test/singa/test_cudnn_softmax.cc
+++ b/test/singa/test_cudnn_softmax.cc
@@@ -33,75 -35,133 +35,129 @@@ TEST(CudnnSoftmax, Setup)
singa::LayerConf conf;
singa::SoftmaxConf* softmaxconf = conf.mutable_softmax_conf();
- softmaxconf->set_axis(2);
-
- sft.Setup(conf);
- sft.InitCudnn(1, singa::kFloat32);
- EXPECT_EQ(2, sft.Axis());
+ softmaxconf->set_algorithm("fast");
+ sft.Setup(Shape{1}, conf);
+ EXPECT_EQ(CUDNN_SOFTMAX_FAST, sft.Algorithm());
}
- TEST(CudnnSoftmax, Forward) {
- const float x[] = {1.0f, 2.0f, 0.0f, -2.0f, -3.0f, -1.0};
+ TEST(CudnnSoftmax, Forward1D) {
+ const float x[] = {1.f, 2.f, 0.f, -2.f, -3.f, -1.f};
size_t n = sizeof(x) / sizeof(float);
-- singa::CudaGPU cuda(0, 1);
- singa::Tensor in(singa::Shape{n}, &cuda);
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
+ singa::Shape shape = {n};
- singa::Tensor in(shape, &cuda);
++ singa::Tensor in(shape, cuda);
in.CopyDataFromHostPtr<float>(x, n);
- int axis = 1;
CudnnSoftmax sft;
singa::LayerConf conf;
singa::SoftmaxConf* softmaxconf = conf.mutable_softmax_conf();
- softmaxconf->set_axis(axis);
- sft.Setup(conf);
- sft.InitCudnn(n, singa::kFloat32);
-
+ softmaxconf->set_algorithm("accurate");
+ sft.Setup(Shape{1}, conf);
singa::Tensor out = sft.Forward(singa::kTrain, in);
-- singa::CppCPU host(0, 1);
-- out.ToDevice(&host);
- const float* yptr = out.data<const float*>();
++ out.ToHost();
+ const float* yptr = out.data<float>();
EXPECT_EQ(n, out.Size());
float* y = new float[n];
float sigma = 0.f;
for (size_t i = 0; i < n; i++) sigma += exp(x[i]);
for (size_t i = 0; i < n; i++) y[i] = exp(x[i]) / sigma;
- EXPECT_FLOAT_EQ(y[0], yptr[0]);
- EXPECT_FLOAT_EQ(y[4], yptr[4]);
- EXPECT_FLOAT_EQ(y[5], yptr[5]);
+ for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(y[i], yptr[i]);
}
- TEST(CudnnSoftmax, Backward) {
- const float x[] = {1.0f, 2.0f, 3.0f, -2.0f, -3.0f, -1.0};
+ TEST(CudnnSoftmax, Backward1D) {
+ const float x[] = {1.f, 2.f, 3.f, -2.f, -3.f, -1.f};
size_t n = sizeof(x) / sizeof(float);
-- singa::CudaGPU cuda(0, 1);
- singa::Tensor in(singa::Shape{n}, &cuda);
+ singa::Shape shape = {n};
- singa::Tensor in(shape, &cuda);
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor in(shape, cuda);
in.CopyDataFromHostPtr<float>(x, n);
- int axis = 1;
CudnnSoftmax sft;
singa::LayerConf conf;
singa::SoftmaxConf* softmaxconf = conf.mutable_softmax_conf();
- softmaxconf->set_axis(axis);
- sft.Setup(conf);
+ softmaxconf->set_algorithm("accurate");
+ sft.Setup(Shape{1}, conf);
+
singa::Tensor out = sft.Forward(singa::kTrain, in);
-- singa::CppCPU host(0, 1);
-- out.ToDevice(&host);
- const float* yptr = out.data<const float*>();
++ out.ToHost();
+ const float* yptr = out.data<float>();
- const float grad[] = {2.0f, -3.0f, 1.0f, 3.0f, -1.0f, -2.0};
- singa::Tensor out_diff(singa::Shape{n}, &cuda);
+ const float grad[] = {2.f, -3.f, 1.f, 3.f, -1.f, -2.f};
- singa::Tensor out_diff(shape, &cuda);
++ singa::Tensor out_diff(shape, cuda);
out_diff.CopyDataFromHostPtr<float>(grad, n);
const auto ret = sft.Backward(singa::kTrain, out_diff);
singa::Tensor in_diff = ret.first;
-- in_diff.ToDevice(&host);
- const float* xptr = in_diff.data<const float*>();
++ in_diff.ToHost();
+ const float* xptr = in_diff.data<float>();
float* dx = new float[n];
float sigma = 0.f;
for (size_t i = 0; i < n; i++) sigma += grad[i] * yptr[i];
for (size_t i = 0; i < n; i++) dx[i] = (grad[i] - sigma) * yptr[i];
- EXPECT_FLOAT_EQ(dx[0], xptr[0]);
- EXPECT_FLOAT_EQ(dx[4], xptr[4]);
- EXPECT_FLOAT_EQ(dx[5], xptr[5]);
+ for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(dx[i], xptr[i]);
+ }
+
+ TEST(CudnnSoftmax, Forward2D) {
+ const float x[] = {1.f, 2.f, 0.f, -2.f, -3.f, -1.f};
+ size_t n = sizeof(x) / sizeof(float);
+ size_t batch = 2, c = 3;
- singa::CudaGPU cuda(0, 1);
+ singa::Shape shape = {batch, c};
- singa::Tensor in(shape, &cuda);
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor in(shape, cuda);
+ in.CopyDataFromHostPtr<float>(x, n);
+
+ CudnnSoftmax sft;
+ singa::LayerConf conf;
+ singa::SoftmaxConf* softmaxconf = conf.mutable_softmax_conf();
+ softmaxconf->set_algorithm("accurate");
+ sft.Setup(Shape{c}, conf);
+
+ singa::Tensor out = sft.Forward(singa::kTrain, in);
- singa::CppCPU host(0, 1);
- out.ToDevice(&host);
++ out.ToHost();
+ const float* yptr = out.data<float>();
+ EXPECT_EQ(n, out.Size());
+
+ float* y = new float[n];
+ float* sigma = new float[batch];
+ for (size_t i = 0; i < batch; i++) sigma[i] = 0.f;
+ for (size_t i = 0; i < n; i++) sigma[i / c] += exp(x[i]);
+ for (size_t i = 0; i < n; i++) y[i] = exp(x[i]) / sigma[i / c];
+ for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(y[i], yptr[i]);
+ }
+
+ TEST(CudnnSoftmax, Backward2D) {
+ const float x[] = {1.f, 2.f, 3.f, -2.f, -3.f, -1.f};
+ size_t n = sizeof(x) / sizeof(float);
+ size_t batch = 2, c = 3;
- singa::CudaGPU cuda(0, 1);
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
+ singa::Shape shape = {batch, c};
- singa::Tensor in(shape, &cuda);
++ singa::Tensor in(shape, cuda);
+ in.CopyDataFromHostPtr<float>(x, n);
+
+ CudnnSoftmax sft;
+ singa::LayerConf conf;
+ singa::SoftmaxConf* softmaxconf = conf.mutable_softmax_conf();
+ softmaxconf->set_algorithm("accurate");
+ sft.Setup(Shape{c}, conf);
+
+ singa::Tensor out = sft.Forward(singa::kTrain, in);
- singa::CppCPU host(0, 1);
- out.ToDevice(&host);
++ out.ToHost();
+ const float* yptr = out.data<float>();
+
+ const float grad[] = {2.f, -3.f, 1.f, 3.f, -1.f, -2.f};
- singa::Tensor out_diff(shape, &cuda);
++ singa::Tensor out_diff(shape, cuda);
+ out_diff.CopyDataFromHostPtr<float>(grad, n);
+ const auto ret = sft.Backward(singa::kTrain, out_diff);
+ singa::Tensor in_diff = ret.first;
- in_diff.ToDevice(&host);
++ in_diff.ToHost();
+ const float* xptr = in_diff.data<float>();
+
+ float* dx = new float[n];
+ float* sigma = new float[batch];
+ for (size_t i = 0; i < batch; i++) sigma[i] = 0.f;
+ for (size_t i = 0; i < n; i++) sigma[i / c] += grad[i] * yptr[i];
+ for (size_t i = 0; i < n; i++) dx[i] = (grad[i] - sigma[i / c]) * yptr[i];
+ for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(dx[i], xptr[i]);
}
#endif // USE_CUDNN
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_dense.cc
----------------------------------------------------------------------
diff --cc test/singa/test_dense.cc
index 7ed4d33,a5fd960..363fb6e
--- a/test/singa/test_dense.cc
+++ b/test/singa/test_dense.cc
@@@ -1,242 -1,245 +1,238 @@@
--/************************************************************
--*
--* Licensed to the Apache Software Foundation (ASF) under one
--* or more contributor license agreements. See the NOTICE file
--* distributed with this work for additional information
--* regarding copyright ownership. The ASF licenses this file
--* to you under the Apache License, Version 2.0 (the
--* "License"); you may not use this file except in compliance
--* with the License. You may obtain a copy of the License at
--*
--* http://www.apache.org/licenses/LICENSE-2.0
--*
--* Unless required by applicable law or agreed to in writing,
--* software distributed under the License is distributed on an
--* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
--* KIND, either express or implied. See the License for the
--* specific language governing permissions and limitations
--* under the License.
--*
--*************************************************************/
--#include "../src/model/layer/dense.h"
--#include "gtest/gtest.h"
- #include "singa/singa_config.h"
-#include "singa_config.h"
--
--using singa::Dense;
-using singa::Shape;
--TEST(Dense, Setup) {
-- Dense dense;
-- EXPECT_EQ("Dense", dense.layer_type());
--
-- singa::LayerConf conf;
-- singa::DenseConf *denseconf = conf.mutable_dense_conf();
- denseconf->set_num_input(2);
-- denseconf->set_num_output(3);
-- denseconf->set_transpose(false);
- dense.Setup(conf);
- dense.Setup(Shape{2}, conf);
--
-- EXPECT_EQ(3u, dense.num_output());
-- EXPECT_EQ(2u, dense.num_input());
--}
--#ifdef USE_CBLAS
--TEST(Dense, ForwardCpp) {
-- Dense dense;
--
-- singa::LayerConf conf;
-- singa::DenseConf *denseconf = conf.mutable_dense_conf();
- denseconf->set_num_input(2);
-- denseconf->set_num_output(3);
-- denseconf->set_transpose(false);
- dense.Setup(conf);
- dense.Setup(Shape{2}, conf);
--
-- const size_t batchsize = 3, vdim = 2, hdim = 3;
-- const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
-- singa::Tensor in(singa::Shape{batchsize, vdim});
-- in.CopyDataFromHostPtr(x, batchsize * vdim);
--
-- // set weight
-- const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
-- singa::Tensor weight(singa::Shape{hdim, vdim});
-- weight.CopyDataFromHostPtr(we, hdim * vdim);
--
-- const float bia[hdim] = {1.0f, 1.0f, 1.0f};
-- singa::Tensor bias(singa::Shape{hdim});
-- bias.CopyDataFromHostPtr(bia, hdim);
--
-- dense.set_weight(weight);
-- dense.set_bias(bias);
--
-- singa::Tensor out1 = dense.Forward(singa::kTrain, in);
- const float *outptr1 = out1.data<const float *>();
- singa::CppCPU host(0, 1);
- const float *outptr1 = out1.data<float>();
-- EXPECT_EQ(9u, out1.Size());
-- for (int i = 0; i < 3; i++)
-- for (int j = 0; j < 3; j++)
-- EXPECT_FLOAT_EQ((x[i * 2 + 0] * we[j * 2 + 0] +
-- x[i * 2 + 1] * we[j * 2 + 1] + bia[j]),
-- outptr1[i * 3 + j]);
--}
--#endif // USE_CBLAS
-#ifdef USE_CUDA
--TEST(Dense, BackwardCpp) {
-- Dense dense;
--
-- singa::LayerConf conf;
-- singa::DenseConf *denseconf = conf.mutable_dense_conf();
- denseconf->set_num_input(2);
-- denseconf->set_num_output(3);
-- denseconf->set_transpose(false);
- dense.Setup(conf);
- dense.Setup(Shape{2}, conf);
--
-- const size_t batchsize = 3, vdim = 2, hdim = 3;
-- const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
- singa::CudaGPU cuda(0, 1);
-- singa::Tensor in(singa::Shape{batchsize, vdim});
-- in.CopyDataFromHostPtr(x, batchsize * vdim);
--
-- // set weight
-- const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
-- singa::Tensor weight(singa::Shape{hdim, vdim});
-- weight.CopyDataFromHostPtr(we, hdim * vdim);
--
-- const float bia[hdim] = {1.0f, 1.0f, 1.0f};
-- singa::Tensor bias(singa::Shape{hdim});
-- bias.CopyDataFromHostPtr(bia, hdim);
--
-- dense.set_weight(weight);
-- dense.set_bias(bias);
--
-- singa::Tensor out1 = dense.Forward(singa::kTrain, in);
--
-- // grad
-- const float dy[batchsize * hdim] = {1.0f, 1.0f, 1.0f, 2.0f, 2.0f,
-- 2.0f, 3.0f, 3.0f, 3.0f};
-- singa::Tensor grad(singa::Shape{batchsize, hdim});
-- grad.CopyDataFromHostPtr(dy, batchsize * hdim);
--
-- const auto ret = dense.Backward(singa::kTrain, grad);
- singa::CppCPU host(0, 1);
-- singa::Tensor in_grad = ret.first;
-- singa::Tensor dweight = ret.second.at(0);
-- singa::Tensor dbias = ret.second.at(1);
- const float *dx = in_grad.data<const float *>();
- const float *dx = in_grad.data<float>();
-- EXPECT_EQ(6u, in_grad.Size());
-- for (int i = 0; i < 3; i++)
-- for (int j = 0; j < 2; j++)
-- EXPECT_FLOAT_EQ(
-- (dy[i * 3 + 0] * we[0 * 2 + j] + dy[i * 3 + 1] * we[1 * 2 + j] +
-- dy[i * 3 + 2] * we[2 * 2 + j]),
-- dx[i * 2 + j]);
- const float *dweightx = dweight.data<const float *>();
- const float *dweightx = dweight.data<float>();
-- EXPECT_EQ(6u, dweight.Size());
-- for (int i = 0; i < 3; i++)
-- for (int j = 0; j < 2; j++)
-- EXPECT_FLOAT_EQ(
-- (dy[0 * 3 + i] * x[0 * 2 + j] + dy[1 * 3 + i] * x[1 * 2 + j] +
-- dy[2 * 3 + i] * x[2 * 2 + j]),
-- dweightx[i * 2 + j]);
- const float *dbiasx = dbias.data<const float *>();
- const float *dbiasx = dbias.data<float>();
-- EXPECT_EQ(3u, dbias.Size());
-- for (int i = 0; i < 3; i++)
-- EXPECT_FLOAT_EQ((dy[0 * 3 + i] + dy[1 * 3 + i] + dy[2 * 3 + i]), dbiasx[i]);
--}
-#endif
--
--#ifdef USE_CUDA
--TEST(Dense, ForwardCuda) {
-- Dense dense;
--
-- singa::LayerConf conf;
-- singa::DenseConf *denseconf = conf.mutable_dense_conf();
- denseconf->set_num_input(2);
-- denseconf->set_num_output(3);
-- denseconf->set_transpose(false);
- dense.Setup(conf);
- dense.Setup(Shape{2}, conf);
--
-- const size_t batchsize = 3, vdim = 2, hdim = 3;
-- const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
- auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
- singa::Tensor in(singa::Shape{batchsize, vdim}, cuda);
- singa::CudaGPU cuda(0, 1);
- singa::Tensor in(singa::Shape{batchsize, vdim}, &cuda);
-- in.CopyDataFromHostPtr(x, batchsize * vdim);
--
-- // set weight
-- const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
- singa::Tensor weight(singa::Shape{hdim, vdim}, cuda);
- singa::Tensor weight(singa::Shape{hdim, vdim}, &cuda);
-- weight.CopyDataFromHostPtr(we, hdim * vdim);
--
-- const float bia[hdim] = {1.0f, 1.0f, 1.0f};
- singa::Tensor bias(singa::Shape{hdim}, cuda);
- singa::Tensor bias(singa::Shape{hdim}, &cuda);
-- bias.CopyDataFromHostPtr(bia, hdim);
--
-- dense.set_weight(weight);
-- dense.set_bias(bias);
--
-- singa::Tensor out1 = dense.Forward(singa::kTrain, in);
- out1.ToHost();
- const float *outptr1 = out1.data<const float *>();
- singa::CppCPU host(0, 1);
- out1.ToDevice(&host);
- const float *outptr1 = out1.data<float>();
-- EXPECT_EQ(9u, out1.Size());
-- for (int i = 0; i < 3; i++)
-- for (int j = 0; j < 3; j++)
-- EXPECT_FLOAT_EQ((x[i * 2 + 0] * we[j * 2 + 0] +
-- x[i * 2 + 1] * we[j * 2 + 1] + bia[j]),
-- outptr1[i * 3 + j]);
--}
--TEST(Dense, BackwardCuda) {
-- Dense dense;
--
-- singa::LayerConf conf;
-- singa::DenseConf *denseconf = conf.mutable_dense_conf();
- denseconf->set_num_input(2);
-- denseconf->set_num_output(3);
-- denseconf->set_transpose(false);
- dense.Setup(conf);
- dense.Setup(Shape{2}, conf);
--
-- const size_t batchsize = 3, vdim = 2, hdim = 3;
-- const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
- auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
- singa::Tensor in(singa::Shape{batchsize, vdim}, cuda);
- singa::CudaGPU cuda(0, 1);
- singa::Tensor in(singa::Shape{batchsize, vdim}, &cuda);
-- in.CopyDataFromHostPtr(x, batchsize * vdim);
--
-- // set weight
-- const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
- singa::Tensor weight(singa::Shape{hdim, vdim}, cuda);
- singa::Tensor weight(singa::Shape{hdim, vdim}, &cuda);
-- weight.CopyDataFromHostPtr(we, hdim * vdim);
--
-- const float bia[hdim] = {1.0f, 1.0f, 1.0f};
- singa::Tensor bias(singa::Shape{hdim}, cuda);
- singa::Tensor bias(singa::Shape{hdim}, &cuda);
-- bias.CopyDataFromHostPtr(bia, hdim);
--
-- dense.set_weight(weight);
-- dense.set_bias(bias);
--
-- singa::Tensor out1 = dense.Forward(singa::kTrain, in);
--
-- // grad
-- const float dy[batchsize * hdim] = {1.0f, 1.0f, 1.0f, 2.0f, 2.0f,
-- 2.0f, 3.0f, 3.0f, 3.0f};
- singa::Tensor grad(singa::Shape{batchsize, hdim}, cuda);
- singa::Tensor grad(singa::Shape{batchsize, hdim}, &cuda);
-- grad.CopyDataFromHostPtr(dy, batchsize * hdim);
--
-- const auto ret = dense.Backward(singa::kTrain, grad);
- singa::CppCPU host(0, 1);
-- singa::Tensor in_grad = ret.first;
-- singa::Tensor dweight = ret.second.at(0);
-- singa::Tensor dbias = ret.second.at(1);
- in_grad.ToHost();
- const float *dx = in_grad.data<const float *>();
- in_grad.ToDevice(&host);
- const float *dx = in_grad.data<float>();
-- EXPECT_EQ(6u, in_grad.Size());
-- for (int i = 0; i < 3; i++)
-- for (int j = 0; j < 2; j++)
-- EXPECT_FLOAT_EQ(
-- (dy[i * 3 + 0] * we[0 * 2 + j] + dy[i * 3 + 1] * we[1 * 2 + j] +
-- dy[i * 3 + 2] * we[2 * 2 + j]),
-- dx[i * 2 + j]);
- dweight.ToHost();
- const float *dweightx = dweight.data<const float *>();
- dweight.ToDevice(&host);
- const float *dweightx = dweight.data<float>();
-- EXPECT_EQ(6u, dweight.Size());
-- for (int i = 0; i < 3; i++)
-- for (int j = 0; j < 2; j++)
-- EXPECT_FLOAT_EQ(
-- (dy[0 * 3 + i] * x[0 * 2 + j] + dy[1 * 3 + i] * x[1 * 2 + j] +
-- dy[2 * 3 + i] * x[2 * 2 + j]),
-- dweightx[i * 2 + j]);
- dbias.ToHost();
- const float *dbiasx = dbias.data<const float *>();
- dbias.ToDevice(&host);
- const float *dbiasx = dbias.data<float>();
-- EXPECT_EQ(3u, dbias.Size());
-- for (int i = 0; i < 3; i++)
-- EXPECT_FLOAT_EQ((dy[0 * 3 + i] + dy[1 * 3 + i] + dy[2 * 3 + i]), dbiasx[i]);
--}
--#endif
++/************************************************************
++*
++* Licensed to the Apache Software Foundation (ASF) under one
++* or more contributor license agreements. See the NOTICE file
++* distributed with this work for additional information
++* regarding copyright ownership. The ASF licenses this file
++* to you under the Apache License, Version 2.0 (the
++* "License"); you may not use this file except in compliance
++* with the License. You may obtain a copy of the License at
++*
++* http://www.apache.org/licenses/LICENSE-2.0
++*
++* Unless required by applicable law or agreed to in writing,
++* software distributed under the License is distributed on an
++* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
++* KIND, either express or implied. See the License for the
++* specific language governing permissions and limitations
++* under the License.
++*
++*************************************************************/
++#include "../src/model/layer/dense.h"
++#include "gtest/gtest.h"
++#include "singa/singa_config.h"
++
++using singa::Dense;
++using singa::Shape;
++TEST(Dense, Setup) {
++ Dense dense;
++ EXPECT_EQ("Dense", dense.layer_type());
++
++ singa::LayerConf conf;
++ singa::DenseConf *denseconf = conf.mutable_dense_conf();
++ denseconf->set_num_output(3);
++ denseconf->set_transpose(false);
++ dense.Setup(Shape{2}, conf);
++
++ EXPECT_EQ(3u, dense.num_output());
++ EXPECT_EQ(2u, dense.num_input());
++}
++#ifdef USE_CBLAS
++TEST(Dense, ForwardCpp) {
++ Dense dense;
++
++ singa::LayerConf conf;
++ singa::DenseConf *denseconf = conf.mutable_dense_conf();
++ denseconf->set_num_output(3);
++ denseconf->set_transpose(false);
++ dense.Setup(Shape{2}, conf);
++
++ const size_t batchsize = 3, vdim = 2, hdim = 3;
++ const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
++ singa::Tensor in(singa::Shape{batchsize, vdim});
++ in.CopyDataFromHostPtr(x, batchsize * vdim);
++
++ // set weight
++ const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
++ singa::Tensor weight(singa::Shape{hdim, vdim});
++ weight.CopyDataFromHostPtr(we, hdim * vdim);
++
++ const float bia[hdim] = {1.0f, 1.0f, 1.0f};
++ singa::Tensor bias(singa::Shape{hdim});
++ bias.CopyDataFromHostPtr(bia, hdim);
++
++ dense.set_weight(weight);
++ dense.set_bias(bias);
++
++ singa::Tensor out1 = dense.Forward(singa::kTrain, in);
++ const float *outptr1 = out1.data<float>();
++ EXPECT_EQ(9u, out1.Size());
++ for (int i = 0; i < 3; i++)
++ for (int j = 0; j < 3; j++)
++ EXPECT_FLOAT_EQ((x[i * 2 + 0] * we[j * 2 + 0] +
++ x[i * 2 + 1] * we[j * 2 + 1] + bia[j]),
++ outptr1[i * 3 + j]);
++}
++TEST(Dense, BackwardCpp) {
++ Dense dense;
++
++ singa::LayerConf conf;
++ singa::DenseConf *denseconf = conf.mutable_dense_conf();
++ denseconf->set_num_output(3);
++ denseconf->set_transpose(false);
++ dense.Setup(Shape{2}, conf);
++
++ const size_t batchsize = 3, vdim = 2, hdim = 3;
++ const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
++ singa::Tensor in(singa::Shape{batchsize, vdim});
++ in.CopyDataFromHostPtr(x, batchsize * vdim);
++
++ // set weight
++ const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
++ singa::Tensor weight(singa::Shape{hdim, vdim});
++ weight.CopyDataFromHostPtr(we, hdim * vdim);
++
++ const float bia[hdim] = {1.0f, 1.0f, 1.0f};
++ singa::Tensor bias(singa::Shape{hdim});
++ bias.CopyDataFromHostPtr(bia, hdim);
++
++ dense.set_weight(weight);
++ dense.set_bias(bias);
++
++ singa::Tensor out1 = dense.Forward(singa::kTrain, in);
++
++ // grad
++ const float dy[batchsize * hdim] = {1.0f, 1.0f, 1.0f, 2.0f, 2.0f,
++ 2.0f, 3.0f, 3.0f, 3.0f};
++ singa::Tensor grad(singa::Shape{batchsize, hdim});
++ grad.CopyDataFromHostPtr(dy, batchsize * hdim);
++
++ const auto ret = dense.Backward(singa::kTrain, grad);
++ singa::Tensor in_grad = ret.first;
++ singa::Tensor dweight = ret.second.at(0);
++ singa::Tensor dbias = ret.second.at(1);
++ const float *dx = in_grad.data<float>();
++ EXPECT_EQ(6u, in_grad.Size());
++ for (int i = 0; i < 3; i++)
++ for (int j = 0; j < 2; j++)
++ EXPECT_FLOAT_EQ(
++ (dy[i * 3 + 0] * we[0 * 2 + j] + dy[i * 3 + 1] * we[1 * 2 + j] +
++ dy[i * 3 + 2] * we[2 * 2 + j]),
++ dx[i * 2 + j]);
++ const float *dweightx = dweight.data<float>();
++ EXPECT_EQ(6u, dweight.Size());
++ for (int i = 0; i < 3; i++)
++ for (int j = 0; j < 2; j++)
++ EXPECT_FLOAT_EQ(
++ (dy[0 * 3 + i] * x[0 * 2 + j] + dy[1 * 3 + i] * x[1 * 2 + j] +
++ dy[2 * 3 + i] * x[2 * 2 + j]),
++ dweightx[i * 2 + j]);
++ const float *dbiasx = dbias.data<float>();
++ EXPECT_EQ(3u, dbias.Size());
++ for (int i = 0; i < 3; i++)
++ EXPECT_FLOAT_EQ((dy[0 * 3 + i] + dy[1 * 3 + i] + dy[2 * 3 + i]), dbiasx[i]);
++}
++#endif // USE_CBLAS
++
++#ifdef USE_CUDA
++TEST(Dense, ForwardCuda) {
++ Dense dense;
++
++ singa::LayerConf conf;
++ singa::DenseConf *denseconf = conf.mutable_dense_conf();
++ denseconf->set_num_output(3);
++ denseconf->set_transpose(false);
++ dense.Setup(Shape{2}, conf);
++
++ const size_t batchsize = 3, vdim = 2, hdim = 3;
++ const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor in(singa::Shape{batchsize, vdim}, cuda);
++ in.CopyDataFromHostPtr(x, batchsize * vdim);
++
++ // set weight
++ const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
++ singa::Tensor weight(singa::Shape{hdim, vdim}, cuda);
++ weight.CopyDataFromHostPtr(we, hdim * vdim);
++
++ const float bia[hdim] = {1.0f, 1.0f, 1.0f};
++ singa::Tensor bias(singa::Shape{hdim}, cuda);
++ bias.CopyDataFromHostPtr(bia, hdim);
++
++ dense.set_weight(weight);
++ dense.set_bias(bias);
++
++ singa::Tensor out1 = dense.Forward(singa::kTrain, in);
++ out1.ToHost();
++ const float *outptr1 = out1.data<float>();
++ EXPECT_EQ(9u, out1.Size());
++ for (int i = 0; i < 3; i++)
++ for (int j = 0; j < 3; j++)
++ EXPECT_FLOAT_EQ((x[i * 2 + 0] * we[j * 2 + 0] +
++ x[i * 2 + 1] * we[j * 2 + 1] + bia[j]),
++ outptr1[i * 3 + j]);
++}
++TEST(Dense, BackwardCuda) {
++ Dense dense;
++
++ singa::LayerConf conf;
++ singa::DenseConf *denseconf = conf.mutable_dense_conf();
++ denseconf->set_num_output(3);
++ denseconf->set_transpose(false);
++ dense.Setup(Shape{2}, conf);
++
++ const size_t batchsize = 3, vdim = 2, hdim = 3;
++ const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor in(singa::Shape{batchsize, vdim}, cuda);
++ in.CopyDataFromHostPtr(x, batchsize * vdim);
++
++ // set weight
++ const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
++ singa::Tensor weight(singa::Shape{hdim, vdim}, cuda);
++ weight.CopyDataFromHostPtr(we, hdim * vdim);
++
++ const float bia[hdim] = {1.0f, 1.0f, 1.0f};
++ singa::Tensor bias(singa::Shape{hdim}, cuda);
++ bias.CopyDataFromHostPtr(bia, hdim);
++
++ dense.set_weight(weight);
++ dense.set_bias(bias);
++
++ singa::Tensor out1 = dense.Forward(singa::kTrain, in);
++
++ // grad
++ const float dy[batchsize * hdim] = {1.0f, 1.0f, 1.0f, 2.0f, 2.0f,
++ 2.0f, 3.0f, 3.0f, 3.0f};
++ singa::Tensor grad(singa::Shape{batchsize, hdim}, cuda);
++ grad.CopyDataFromHostPtr(dy, batchsize * hdim);
++
++ const auto ret = dense.Backward(singa::kTrain, grad);
++ singa::Tensor in_grad = ret.first;
++ singa::Tensor dweight = ret.second.at(0);
++ singa::Tensor dbias = ret.second.at(1);
++ in_grad.ToHost();
++ const float *dx = in_grad.data<float>();
++ EXPECT_EQ(6u, in_grad.Size());
++ for (int i = 0; i < 3; i++)
++ for (int j = 0; j < 2; j++)
++ EXPECT_FLOAT_EQ(
++ (dy[i * 3 + 0] * we[0 * 2 + j] + dy[i * 3 + 1] * we[1 * 2 + j] +
++ dy[i * 3 + 2] * we[2 * 2 + j]),
++ dx[i * 2 + j]);
++ dweight.ToHost();
++ const float *dweightx = dweight.data<float>();
++ EXPECT_EQ(6u, dweight.Size());
++ for (int i = 0; i < 3; i++)
++ for (int j = 0; j < 2; j++)
++ EXPECT_FLOAT_EQ(
++ (dy[0 * 3 + i] * x[0 * 2 + j] + dy[1 * 3 + i] * x[1 * 2 + j] +
++ dy[2 * 3 + i] * x[2 * 2 + j]),
++ dweightx[i * 2 + j]);
++ dbias.ToHost();
++ const float *dbiasx = dbias.data<float>();
++ EXPECT_EQ(3u, dbias.Size());
++ for (int i = 0; i < 3; i++)
++ EXPECT_FLOAT_EQ((dy[0 * 3 + i] + dy[1 * 3 + i] + dy[2 * 3 + i]), dbiasx[i]);
++}
++#endif
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_flatten.cc
----------------------------------------------------------------------
diff --cc test/singa/test_flatten.cc
index 0000000,2a77272..25e00c4
mode 000000,100644..100644
--- a/test/singa/test_flatten.cc
+++ b/test/singa/test_flatten.cc
@@@ -1,0 -1,145 +1,143 @@@
+ /************************************************************
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *************************************************************/
+
+ #include "../src/model/layer/flatten.h"
+ #include "gtest/gtest.h"
+
+ using singa::Flatten;
+ using singa::Shape;
+ // Checks the reported layer type and that Setup() stores the configured axis.
+ TEST(Flatten, Setup) {
+ Flatten layer;
+ EXPECT_EQ("Flatten", layer.layer_type());
+
+ // Request flattening at axis 1 through the layer configuration.
+ singa::LayerConf layer_conf;
+ layer_conf.mutable_flatten_conf()->set_axis(1);
+
+ layer.Setup(Shape{2}, layer_conf);
+ EXPECT_EQ(1, layer.Axis());
+ }
+
+ // Flattens a {2, 1, 3, 2} host tensor at axis 3 and expects a {6, 2} result
+ // that holds the same values in the same order.
+ TEST(Flatten, ForwardCPU) {
+ const float input[] = {1.f, 2.f, 3.f, -2.f, -3.f, -4.f,
+ 1.5f, -1.5f, 0.f, -0.5f, -2.f, -1.f};
+ const size_t count = sizeof(input) / sizeof(input[0]);
+ singa::Shape in_shape = {2, 1, 3, 2};
+ singa::Tensor src(in_shape);
+ src.CopyDataFromHostPtr<float>(input, count);
+
+ // Setup() receives {1, 3, 2}, i.e. the input shape without its leading dim.
+ Flatten layer;
+ singa::LayerConf layer_conf;
+ layer_conf.mutable_flatten_conf()->set_axis(3);
+ layer.Setup(Shape{1, 3, 2}, layer_conf);
+
+ singa::Tensor result = layer.Forward(singa::kTrain, src);
+ EXPECT_EQ(count, result.Size());
+ EXPECT_EQ(6u, result.shape(0));
+ EXPECT_EQ(2u, result.shape(1));
+ const float *result_data = result.data<float>();
+ for (size_t idx = 0; idx != count; ++idx)
+ EXPECT_FLOAT_EQ(input[idx], result_data[idx]);
+ }
+
+ // Runs Forward followed by Backward at axis 2 on the host and expects the
+ // returned gradient to have the original {2, 1, 3, 2} shape and values.
+ TEST(Flatten, BackwardCPU) {
+ // The forward output is fed back as the output gradient, so only the shape
+ // of the input really matters here.
+ const float grad_vals[] = {1.f, 2.f, 3.f, -2.f, -3.f, -4.f,
+ 1.5f, -1.5f, 0.f, -0.5f, -2.f, -1.f};
+ const size_t count = sizeof(grad_vals) / sizeof(grad_vals[0]);
+ singa::Tensor src(singa::Shape{2, 1, 3, 2});
+ src.CopyDataFromHostPtr<float>(grad_vals, count);
+
+ Flatten layer;
+ singa::LayerConf layer_conf;
+ layer_conf.mutable_flatten_conf()->set_axis(2);
+ layer.Setup(Shape{1, 3, 2}, layer_conf);
+
+ singa::Tensor flattened = layer.Forward(singa::kTrain, src);
+ const auto backward = layer.Backward(singa::kTrain, flattened);
+ const float *grad_data = backward.first.data<float>();
+ EXPECT_EQ(count, backward.first.Size());
+ // The input gradient must be restored to the 4-d input shape.
+ EXPECT_EQ(2u, backward.first.shape(0));
+ EXPECT_EQ(1u, backward.first.shape(1));
+ EXPECT_EQ(3u, backward.first.shape(2));
+ EXPECT_EQ(2u, backward.first.shape(3));
+ for (size_t idx = 0; idx != count; ++idx)
+ EXPECT_FLOAT_EQ(grad_vals[idx], grad_data[idx]);
+ }
+
+ #ifdef USE_CUDA
+ TEST(Flatten, ForwardGPU) {  // Forward pass at axis 3 with the input tensor created on a CudaGPU device.
+ const float x[] = {1.f, 2.f, 3.f, -2.f, -3.f, -4.f,
+ 1.5f, -1.5f, 0.f, -0.5f, -2.f, -1.f};
+ size_t n = sizeof(x) / sizeof(float);  // number of input elements
- singa::CudaGPU cuda(0, 1);
- singa::Tensor in(singa::Shape{2, 1, 3, 2}, &cuda);
++ auto cuda = std::make_shared<singa::CudaGPU>();
++ singa::Tensor in(singa::Shape{2, 1, 3, 2}, cuda);
+ in.CopyDataFromHostPtr<float>(x, n);
+
+ int axis = 3;
+ Flatten flt;
+ singa::LayerConf conf;
+ singa::FlattenConf *flattenconf = conf.mutable_flatten_conf();
+ flattenconf->set_axis(axis);
+ flt.Setup(Shape{1, 3, 2}, conf);  // input shape without its leading dimension
+
+ singa::Tensor out = flt.Forward(singa::kTrain, in);
- singa::CppCPU host(0, 1);
- out.ToDevice(&host);
++ out.ToHost();  // bring the result to the host before reading it via data<float>()
+ EXPECT_EQ(n, out.Size());
+ EXPECT_EQ(6u, out.shape(0));  // expected: 2 * 1 * 3 = 6 rows
+ EXPECT_EQ(2u, out.shape(1));
+ const float *yptr = out.data<float>();
+ for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(x[i], yptr[i]);  // values must be unchanged
+ }
+
+ TEST(Flatten, BackwardGPU) {  // Forward then Backward at axis 2 with the input tensor created on a CudaGPU device.
+ // the forward output is passed back as the output_grad for backward
+ // note that only the shape of input really matters
+ const float dy[] = {1.f, 2.f, 3.f, -2.f, -3.f, -4.f,
+ 1.5f, -1.5f, 0.f, -0.5f, -2.f, -1.f};
+ size_t n = sizeof(dy) / sizeof(float);  // number of elements
- singa::CudaGPU cuda(0, 1);
- singa::Tensor in(singa::Shape{2, 1, 3, 2}, &cuda);
++ auto cuda = std::make_shared<singa::CudaGPU>();
++ singa::Tensor in(singa::Shape{2, 1, 3, 2}, cuda);
+ in.CopyDataFromHostPtr<float>(dy, n);
+
+ int axis = 2;
+ Flatten flt;
+ singa::LayerConf conf;
+ singa::FlattenConf *flattenconf = conf.mutable_flatten_conf();
+ flattenconf->set_axis(axis);
+ flt.Setup(Shape{1, 3, 2}, conf);  // input shape without its leading dimension
+
+ singa::Tensor out = flt.Forward(singa::kTrain, in);
+ const auto ret = flt.Backward(singa::kTrain, out);
- singa::CppCPU host(0, 1);
+ singa::Tensor in_diff = ret.first;  // gradient w.r.t. the layer input
- in_diff.ToDevice(&host);
++ in_diff.ToHost();  // bring the gradient to the host before reading it
+ const float *xptr = in_diff.data<float>();
+ EXPECT_EQ(n, in_diff.Size());
+ EXPECT_EQ(2u, in_diff.shape(0));  // the gradient must have the 4-d input shape {2, 1, 3, 2}
+ EXPECT_EQ(1u, in_diff.shape(1));
+ EXPECT_EQ(3u, in_diff.shape(2));
+ EXPECT_EQ(2u, in_diff.shape(3));
+ for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(dy[i], xptr[i]);
+ }
+ #endif // USE_CUDA
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_initializer.cc
----------------------------------------------------------------------
diff --cc test/singa/test_initializer.cc
index 0000000,e99cd79..4631af2
mode 000000,100644..100644
--- a/test/singa/test_initializer.cc
+++ b/test/singa/test_initializer.cc
@@@ -1,0 -1,148 +1,148 @@@
+ /**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ #include "singa/model/initializer.h"
+ #include "gtest/gtest.h"
+
+ // Fills a 10-element host tensor through init::Constant configured with the
+ // value 3.1 and checks every element.
+ TEST(Initializer, Constant) {
+ singa::init::Constant filler;
+ const size_t count = 10;
+ singa::Tensor target(singa::Shape{count});
+
+ singa::FillerConf filler_conf;
+ filler_conf.set_value(3.1f);
+ filler.Setup(filler_conf);
+ filler.Fill(&target);
+
+ const float* values = target.data<float>();
+ for (size_t idx = 0; idx != count; ++idx) {
+ EXPECT_FLOAT_EQ(values[idx], 3.1f);
+ }
+ }
+
+
+ // Fills 1000 host values through init::Gaussian configured with mean 0.11 and
+ // std 0.01, then checks the sample mean and standard deviation to within 1e-3.
+ TEST(Initializer, Gaussian) {
+ singa::init::Gaussian filler;
+ const size_t count = 1000;
+ singa::Tensor samples(singa::Shape{count});
+
+ singa::FillerConf filler_conf;
+ filler_conf.set_mean(0.11f);
+ filler_conf.set_std(0.01f);
+ filler.Setup(filler_conf);
+ filler.Fill(&samples);
+
+ const float* values = samples.data<float>();
+ float sample_mean = 0.0f;
+ for (size_t idx = 0; idx != count; ++idx) {
+ sample_mean += values[idx];
+ }
+ sample_mean /= count;
+ EXPECT_NEAR(sample_mean, 0.11f, 1e-3);
+
+ // Standard deviation around the sample mean (divides by count, not count - 1).
+ float sample_std = 0.0f;
+ for (size_t idx = 0; idx != count; ++idx) {
+ sample_std += (values[idx] - sample_mean) * (values[idx] - sample_mean);
+ }
+ sample_std /= count;
+ sample_std = sqrt(sample_std);
+ EXPECT_NEAR(sample_std, 0.01f, 1e-3);
+ }
+
+ #ifdef USE_CUDA
+ TEST(Initializer, ConstantCUDA) {  // Constant fill on CUDA tensors, configured once via FillerConf and once via the constructor.
+ singa::init::Constant x;
- singa::CudaGPU dev;
++ auto dev = std::make_shared<singa::CudaGPU>();
+ size_t n = 10;
- singa::Tensor t(singa::Shape{n}, &dev);
++ singa::Tensor t(singa::Shape{n}, dev);
+ singa::FillerConf conf;
+ conf.set_value(3.1f);
+ x.Setup(conf);
+ x.Fill(&t);
+ t.ToHost();  // move the data to the host before reading it via data<float>()
+ const float* xPtr = t.data<float>();
+ for (size_t i = 0; i < n; i++)
+ EXPECT_FLOAT_EQ(xPtr[i], 3.1f);
+
+
+ singa::init::Constant y(-0.1f);  // second case: value passed to the constructor, no Setup() call
- singa::Tensor s(singa::Shape{n}, &dev);
++ singa::Tensor s(singa::Shape{n}, dev);
+ y.Fill(&s);
+ s.ToHost();
+ const float* sPtr = s.data<float>();
+ for (size_t i = 0; i < n; i++)
+ EXPECT_FLOAT_EQ(sPtr[i], -0.1f);
+ }
+
+
+ TEST(Initializer, GaussianCUDA) {
++ // Checks init::Gaussian on CUDA tensors twice: configured through FillerConf
++ // (mean 0.11, std 0.01) and through the constructor (1.5f, 0.1f). Each case
++ // compares the sample mean / standard deviation with the requested values.
+ singa::init::Gaussian x;
- singa::CudaGPU dev;
++ auto dev = std::make_shared<singa::CudaGPU>();
+ size_t n = 1000;
- singa::Tensor t(singa::Shape{n}, &dev);
++ singa::Tensor t(singa::Shape{n}, dev);
+ singa::FillerConf conf;
+ conf.set_mean(0.11f);
+ conf.set_std(0.01f);
+ x.Setup(conf);
+ x.Fill(&t);
+ t.ToHost();
+ const float* tPtr = t.data<float>();
+ float mean = 0.0f, std = 0.0f;
+ for (size_t i = 0; i < n; i++)
+ mean += tPtr[i];
+ mean /= n;
+ EXPECT_NEAR(mean, 0.11f, 1e-2);
+ for (size_t i = 0; i < n; i++)
+ std += (tPtr[i] - mean) * (tPtr[i] - mean);
+ std /= n;
+ std = sqrt(std);
+ EXPECT_NEAR(std, 0.01f, 1e-2);
+
+
+ singa::init::Gaussian y(1.5f, 0.1f);
- singa::Tensor s(singa::Shape{n}, &dev);
++ singa::Tensor s(singa::Shape{n}, dev);
+ y.Fill(&s);
+ s.ToHost();
+ const float* sPtr = s.data<float>();
++ // Restart the accumulators: without this the mean and std of `t` computed
++ // above would be added into the statistics of `s`.
++ mean = 0.0f;
++ std = 0.0f;
+ for (size_t i = 0; i < n; i++)
+ mean += sPtr[i];
+ mean /= n;
+ EXPECT_NEAR(mean, 1.5f, 0.1f);
+ for (size_t i = 0; i < n; i++)
+ std += (sPtr[i] - mean) * (sPtr[i] - mean);
+ std /= n;
+ std = sqrt(std);
+ EXPECT_NEAR(std, 0.1f, 0.1f);
+ }
+
+ TEST(Initializer, XavierCUDA) {
++ // Fills an m x n CUDA tensor and checks that the mean is close to 0 and that
++ // all values lie strictly inside (-scale, scale), scale = sqrt(6 / (m + n)).
++ // NOTE(review): despite the test name, the filler used here is
++ // singa::init::Constant, not a Xavier initializer — confirm whether that is
++ // intended.
+ singa::init::Constant x;
- singa::CudaGPU dev;
++ auto dev = std::make_shared<singa::CudaGPU>();
+ size_t m = 30, n=40;
- singa::Tensor t(singa::Shape{m, n}, &dev);
++ singa::Tensor t(singa::Shape{m, n}, dev);
+ x.Fill(&t);
+ t.ToHost();
+ const float* xPtr = t.data<float>();
+ float mean = 0.0f;
+ float high = -100.0f, low = 100.0f;
++ // Visit every element of the m x n tensor. The bound used to be n, which
++ // inspected only the first n values although the mean is divided by m * n.
++ for (size_t i = 0; i < m * n; i++) {
+ mean += xPtr[i];
+ if (high < xPtr[i])
+ high = xPtr[i];
+ if (low > xPtr[i])
+ low = xPtr[i];
+ }
+ mean /= m * n;
+ EXPECT_NEAR(mean, 0, 1e-2);
+ float scale = sqrt(6.0f / (m + n));
+ EXPECT_LT(high, scale);
+ EXPECT_GT(low, -scale);
+ }
+
+ #endif
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_memory.cc
----------------------------------------------------------------------
diff --cc test/singa/test_memory.cc
index 90fc99a,0000000..b0df226
mode 100644,000000..100644
--- a/test/singa/test_memory.cc
+++ b/test/singa/test_memory.cc
@@@ -1,111 -1,0 +1,104 @@@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
- *
++*
+* http://www.apache.org/licenses/LICENSE-2.0
- *
++*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied. See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include "singa/utils/logging.h"
+#include "singa/core/memory.h"
+#include "singa/singa_config.h"
++#include "singa/utils/timer.h"
+#include <sys/time.h>
+
+#ifdef USE_CUDA
+TEST(CnmemPool, PoolInit) {  // Smoke test: InitPool() without arguments on a default-constructed CnMemPool.
- singa::CnMemPool pool;
- pool.InitPool();
++ singa::CnMemPool pool;
++ pool.InitPool();
+}
+
+TEST(CnmemPool, PoolInitAll) {
- singa::CnMemPool pool;
- int nDevices;
- cudaGetDeviceCount(&nDevices);
- CHECK_GE(nDevices,1);
- pool.InitPool(nDevices,1000000U,0);
++ // Initializes the pool with the number of CUDA devices reported by the
++ // runtime; the test aborts via CHECK_GE when no device is available.
++ singa::CnMemPool pool;
++ // Start from 0 so that CHECK_GE reads a defined value even if
++ // cudaGetDeviceCount fails without writing to nDevices.
++ int nDevices = 0;
++ cudaGetDeviceCount(&nDevices);
++ CHECK_GE(nDevices, 1);
++ // NOTE(review): the 2nd/3rd arguments look like an initial size and flags —
++ // confirm against CnMemPool::InitPool.
++ pool.InitPool(nDevices, 32, 0);
+}
+
+TEST(CnmemPool, UsePool) {
- singa::CnMemPool pool;
- pool.InitPool();
- int numOfTests = 10;
- int numOfWriteVsRead = 3;
- int allocSize = 1000000U;
- for(int i = 0; i < numOfTests; i++) {
- int** memPtrs = new int*[numOfWriteVsRead];
- for(int j = 0; j < numOfWriteVsRead; j++) {
- pool.Malloc((void**)(&memPtrs[j]), allocSize);
- }
- pool.Free(memPtrs[0]);
- delete[] memPtrs;
- }
++ // Repeatedly allocates a small batch of blocks from a CnMemPool and hands
++ // every block back to the pool.
++ singa::CnMemPool pool;
++ pool.InitPool();
++ int numOfTests = 10;
++ int numOfWriteVsRead = 3;
++ int allocSize = 32;
++ for (int i = 0; i < numOfTests; i++) {
++ int** memPtrs = new int* [numOfWriteVsRead];
++ for (int j = 0; j < numOfWriteVsRead; j++) {
++ pool.Malloc((void**)(&memPtrs[j]), allocSize);
++ }
++ // Free every block that was allocated; previously only memPtrs[0] was
++ // released and the remaining blocks were leaked on each iteration.
++ for (int j = 0; j < numOfWriteVsRead; j++) {
++ pool.Free(memPtrs[j]);
++ }
++ delete[] memPtrs;
++ }
+}
+
+TEST(CudaMemPool, UsePool) {
- singa::CudaMemPool pool;
- int numOfTests = 10;
- int numOfWriteVsRead = 3;
- int allocSize = 1000000U;
- for(int i = 0; i < numOfTests; i++) {
- int** memPtrs = new int*[numOfWriteVsRead];
- for(int j = 0; j < numOfWriteVsRead; j++) {
- pool.Malloc((void**)(&memPtrs[j]), allocSize);
- }
- pool.Free(memPtrs[0]);
- delete[] memPtrs;
- }
++ // Repeatedly allocates a small batch of blocks from a CudaMemPool and hands
++ // every block back to the pool.
++ singa::CudaMemPool pool;
++ int numOfTests = 10;
++ int numOfWriteVsRead = 3;
++ int allocSize = 32;
++ for (int i = 0; i < numOfTests; i++) {
++ int** memPtrs = new int* [numOfWriteVsRead];
++ for (int j = 0; j < numOfWriteVsRead; j++) {
++ pool.Malloc((void**)(&memPtrs[j]), allocSize);
++ }
++ // Free every block that was allocated; previously only memPtrs[0] was
++ // released and the remaining blocks were leaked on each iteration.
++ for (int j = 0; j < numOfWriteVsRead; j++) {
++ pool.Free(memPtrs[j]);
++ }
++ delete[] memPtrs;
++ }
+}
+
+TEST(MemPool, CompareCudaCnmem) {
- singa::CudaMemPool cudaPool;
- singa::CnMemPool cnPool;
- cnPool.InitPool();
++ // Times numOfTests Malloc/Free round trips through a CnMemPool and through a
++ // CudaMemPool and expects the CudaMemPool run to take at least as long.
++ singa::CudaMemPool cudaPool;
++ singa::CnMemPool cnPool;
++ cnPool.InitPool();
++
++ int numOfTests = 5000;
++ int allocSize = 32;
+
- int numOfTests = 5000;
- int allocSize = 1000000U;
- struct timeval start,end;
- double t1,t2;
++ singa::DeviceMemPool* pool = NULL;
++ pool = &cnPool;
+
- singa::DeviceMemPool* pool = NULL;
- pool = &cnPool;
-
- gettimeofday(&start,NULL);
- for(int i = 0; i < numOfTests; i++) {
- int* memPtrs = NULL;
- pool->Malloc((void**)&memPtrs, allocSize);
- pool->Free(memPtrs);
- }
- gettimeofday(&end,NULL);
-
- t1 = start.tv_sec * 1000 + start.tv_usec/1000;
- t2 = end.tv_sec * 1000 + end.tv_usec/1000;
- LOG(INFO) << "cnmem memory time: " << t2-t1 << " ms" << std::endl;
++ singa::Timer tick;
++ for (int i = 0; i < numOfTests; i++) {
++ int* memPtrs = NULL;
++ pool->Malloc((void**)&memPtrs, allocSize);
++ pool->Free(memPtrs);
++ }
++ // Read the elapsed time BEFORE restarting the timer. Per
++ // singa/utils/timer.h, Tick() resets the reference time point, so calling
++ // Tick() right before Elapsed() measured (almost) nothing.
++ int cn_time = tick.Elapsed();
+
- pool = &cudaPool;
- gettimeofday(&start,NULL);
- for(int i = 0; i < numOfTests; i++) {
- int* memPtrs = NULL;
- pool->Malloc((void**)&memPtrs, allocSize);
- pool->Free(memPtrs);
- }
- gettimeofday(&end,NULL);
-
- t1 = start.tv_sec * 1000 + start.tv_usec/1000;
- t2 = end.tv_sec * 1000 + end.tv_usec/1000;
- LOG(INFO) << "cuda memory time: " << t2-t1 << " ms" << std::endl;
++ pool = &cudaPool;
++ // Restart the timer so that cuda_time covers only the CudaMemPool loop.
++ tick.Tick();
++ for (int i = 0; i < numOfTests; i++) {
++ int* memPtrs = NULL;
++ pool->Malloc((void**)&memPtrs, allocSize);
++ pool->Free(memPtrs);
++ }
++ int cuda_time = tick.Elapsed();
++ EXPECT_GE(cuda_time, cn_time);
+}
- #endif // USE_CUDA
++#endif // USE_CUDA
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_mse.cc
----------------------------------------------------------------------
diff --cc test/singa/test_mse.cc
index d2c5125,928be9d..788652f
--- a/test/singa/test_mse.cc
+++ b/test/singa/test_mse.cc
@@@ -22,8 -22,9 +22,8 @@@
#include "gtest/gtest.h"
#include "singa/core/tensor.h"
#include "singa/core/device.h"
- #include "../src/model/loss/mse.h"
- #include "singa/singa_config.h"
+ #include "singa/model/loss.h"
-#include "singa_config.h"
+
using singa::Tensor;
class TestMSE : public ::testing::Test {
protected:
@@@ -68,14 -69,14 +68,14 @@@ TEST_F(TestMSE, CppBackward)
#endif
#ifdef USE_CUDA
TEST_F(TestMSE, CudaForward) {
- singa::MSE mse;
- singa::CudaGPU dev;
- p.ToDevice(&dev);
- t.ToDevice(&dev);
- Tensor loss = mse.Forward(p, t);
+ singa::MSE* mse = new singa::MSE();
+ auto dev = std::make_shared<singa::CudaGPU>();
+ p.ToDevice(dev);
+ t.ToDevice(dev);
+ Tensor loss = mse->Forward(p, t);
loss.ToHost();
- auto ldat = loss.data<const float*>();
+ auto ldat = loss.data<float>();
for (size_t i = 0, k = 0; i < loss.Size(); i++) {
float l = 0.f;
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_nesterov.cc
----------------------------------------------------------------------
diff --cc test/singa/test_nesterov.cc
index 0000000,35b2b4d..73f69f4
mode 000000,100644..100644
--- a/test/singa/test_nesterov.cc
+++ b/test/singa/test_nesterov.cc
@@@ -1,0 -1,101 +1,101 @@@
+ /************************************************************
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *************************************************************/
+
+ #include "gtest/gtest.h"
+ #include "singa/model/optimizer.h"
-#include "singa_config.h"
++#include "singa/singa_config.h"
+
+ // Applies two Nesterov steps to a 4-element host tensor and compares each
+ // result with an update recomputed by hand from the same inputs.
+ TEST(Nesterov, ApplyCPU) {
+ singa::Nesterov optimizer;
+ float lr = 0.1f;
+ // Momentum schedule: 0.5 up to step 5, 0.9 afterwards.
+ auto momentum = [](int step) { return step <= 5 ? 0.5f : 0.9f; };
+ optimizer.SetMomentumGenerator(momentum);
+ const float init_vals[4] = {0.1, 0.2, 0.3, 0.4};
+ const float grad_vals[4] = {0.01, 0.02, 0.03, 0.04};
+
+ singa::Tensor value(singa::Shape{4}), grad(singa::Shape{4});
+ value.CopyDataFromHostPtr(init_vals, 4);
+ grad.CopyDataFromHostPtr(grad_vals, 4);
+
+ // Step 0: the history starts as lr * g.
+ optimizer.Apply(0, lr, "xx", grad, &value);
+ singa::Tensor after_step0 = value.Clone();
+ const float* got0 = after_step0.data<float>();
+ float history[4], update[4];
+ for (int k = 0; k < 4; ++k) {
+ history[k] = grad_vals[k] * lr;
+ update[k] = history[k] * (1 + momentum(0));
+ EXPECT_FLOAT_EQ(got0[k], init_vals[k] - update[k]);
+ }
+
+ // Step 1: same gradient again, now with a non-empty history.
+ grad.CopyDataFromHostPtr(grad_vals, 4);
+ optimizer.Apply(1, lr, "xx", grad, &value);
+ singa::Tensor after_step1 = value.Clone();
+ const float* got1 = after_step1.data<float>();
+ for (int k = 0; k < 4; ++k) {
+ const float prev_history = history[k];
+ history[k] = history[k] * momentum(1) + grad_vals[k] * lr;
+ update[k] = history[k] * (1 + momentum(1)) - prev_history * momentum(1);
+ EXPECT_FLOAT_EQ(got1[k], got0[k] - update[k]);
+ }
+ }
+
+ #ifdef USE_CUDA
+ TEST(Nesterov, ApplyCUDA) {  // Two Nesterov steps on CUDA tensors, compared with an update recomputed on the host.
+ singa::Nesterov nesterov;
+ float lr = 0.1f;
+ auto func = [](int step) { return step <= 5 ? 0.5f : 0.9f; };  // momentum: 0.5 up to step 5, then 0.9
+ nesterov.SetMomentumGenerator(func);
+ const float v[4] = {0.1, 0.2, 0.3, 0.4};
+ const float g[4] = {0.01, 0.02, 0.03, 0.04};
+
- singa::CudaGPU dev;
- singa::Tensor value(singa::Shape{4}, &dev), grad(singa::Shape{4}, &dev);
++ auto dev = std::make_shared<singa::CudaGPU>();
++ singa::Tensor value(singa::Shape{4}, dev), grad(singa::Shape{4}, dev);
+ value.CopyDataFromHostPtr(v, 4);
+ grad.CopyDataFromHostPtr(g, 4);
+
+ nesterov.Apply(0, lr, "xx", grad, &value);
+
+ singa::Tensor v1 = value.Clone();
+ v1.ToHost();  // read the updated values from a host-side clone
+ const float* newv1 = v1.data<float>();
+ float history[4], tmp[4];
+ for (int i = 0; i < 4; ++i) {
+ history[i] = g[i] * lr;  // expected history after step 0
+ tmp[i] = history[i] * (1 + func(0));  // expected update subtracted from v
+ }
+ for (int i = 0; i < 4; ++i) EXPECT_FLOAT_EQ(newv1[i], v[i] - tmp[i]);
+
+ grad.CopyDataFromHostPtr(g, 4);
+ nesterov.Apply(1, lr, "xx", grad, &value);
+ singa::Tensor v2 = value.Clone();
+ v2.ToHost();
+ const float* newv2 = v2.data<float>();
+ for (int i = 0; i < 4; ++i) {
+ tmp[i] = history[i];  // keep the previous history for the correction term
+ history[i] = history[i] * func(1) + g[i] * lr;
+ tmp[i] = history[i] * (1 + func(1)) - tmp[i] * func(1);
+ }
+
+ for (int i = 0; i < 4; ++i) EXPECT_FLOAT_EQ(newv2[i], newv1[i] - tmp[i]);
+ }
+ #endif
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_prelu.cc
----------------------------------------------------------------------
diff --cc test/singa/test_prelu.cc
index 0000000,fee7c5b..dbf5ca6
mode 000000,100644..100644
--- a/test/singa/test_prelu.cc
+++ b/test/singa/test_prelu.cc
@@@ -1,0 -1,247 +1,245 @@@
+ /************************************************************
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *************************************************************/
+
+ #include "../src/model/layer/prelu.h"
+ #include "gtest/gtest.h"
-#include "singa_config.h"
++#include "singa/singa_config.h"
+
+ using singa::PReLU;
+ using singa::Shape;
+ // Checks the reported layer type and that Setup() stores the channel_shared
+ // flag and the data format taken from PReLUConf.
+ TEST(PReLU, Setup) {
+ PReLU layer;
+ EXPECT_EQ("PReLU", layer.layer_type());
+
+ singa::LayerConf layer_conf;
+ singa::PReLUConf *prelu_conf = layer_conf.mutable_prelu_conf();
+ prelu_conf->set_format("NHWC");
+ prelu_conf->set_channel_shared(true);
+
+ layer.Setup(Shape{4}, layer_conf);
+ EXPECT_EQ(true, layer.Channel_shared());
+ EXPECT_EQ("NHWC", layer.Format());
+ }
+
+ TEST(PReLU, ForwardCPU) {
+ // Runs PReLU forward on a host tensor of shape {batchsize, h, w, c} with one
+ // negative slope per channel and compares the output with
+ // max(x, 0) + slope * min(x, 0) recomputed below.
+ const float x[] = {1.f, 2.f, 3.f, -2.f, -3.f, -1.f,
+ -1.f, 2.f, -1.f, -2.f, -2.f, -1.f};
+ size_t n = sizeof(x) / sizeof(float);
+ size_t batchsize = 2, c = 3, h = 2, w = 1;
+ singa::Tensor in(singa::Shape{batchsize, h, w, c});
+ in.CopyDataFromHostPtr<float>(x, n);
+
+ PReLU prelu;
+ singa::LayerConf conf;
+ singa::PReLUConf *preluconf = conf.mutable_prelu_conf();
+ preluconf->set_channel_shared(false);
+ preluconf->set_format("NHWC");
+ prelu.Setup(Shape{h, w, c}, conf);
+
+ const float neg_slope[] = {0.25f, 0.5f, 0.75f};
+ singa::Tensor a(singa::Shape{c});
+ a.CopyDataFromHostPtr<float>(neg_slope, c);
+ prelu.Set_a(a);
+
+ singa::Tensor out = prelu.Forward(singa::kTrain, in);
+ const float *yptr = out.data<float>();
+ EXPECT_EQ(n, out.Size());
+
+ // Reference output; `pos` picks the slope for the channel of element i.
+ float *y = new float[n];
+ size_t div_factor = prelu.Channel_shared() ? c : 1;
+ if (prelu.Format() == "NCHW") {
+ for (size_t i = 0; i < n; i++) {
+ size_t pos = i / (h * w) % c / div_factor;
+ y[i] = std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f);
+ }
+ } else if (prelu.Format() == "NHWC") {
+ for (size_t i = 0; i < n; i++) {
+ size_t pos = i % c / div_factor;
+ y[i] = std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f);
+ }
+ }
+ for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(y[i], yptr[i]);
+ // Release the reference buffer, which used to be leaked. EXPECT_* failures
+ // are non-fatal, so this line is always reached.
+ delete[] y;
+ }
+
+ TEST(PReLU, BackwardCPU) {
+ // Runs PReLU forward and backward on a host tensor of shape
+ // {batchsize, c, h, w} with one negative slope per channel, then compares the
+ // returned input gradient and slope gradient with values recomputed below.
+ const float x[] = {1.f, 2.f, 3.f, -2.f, -3.f, -1.f,
+ -1.f, 2.f, -1.f, -2.f, -2.f, -1.f};
+ size_t n = sizeof(x) / sizeof(float);
+ size_t batchsize = 2, c = 3, h = 2, w = 1;
+ singa::Tensor in(singa::Shape{batchsize, c, h, w});
+ in.CopyDataFromHostPtr<float>(x, n);
+
+ PReLU prelu;
+ singa::LayerConf conf;
+ singa::PReLUConf *preluconf = conf.mutable_prelu_conf();
+ preluconf->set_channel_shared(false);
+ preluconf->set_format("NCHW");
+ prelu.Setup(Shape{c, h, w}, conf);
+
+ const float neg_slope[] = {0.25f, 0.5f, 0.75f};
+ singa::Tensor a(singa::Shape{c});
+ a.CopyDataFromHostPtr<float>(neg_slope, c);
+ prelu.Set_a(a);
+
+ singa::Tensor out = prelu.Forward(singa::kTrain, in);
+
+ const float grad[] = {1.f, 2.f, -2.f, -1.f, -1.f, -3.f,
+ 2.f, -2.f, 1.f, 1.f, -2.f, 0.f};
+ singa::Tensor out_diff(singa::Shape{batchsize, c, h, w});
+ out_diff.CopyDataFromHostPtr<float>(grad, n);
+ const auto ret = prelu.Backward(singa::kTrain, out_diff);
+ const float *xptr = ret.first.data<float>();
+ const float *aptr = ret.second.at(0).data<float>();
+ // Reference gradients: dx per element, da accumulated per slope parameter.
+ // NOTE(review): the dx reference multiplies grad by
+ // max(x, 0) + slope * min(x, 0); the analytical PReLU derivative would be
+ // grad * (x > 0 ? 1 : slope) — confirm this mirrors the layer on purpose.
+ float *dx = new float[n];
+ size_t div_factor = prelu.Channel_shared() ? c : 1;
+ size_t params = prelu.Channel_shared() ? 1 : c;
+ float da[] = {0.f, 0.f, 0.f};
+ if (prelu.Format() == "NCHW") {
+ for (size_t i = 0; i < n; i++) {
+ size_t pos = i / (h * w) % c / div_factor;
+ dx[i] = grad[i] *
+ (std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f));
+ }
+ for (size_t i = 0; i < n; i++) {
+ size_t pos = i / (h * w) % c / div_factor;
+ da[pos] += grad[i] * std::min(x[i], 0.f);
+ }
+ } else if (prelu.Format() == "NHWC") {
+ for (size_t i = 0; i < n; i++) {
+ size_t pos = i % c / div_factor;
+ dx[i] = grad[i] *
+ (std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f));
+ }
+ for (size_t i = 0; i < n; i++) {
+ size_t pos = i % c / div_factor;
+ da[pos] += grad[i] * std::min(x[i], 0.f);
+ }
+ }
+ for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(dx[i], xptr[i]);
+ for (size_t i = 0; i < params; i++) EXPECT_FLOAT_EQ(da[i], aptr[i]);
+ // Release the reference buffer, which used to be leaked. EXPECT_* failures
+ // are non-fatal, so this line is always reached.
+ delete[] dx;
+ }
+
+ #ifdef USE_CUDA
+ TEST(PReLU, ForwardGPU) {
++ // Runs PReLU forward on a CUDA tensor of shape {batchsize, h, w, c} with one
++ // negative slope per channel, copies the output to the host and compares it
++ // with max(x, 0) + slope * min(x, 0) recomputed below.
+ const float x[] = {1.f, 2.f, 3.f, -2.f, -3.f, -1.f,
+ -1.f, 2.f, -1.f, -2.f, -2.f, -1.f};
+ size_t n = sizeof(x) / sizeof(float);
+ size_t batchsize = 2, c = 3, h = 2, w = 1;
- singa::CudaGPU cuda(0, 1);
- singa::Tensor in(singa::Shape{batchsize, h, w, c}, &cuda);
++ auto cuda = std::make_shared<singa::CudaGPU>();
++ singa::Tensor in(singa::Shape{batchsize, h, w, c}, cuda);
+ in.CopyDataFromHostPtr<float>(x, n);
+
+ PReLU prelu;
+ singa::LayerConf conf;
+ singa::PReLUConf *preluconf = conf.mutable_prelu_conf();
+ preluconf->set_channel_shared(false);
+ preluconf->set_format("NHWC");
+ prelu.Setup(Shape{h, w, c}, conf);
+
+ const float neg_slope[] = {0.25f, 0.5f, 0.75f};
- singa::Tensor a(singa::Shape{c}, &cuda);
++ singa::Tensor a(singa::Shape{c}, cuda);
+ a.CopyDataFromHostPtr<float>(neg_slope, c);
+ prelu.Set_a(a);
+
+ singa::Tensor out = prelu.Forward(singa::kTrain, in);
- singa::CppCPU host(0, 1);
- out.ToDevice(&host);
++ out.ToHost();
+ const float *yptr = out.data<float>();
+ EXPECT_EQ(n, out.Size());
+
++ // Reference output; `pos` picks the slope for the channel of element i.
+ float *y = new float[n];
+ size_t div_factor = prelu.Channel_shared() ? c : 1;
+ if (prelu.Format() == "NCHW") {
+ for (size_t i = 0; i < n; i++) {
+ size_t pos = i / (h * w) % c / div_factor;
+ y[i] = std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f);
+ }
+ } else if (prelu.Format() == "NHWC") {
+ for (size_t i = 0; i < n; i++) {
+ size_t pos = i % c / div_factor;
+ y[i] = std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f);
+ }
+ }
+ for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(y[i], yptr[i]);
++ // Release the reference buffer, which used to be leaked. EXPECT_* failures
++ // are non-fatal, so this line is always reached.
++ delete[] y;
+ }
+
+ TEST(PReLU, BackwardGPU) {
++ // Runs PReLU forward and backward on CUDA tensors of shape
++ // {batchsize, c, h, w} with one negative slope per channel, copies both
++ // gradients to the host and compares them with values recomputed below.
+ const float x[] = {1.f, 2.f, 3.f, -2.f, -3.f, -1.f,
+ -1.f, 2.f, -1.f, -2.f, -2.f, -1.f};
+ size_t n = sizeof(x) / sizeof(float);
+ size_t batchsize = 2, c = 3, h = 2, w = 1;
- singa::CudaGPU cuda(0, 1);
- singa::Tensor in(singa::Shape{batchsize, c, h, w}, &cuda);
++ auto cuda = std::make_shared<singa::CudaGPU>();
++ singa::Tensor in(singa::Shape{batchsize, c, h, w}, cuda);
+ in.CopyDataFromHostPtr<float>(x, n);
+
+ PReLU prelu;
+ singa::LayerConf conf;
+ singa::PReLUConf *preluconf = conf.mutable_prelu_conf();
+ preluconf->set_channel_shared(false);
+ preluconf->set_format("NCHW");
+ prelu.Setup(Shape{c, h, w}, conf);
+
+ const float neg_slope[] = {0.25f, 0.5f, 0.75f};
- singa::Tensor a(singa::Shape{c}, &cuda);
++ singa::Tensor a(singa::Shape{c}, cuda);
+ a.CopyDataFromHostPtr<float>(neg_slope, c);
+ prelu.Set_a(a);
+
+ singa::Tensor out = prelu.Forward(singa::kTrain, in);
+ const float grad[] = {1.f, 2.f, -2.f, -1.f, -1.f, -3.f,
+ 2.f, -2.f, 1.f, 1.f, -2.f, 0.f};
- singa::Tensor out_diff(singa::Shape{batchsize, c, h, w}, &cuda);
++ singa::Tensor out_diff(singa::Shape{batchsize, c, h, w}, cuda);
+ out_diff.CopyDataFromHostPtr<float>(grad, n);
+ const auto ret = prelu.Backward(singa::kTrain, out_diff);
+
+ singa::Tensor in_diff = ret.first;
- singa::CppCPU host(0, 1);
- in_diff.ToDevice(&host);
++ in_diff.ToHost();
+ const float *xptr = in_diff.data<float>();
+ singa::Tensor a_diff = ret.second.at(0);
- a_diff.ToDevice(&host);
++ a_diff.ToHost();
+ const float *aptr = a_diff.data<float>();
++ // Reference gradients: dx per element, da accumulated per slope parameter.
++ // NOTE(review): the dx reference multiplies grad by
++ // max(x, 0) + slope * min(x, 0); the analytical PReLU derivative would be
++ // grad * (x > 0 ? 1 : slope) — confirm this mirrors the layer on purpose.
+ float *dx = new float[n];
+ size_t div_factor = prelu.Channel_shared() ? c : 1;
+ size_t params = prelu.Channel_shared() ? 1 : c;
+ float da[] = {0.f, 0.f, 0.f};
+ if (prelu.Format() == "NCHW") {
+ for (size_t i = 0; i < n; i++) {
+ size_t pos = i / (h * w) % c / div_factor;
+ dx[i] = grad[i] *
+ (std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f));
+ }
+ for (size_t i = 0; i < n; i++) {
+ size_t pos = i / (h * w) % c / div_factor;
+ da[pos] += grad[i] * std::min(x[i], 0.f);
+ }
+ } else if (prelu.Format() == "NHWC") {
+ for (size_t i = 0; i < n; i++) {
+ size_t pos = i % c / div_factor;
+ dx[i] = grad[i] *
+ (std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f));
+ }
+ for (size_t i = 0; i < n; i++) {
+ size_t pos = i % c / div_factor;
+ da[pos] += grad[i] * std::min(x[i], 0.f);
+ }
+ }
+ for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(dx[i], xptr[i]);
+ for (size_t i = 0; i < params; i++) EXPECT_FLOAT_EQ(da[i], aptr[i]);
++ // Release the reference buffer, which used to be leaked. EXPECT_* failures
++ // are non-fatal, so this line is always reached.
++ delete[] dx;
+ }
+ #endif
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_rmsprop.cc
----------------------------------------------------------------------
diff --cc test/singa/test_rmsprop.cc
index 0000000,004a9b6..18de9c3
mode 000000,100644..100644
--- a/test/singa/test_rmsprop.cc
+++ b/test/singa/test_rmsprop.cc
@@@ -1,0 -1,106 +1,105 @@@
+ /************************************************************
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *************************************************************/
+
+ #include "gtest/gtest.h"
+ #include "singa/model/optimizer.h"
-#include "singa_config.h"
+ #include <cmath>
+
+ // Applies two RMSProp steps (rho 0.9, delta 1e-8) to a 4-element host tensor
+ // and compares each result, within 1e-5, with an update recomputed by hand.
+ TEST(RMSProp, ApplyCPU) {
+ singa::RMSProp optimizer;
+ float lr = 0.1f;
+ float rho = 0.9;
+ const float init_vals[4] = {0.1, 0.2, 0.3, 0.4};
+ const float grad_vals[4] = {0.01, 0.02, 0.03, 0.04};
+
+ singa::OptimizerConf opt_conf;
+ opt_conf.set_rho(rho);
+ opt_conf.set_delta(1E-8);
+
+ singa::Tensor value(singa::Shape{4}), grad(singa::Shape{4});
+ value.CopyDataFromHostPtr(init_vals, 4);
+ grad.CopyDataFromHostPtr(grad_vals, 4);
+
+ optimizer.Setup(opt_conf);
+
+ // Step 0: the running average of squared gradients starts from zero.
+ optimizer.Apply(0, lr, "xx", grad, &value);
+ singa::Tensor after_step0 = value.Clone();
+ const float* got0 = after_step0.data<float>();
+ float history[4];
+ for (int k = 0; k < 4; ++k) {
+ history[k] = grad_vals[k] * grad_vals[k] * (1 - rho);
+ EXPECT_NEAR(got0[k],
+ init_vals[k] - grad_vals[k] * lr / sqrt(history[k] + (float)1E-8),
+ 1e-5);
+ }
+
+ // Step 1: same gradient again, decaying the previous average by rho.
+ grad.CopyDataFromHostPtr(grad_vals, 4);
+ optimizer.Apply(1, lr, "xx", grad, &value);
+ singa::Tensor after_step1 = value.Clone();
+ const float* got1 = after_step1.data<float>();
+ for (int k = 0; k < 4; ++k) {
+ history[k] = history[k] * rho + grad_vals[k] * grad_vals[k] * (1 - rho);
+ EXPECT_NEAR(got1[k],
+ got0[k] - lr * grad_vals[k] / sqrt(history[k] + (float)1E-8),
+ 1e-5);
+ }
+ }
+
+ #ifdef USE_CUDA
+ TEST(RMSProp, ApplyCUDA) {  // Two RMSProp steps on CUDA tensors, compared within 1e-5 with an update recomputed on the host.
+ singa::RMSProp rmsprop;
+ float lr = 0.1f;
+ float rho = 0.02;
+ const float v[4] = {0.1, 0.2, 0.3, 0.4};
+ const float g[4] = {0.01, 0.02, 0.03, 0.04};
+
+ singa::OptimizerConf conf;
+ conf.set_rho(rho);
+ conf.set_delta(1e-8);
+
- singa::CudaGPU dev;
- singa::Tensor value(singa::Shape{4}, &dev), grad(singa::Shape{4}, &dev);
++ auto dev = std::make_shared<singa::CudaGPU>();
++ singa::Tensor value(singa::Shape{4}, dev), grad(singa::Shape{4}, dev);
+ value.CopyDataFromHostPtr(v, 4);
+ grad.CopyDataFromHostPtr(g, 4);
+
+ rmsprop.Setup(conf);
+ rmsprop.Apply(0, lr, "xx", grad, &value);
+
+ singa::Tensor v1 = value.Clone();
+ v1.ToHost();  // read the updated values from a host-side clone
+ const float* newv1 = v1.data<float>();
+ float history[4];
+ for (int i = 0; i < 4; ++i) history[i] = g[i] * g[i] * (1 - rho);  // expected running average after step 0
+ for (int i = 0; i < 4; ++i)
+ EXPECT_NEAR(newv1[i], v[i] - lr * g[i] / sqrt(history[i] + conf.delta()),
+ 1e-5);
+
+ grad.CopyDataFromHostPtr(g, 4);
+ rmsprop.Apply(1, lr, "xx", grad, &value);
+ singa::Tensor v2 = value.Clone();
+ v2.ToHost();
+ const float* newv2 = v2.data<float>();
+ for (int i = 0; i < 4; ++i)
+ history[i] = history[i] * rho + g[i] * g[i] * (1 - rho);  // decay the previous average by rho
+
+ for (int i = 0; i < 4; ++i)
+ EXPECT_NEAR(newv2[i],
+ newv1[i] - lr * g[i] / sqrt(history[i] + conf.delta()), 1e-5);
+ }
+ #endif
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_sgd.cc
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_tensor.cc
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --cc test/singa/test_tensor_math.cc
index 0f998c0,a40a848..f8d0351
--- a/test/singa/test_tensor_math.cc
+++ b/test/singa/test_tensor_math.cc
@@@ -253,12 -507,21 +507,21 @@@ TEST_F(TestTensorMath, SumColumnsCpp)
}
#endif
#ifdef USE_CUDA
+ TEST_F(TestTensorMath, L2Cuda) {
- singa::CudaGPU dev;
- Tensor t(Shape{3, 2}, &dev);
++ auto dev = std::make_shared<singa::CudaGPU>();
++ Tensor t(Shape{3, 2}, dev);
+ t.CopyDataFromHostPtr(dat1, 6);
+ float l2 = t.L2();
+ float target = 0.0f;
+ for (size_t i = 0; i < t.Size(); i++) target += dat1[i] * dat1[i];
+ EXPECT_FLOAT_EQ(l2, sqrt(target));
+ }
TEST_F(TestTensorMath, MultCuda) {
const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
- singa::CudaGPU dev;
- Tensor t(Shape{2, 2}, &dev);
+ auto dev = std::make_shared<singa::CudaGPU>();
+ Tensor t(Shape{2, 2}, dev);
t.CopyDataFromHostPtr(x, 4);
- d.ToDevice(&dev);
+ d.ToDevice(dev);
d.CopyDataFromHostPtr(dat1, 6);
Tensor C = Mult(d, t);
C.ToHost();
@@@ -302,20 -565,18 +565,20 @@@
EXPECT_FLOAT_EQ(oPtr[i * 4 + j], x[i]);
}
}
- d.ToHost();
- p.ToHost();
++ d.ToHost();
++ p.ToHost();
}
TEST_F(TestTensorMath, AddColumnCuda) {
const float x[3] = {1.0f, 2.0f, 3.0f};
- auto dev = std::make_shared<singa::CudaGPU>();
- singa::CudaGPU dev;
- Tensor t(Shape{3}, &dev);
++ auto dev = std::make_shared<singa::CudaGPU>();
+ Tensor t(Shape{3}, dev);
t.CopyDataFromHostPtr(x, 3);
d.CopyDataFromHostPtr(dat1, 6);
- d.ToDevice(&dev);
+ d.ToDevice(dev);
AddColumn(t, &d);
d.ToHost();
- const float *xptr = d.data<const float *>();
+ const float *xptr = d.data<float>();
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 2; j++) {
EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] + x[i]);
@@@ -323,17 -584,16 +586,16 @@@
}
}
-
TEST_F(TestTensorMath, SubColumnCuda) {
const float x[3] = {1.0f, 2.0f, 3.0f};
- auto dev = std::make_shared<singa::CudaGPU>();
- singa::CudaGPU dev;
- Tensor t(Shape{3}, &dev);
++ auto dev = std::make_shared<singa::CudaGPU>();
+ Tensor t(Shape{3}, dev);
t.CopyDataFromHostPtr(x, 3);
d.CopyDataFromHostPtr(dat1, 6);
- d.ToDevice(&dev);
+ d.ToDevice(dev);
SubColumn(t, &d);
d.ToHost();
- const float *xptr = d.data<const float *>();
+ const float *xptr = d.data<float>();
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 2; j++) {
EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] - x[i]);
@@@ -357,14 -617,14 +619,14 @@@ TEST_F(TestTensorMath, MultColumnCpp)
#ifdef USE_CUDA
TEST_F(TestTensorMath, MultColumnCuda) {
const float x[3] = {1.0f, 2.0f, 3.0f};
- auto dev = std::make_shared<singa::CudaGPU>();
- singa::CudaGPU dev;
- Tensor t(Shape{3}, &dev);
++ auto dev = std::make_shared<singa::CudaGPU>();
+ Tensor t(Shape{3}, dev);
t.CopyDataFromHostPtr(x, 3);
d.CopyDataFromHostPtr(dat1, 6);
- d.ToDevice(&dev);
+ d.ToDevice(dev);
MultColumn(t, &d);
d.ToHost();
- const float *xptr = d.data<const float *>();
+ const float *xptr = d.data<float>();
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 2; j++) {
EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] * x[i]);
@@@ -373,14 -633,14 +635,14 @@@
}
TEST_F(TestTensorMath, DivColumnCuda) {
const float x[3] = {1.0f, 2.0f, 3.0f};
- auto dev = std::make_shared<singa::CudaGPU>();
- singa::CudaGPU dev;
- Tensor t(Shape{3}, &dev);
++ auto dev = std::make_shared<singa::CudaGPU>();
+ Tensor t(Shape{3}, dev);
t.CopyDataFromHostPtr(x, 3);
d.CopyDataFromHostPtr(dat1, 6);
- d.ToDevice(&dev);
+ d.ToDevice(dev);
DivColumn(t, &d);
d.ToHost();
- const float *xptr = d.data<const float *>();
+ const float *xptr = d.data<float>();
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 2; j++) {
EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] / x[i]);
@@@ -389,14 -649,14 +651,14 @@@
}
TEST_F(TestTensorMath, AddRowCuda) {
const float x[2] = {1.1f, 2.1f};
- auto dev = std::make_shared<singa::CudaGPU>();
- singa::CudaGPU dev;
- Tensor t(Shape{2}, &dev);
++ auto dev = std::make_shared<singa::CudaGPU>();
+ Tensor t(Shape{2}, dev);
t.CopyDataFromHostPtr(x, 2);
d.CopyDataFromHostPtr(dat1, 6);
- d.ToDevice(&dev);
+ d.ToDevice(dev);
AddRow(t, &d);
d.ToHost();
- const float *xptr = d.data<const float *>();
+ const float *xptr = d.data<float>();
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 2; j++) {
EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] + x[j]);
@@@ -405,14 -665,14 +667,14 @@@
}
TEST_F(TestTensorMath, SubRowCuda) {
const float x[2] = {1.1f, 2.1f};
- auto dev = std::make_shared<singa::CudaGPU>();
- singa::CudaGPU dev;
- Tensor t(Shape{2}, &dev);
++ auto dev = std::make_shared<singa::CudaGPU>();
+ Tensor t(Shape{2}, dev);
t.CopyDataFromHostPtr(x, 2);
d.CopyDataFromHostPtr(dat1, 6);
- d.ToDevice(&dev);
+ d.ToDevice(dev);
SubRow(t, &d);
d.ToHost();
- const float *xptr = d.data<const float *>();
+ const float *xptr = d.data<float>();
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 2; j++) {
EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] - x[j]);
@@@ -421,14 -681,14 +683,14 @@@
}
TEST_F(TestTensorMath, MultRowCuda) {
const float x[2] = {1.1f, 2.1f};
- auto dev = std::make_shared<singa::CudaGPU>();
- singa::CudaGPU dev;
- Tensor t(Shape{2}, &dev);
++ auto dev = std::make_shared<singa::CudaGPU>();
+ Tensor t(Shape{2}, dev);
t.CopyDataFromHostPtr(x, 2);
d.CopyDataFromHostPtr(dat1, 6);
- d.ToDevice(&dev);
+ d.ToDevice(dev);
MultRow(t, &d);
d.ToHost();
- const float *xptr = d.data<const float *>();
+ const float *xptr = d.data<float>();
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 2; j++) {
EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] * x[j]);
@@@ -452,14 -712,14 +714,14 @@@ TEST_F(TestTensorMath, DivRowCpp)
#ifdef USE_CUDA
TEST_F(TestTensorMath, DivRowCuda) {
const float x[2] = {1.1f, 2.1f};
- auto dev = std::make_shared<singa::CudaGPU>();
- singa::CudaGPU dev;
- Tensor t(Shape{2}, &dev);
++ auto dev = std::make_shared<singa::CudaGPU>();
+ Tensor t(Shape{2}, dev);
t.CopyDataFromHostPtr(x, 2);
d.CopyDataFromHostPtr(dat1, 6);
- d.ToDevice(&dev);
+ d.ToDevice(dev);
DivRow(t, &d);
d.ToHost();
- const float *xptr = d.data<const float *>();
+ const float *xptr = d.data<float>();
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 2; j++) {
EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] / x[j]);
@@@ -467,13 -727,13 +729,13 @@@
}
}
TEST_F(TestTensorMath, SumRowsCuda) {
- auto dev = std::make_shared<singa::CudaGPU>();
- singa::CudaGPU dev;
- Tensor t(Shape{2}, &dev);
++ auto dev = std::make_shared<singa::CudaGPU>();
+ Tensor t(Shape{2}, dev);
d.CopyDataFromHostPtr(dat1, 6);
- d.ToDevice(&dev);
+ d.ToDevice(dev);
SumRows(d, &t);
t.ToHost();
- const float *tptr = t.data<const float *>();
+ const float *tptr = t.data<float>();
for (int i = 0; i < 2; i++) {
float tmp = 0;
for (int j = 0; j < 3; j++) {
@@@ -481,16 -741,15 +743,16 @@@
}
EXPECT_FLOAT_EQ(tptr[i], tmp);
}
- d.ToHost();
++ d.ToHost();
}
TEST_F(TestTensorMath, SumColumnCuda) {
- auto dev = std::make_shared<singa::CudaGPU>();
- singa::CudaGPU dev;
- Tensor t(Shape{3}, &dev);
++ auto dev = std::make_shared<singa::CudaGPU>();
+ Tensor t(Shape{3}, dev);
d.CopyDataFromHostPtr(dat1, 6);
- d.ToDevice(&dev);
+ d.ToDevice(dev);
SumColumns(d, &t);
t.ToHost();
- const float *tptr = t.data<const float *>();
+ const float *tptr = t.data<float>();
for (int i = 0; i < 3; i++) {
float tmp = 0;
for (int j = 0; j < 2; j++) {
@@@ -498,6 -757,120 +760,121 @@@
}
EXPECT_FLOAT_EQ(tptr[i], tmp);
}
- d.ToHost();
++ d.ToHost();
}
+
+ #endif
+
+ TEST_F(TestTensorMath, ConcatenateRowsCpp) {
+ d.CopyDataFromHostPtr<float>(dat1, 6);
+ e.CopyDataFromHostPtr<float>(dat2, 6);
+ const auto ret = singa::ConcatenateRows(vector<Tensor>{d, e});
+ EXPECT_EQ(ret.shape(0), d.shape(0) + e.shape(0));
+ EXPECT_EQ(ret.shape(1), d.shape(1));
+ const float *retPtr = ret.data<float>();
+ for (int i = 0; i < 6; i++) EXPECT_FLOAT_EQ(retPtr[i], dat1[i]);
+ for (int i = 0; i < 6; i++) EXPECT_FLOAT_EQ(retPtr[i + 6], dat2[i]);
+ }
+
+ TEST_F(TestTensorMath, ConcatenateColumnsCpp) {
+ d.CopyDataFromHostPtr<float>(dat1, 6);
+ e.CopyDataFromHostPtr<float>(dat2, 6);
+ const auto ret = singa::ConcatenateColumns(vector<Tensor>{d, e});
+ EXPECT_EQ(ret.shape(0), d.shape(0));
+ EXPECT_EQ(ret.shape(1), d.shape(1) + e.shape(1));
+
+ const float *retPtr = ret.data<float>();
+ for (int i = 0; i < 3; i++) {
+ for (int j = 0; j < 2; j++)
+ EXPECT_FLOAT_EQ(retPtr[i * 4 + j], dat1[i * 2 + j]);
+ for (int j = 0; j < 2; j++)
+ EXPECT_FLOAT_EQ(retPtr[i * 4 + 2 + j], dat2[i * 2 + j]);
+ }
+ }
+
+ TEST_F(TestTensorMath, CopyRowsCpp) {
+ const auto ret = singa::CopyRows(e, 1, 2);
+ EXPECT_EQ(ret.shape(0), 1u);
+ EXPECT_EQ(ret.shape(1), e.shape(1));
+ const float *retPtr = ret.data<float>();
+ for (size_t i = 0; i < ret.Size(); i++)
+ EXPECT_FLOAT_EQ(retPtr[i], dat1[1 * 2 + i]);
+ }
+
+ TEST_F(TestTensorMath, CopyColumnsCpp) {
+ a.Reshape(Shape{2, 3});
+ const auto ret = singa::CopyColumns(a, 1, 3);
+ EXPECT_EQ(ret.shape(0), a.shape(0));
+ EXPECT_EQ(ret.shape(1), 2u);
+ const float *retPtr = ret.data<float>();
+ for (size_t i = 0; i < ret.shape(0); i++)
+ for (size_t j = 0; j < ret.shape(1); j++)
+ EXPECT_FLOAT_EQ(retPtr[i * ret.shape(1) + j],
+ dat1[i * a.shape(1) + j + 1]);
+ }
+
+ #ifdef USE_CUDA
+
+ TEST_F(TestTensorMath, ConcatenateRowsCuda) {
- singa::CudaGPU dev;
- d.ToDevice(&dev);
- e.ToDevice(&dev);
++ auto dev = std::make_shared<singa::CudaGPU>();
++ d.ToDevice(dev);
++ e.ToDevice(dev);
+ d.CopyDataFromHostPtr<float>(dat1, 6);
+ e.CopyDataFromHostPtr<float>(dat2, 6);
+ auto ret = singa::ConcatenateRows(vector<Tensor>{d, e});
+ EXPECT_EQ(ret.shape(0), d.shape(0) + e.shape(0));
+ EXPECT_EQ(ret.shape(1), d.shape(1));
+ ret.ToHost();
+ const float *retPtr = ret.data<float>();
+ for (int i = 0; i < 6; i++) EXPECT_FLOAT_EQ(retPtr[i], dat1[i]);
+ for (int i = 0; i < 6; i++) EXPECT_FLOAT_EQ(retPtr[i + 6], dat2[i]);
+ }
+
+ TEST_F(TestTensorMath, ConcatenateColumnsCuda) {
- singa::CudaGPU dev;
- d.ToDevice(&dev);
- e.ToDevice(&dev);
++ auto dev = std::make_shared<singa::CudaGPU>();
++ d.ToDevice(dev);
++ e.ToDevice(dev);
+ d.CopyDataFromHostPtr<float>(dat1, 6);
+ e.CopyDataFromHostPtr<float>(dat2, 6);
+ auto ret = singa::ConcatenateColumns(vector<Tensor>{d, e});
+ ret.ToHost();
+ EXPECT_EQ(ret.shape(0), d.shape(0));
+ EXPECT_EQ(ret.shape(1), d.shape(1) + e.shape(1));
+
+ const float *retPtr = ret.data<float>();
+ for (int i = 0; i < 3; i++) {
+ for (int j = 0; j < 2; j++)
+ EXPECT_FLOAT_EQ(retPtr[i * 4 + j], dat1[i * 2 + j]);
+ for (int j = 0; j < 2; j++)
+ EXPECT_FLOAT_EQ(retPtr[i * 4 + 2 + j], dat2[i * 2 + j]);
+ }
+ }
+
+ TEST_F(TestTensorMath, CopyRowsCuda) {
- singa::CudaGPU dev;
- e.ToDevice(&dev);
++ auto dev = std::make_shared<singa::CudaGPU>();
++ e.ToDevice(dev);
+ auto ret = singa::CopyRows(e, 1, 2);
+ ret.ToHost();
+ EXPECT_EQ(ret.shape(0), 1u);
+ EXPECT_EQ(ret.shape(1), e.shape(1));
+ const float *retPtr = ret.data<float>();
+ for (size_t i = 0; i < ret.Size(); i++)
+ EXPECT_FLOAT_EQ(retPtr[i], dat1[1 * 2 + i]);
+ }
+
+ TEST_F(TestTensorMath, CopyColumnsCuda) {
- singa::CudaGPU dev;
++ auto dev = std::make_shared<singa::CudaGPU>();
+ a.Reshape(Shape{2, 3});
- a.ToDevice(&dev);
++ a.ToDevice(dev);
+ auto ret = singa::CopyColumns(a, 1, 3);
+ EXPECT_EQ(ret.shape(0), a.shape(0));
+ EXPECT_EQ(ret.shape(1), 2u);
+ ret.ToHost();
+ const float *retPtr = ret.data<float>();
+ for (size_t i = 0; i < ret.shape(0); i++)
+ for (size_t j = 0; j < ret.shape(1); j++)
+ EXPECT_FLOAT_EQ(retPtr[i * ret.shape(1) + j],
+ dat1[i * a.shape(1) + j + 1]);
+ }
+
#endif
[6/6] incubator-singa git commit: Merge PR #165 for CnMeM
Posted by wa...@apache.org.
Merge PR #165 for CnMeM
Fix bugs arising from the device type change (Device* -> std::shared_ptr<Device>).
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/dd08f413
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/dd08f413
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/dd08f413
Branch: refs/heads/dev
Commit: dd08f413015878365fed32e579c1b7f4ecc81270
Parents: 5651383 9abd791
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Fri Jun 24 13:41:02 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Fri Jun 24 14:49:56 2016 +0800
----------------------------------------------------------------------
.gitignore | 1 +
CMakeLists.txt | 9 +-
cmake/Dependencies.cmake | 12 +
cmake/Templates/singa_config.h.in | 2 +
cmake/Utils.cmake | 15 +
include/singa/core/common.h | 32 +-
include/singa/core/device.h | 18 +-
include/singa/core/memory.h | 62 +-
include/singa/core/tensor.h | 464 +++++++-------
include/singa/io/decoder.h | 56 ++
include/singa/io/encoder.h | 61 ++
include/singa/io/reader.h | 99 +++
include/singa/io/writer.h | 112 ++++
include/singa/model/initializer.h | 105 ++++
include/singa/model/layer.h | 48 +-
include/singa/model/loss.h | 47 ++
include/singa/model/optimizer.h | 59 +-
include/singa/utils/channel.h | 85 +++
include/singa/utils/timer.h | 2 +-
src/CMakeLists.txt | 18 +
src/core/device/cpp_cpu.cc | 2 +-
src/core/device/cuda_gpu.cc | 88 ++-
src/core/device/device.cc | 24 +-
src/core/memory/memory.cc | 83 +--
src/core/tensor/math_kernel.cu | 682 +++++++++++---------
src/core/tensor/math_kernel.h | 98 +--
src/core/tensor/tensor.cc | 896 ++++++++++++++++-----------
src/core/tensor/tensor_math.h | 418 +++++++------
src/core/tensor/tensor_math_cpp.h | 629 ++++++++++++++-----
src/core/tensor/tensor_math_cuda.h | 429 ++++++++++---
src/io/binfile_reader.cc | 113 ++++
src/io/binfile_writer.cc | 136 ++++
src/io/jpg2proto_encoder.cc | 83 +++
src/io/proto2jpg_decoder.cc | 75 +++
src/model/layer/activation.cc | 27 +-
src/model/layer/activation.h | 7 +-
src/model/layer/batchnorm.cc | 11 +-
src/model/layer/batchnorm.h | 12 +-
src/model/layer/convolution.cc | 13 +-
src/model/layer/convolution.h | 7 +-
src/model/layer/cudnn_activation.cc | 33 +-
src/model/layer/cudnn_activation.h | 11 +-
src/model/layer/cudnn_batchnorm.cc | 132 ++--
src/model/layer/cudnn_batchnorm.h | 40 +-
src/model/layer/cudnn_convolution.cc | 114 ++--
src/model/layer/cudnn_convolution.h | 4 +-
src/model/layer/cudnn_dropout.cc | 52 +-
src/model/layer/cudnn_dropout.h | 4 +-
src/model/layer/cudnn_lrn.cc | 78 +--
src/model/layer/cudnn_lrn.h | 32 +-
src/model/layer/cudnn_pooling.cc | 48 +-
src/model/layer/cudnn_pooling.h | 4 +-
src/model/layer/cudnn_softmax.cc | 62 +-
src/model/layer/cudnn_softmax.h | 11 +-
src/model/layer/dense.cc | 7 +-
src/model/layer/dense.h | 6 +-
src/model/layer/dropout.cc | 5 +-
src/model/layer/dropout.h | 7 +-
src/model/layer/flatten.cc | 53 ++
src/model/layer/flatten.h | 56 ++
src/model/layer/lrn.cc | 5 +-
src/model/layer/lrn.h | 13 +-
src/model/layer/pooling.cc | 13 +-
src/model/layer/pooling.h | 8 +-
src/model/layer/prelu.cc | 145 +++++
src/model/layer/prelu.h | 66 ++
src/model/layer/softmax.cc | 34 +-
src/model/layer/softmax.h | 11 +-
src/model/loss/mse.cc | 41 ++
src/model/loss/mse.h | 66 --
src/model/loss/softmax_cross_entropy.cc | 53 ++
src/model/metric/accuracy.h | 5 +-
src/model/optimizer/adagrad.cc | 41 ++
src/model/optimizer/nesterov.cc | 49 ++
src/model/optimizer/optimizer.cc | 2 +-
src/model/optimizer/rmsprop.cc | 45 ++
src/model/optimizer/sgd.cc | 10 +-
src/proto/core.proto | 7 +-
src/proto/io.proto | 37 ++
src/proto/model.proto | 26 +-
src/python/device.py | 82 +++
src/python/example_layer.py | 25 +
src/python/layer.py | 78 ++-
src/python/swig/core_device.i | 60 ++
src/python/swig/core_tensor.i | 263 ++++++++
src/python/swig/model_layer.i | 83 +++
src/python/swig/singa.i | 27 +
src/python/tensor.py | 370 +++++++++++
src/utils/channel.cc | 104 ++++
test/CMakeLists.txt | 3 +-
test/python/example_test_device.py | 36 ++
test/python/example_test_tensor.py | 179 ++++++
test/python/unittest_python.py | 139 +++++
test/singa/test_activation.cc | 13 +-
test/singa/test_adagrad.cc | 96 +++
test/singa/test_binfile_rw.cc | 95 +++
test/singa/test_channel.cc | 39 ++
test/singa/test_cpp_cpu.cc | 16 +-
test/singa/test_cross_entropy.cc | 116 ++++
test/singa/test_cudnn_activation.cc | 36 +-
test/singa/test_cudnn_batchnorm.cc | 59 +-
test/singa/test_cudnn_convolution.cc | 105 ++--
test/singa/test_cudnn_dropout.cc | 35 +-
test/singa/test_cudnn_lrn.cc | 28 +-
test/singa/test_cudnn_pooling.cc | 36 +-
test/singa/test_cudnn_softmax.cc | 130 ++--
test/singa/test_decoder.cc | 84 +++
test/singa/test_dense.cc | 480 +++++++-------
test/singa/test_dropout.cc | 17 +-
test/singa/test_flatten.cc | 143 +++++
test/singa/test_initializer.cc | 148 +++++
test/singa/test_memory.cc | 129 ++--
test/singa/test_mse.cc | 12 +-
test/singa/test_nesterov.cc | 101 +++
test/singa/test_prelu.cc | 245 ++++++++
test/singa/test_rmsprop.cc | 105 ++++
test/singa/test_sgd.cc | 32 +-
test/singa/test_softmax.cc | 36 +-
test/singa/test_tensor.cc | 14 +-
test/singa/test_tensor_math.cc | 505 +++++++++++++--
120 files changed, 8172 insertions(+), 2708 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/CMakeLists.txt
----------------------------------------------------------------------
diff --cc CMakeLists.txt
index c34b6ce,87b3a5d..7a5caf3
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@@ -10,22 -10,23 +10,23 @@@ LIST(APPEND CMAKE_MODULE_PATH ${PROJECT
IF(UNIX OR APPLE)
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wall")
ENDIF()
-
+ IF(CMAKE_BUILD_TYPE=Debug)
+ SET(NVCC_FLAG "${NVCC_FLAG} -g -G ")
+ ENDIF()
#message(STATUS "${CMAKE_CXX_FLAGS}")
-SET(SINGA_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/include;${PROJECT_BINARY_DIR}")
-#message(STATUS "include path: ${SINGA_INCLUDE_DIR}")
+SET(SINGA_INCLUDE_DIR
- #"${CMAKE_SOURCE_DIR}/include;${CMAKE_SOURCE_DIR}/lib/cnmem/lib;${CMAKE_SOURCE_DIR}/lib/cnmen/include;${PROJECT_BINARY_DIR}")
+ "${CMAKE_SOURCE_DIR}/include;${CMAKE_SOURCE_DIR}/lib/cnmem/include;${PROJECT_BINARY_DIR}")
- #message(STATUS "include path: ${SINGA_INCLUDE_DIR}")
INCLUDE_DIRECTORIES(${SINGA_INCLUDE_DIR})
- #OPTION(CPU_ONLY "use GPU libs" OFF)
OPTION(USE_CBLAS "Use CBlas libs" ON)
OPTION(USE_CUDA "Use Cuda libs" ON)
-OPTION(USE_CUDNN "Use Cudnn libs" ON)
+OPTION(USE_CUDNN "Use Cudnn libs" OFF)
OPTION(USE_OPENCV "Use opencv" OFF)
OPTION(USE_LMDB "Use LMDB libs" OFF)
+ OPTION(USE_PYTHON "Generate py wrappers" OFF)
INCLUDE("cmake/Dependencies.cmake")
+ INCLUDE("cmake/Utils.cmake")
ADD_DEFINITIONS(-DUSE_CMAKE)
CONFIGURE_FILE (
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/cmake/Dependencies.cmake
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/include/singa/core/common.h
----------------------------------------------------------------------
diff --cc include/singa/core/common.h
index e19022e,22a2b49..cb1bdca
--- a/include/singa/core/common.h
+++ b/include/singa/core/common.h
@@@ -20,7 -20,9 +20,9 @@@
#define SINGA_CORE_COMMON_H_
#include <random>
#include <chrono>
+#include "./singa/singa_config.h"
+ #include <atomic>
+ #include <memory>
-#include "./singa_config.h"
#include "singa/utils/logging.h"
#ifdef USE_CUDA
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/include/singa/core/device.h
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/include/singa/core/memory.h
----------------------------------------------------------------------
diff --cc include/singa/core/memory.h
index e4e1e63,db09043..c35f5d0
--- a/include/singa/core/memory.h
+++ b/include/singa/core/memory.h
@@@ -19,56 -19,10 +19,58 @@@
#ifndef SINGA_CORE_MEMORY_H_
#define SINGA_CORE_MEMORY_H_
+#include "cnmem.h"
++#include "singa/singa_config.h"
+#include <mutex>
+
namespace singa {
/// Manage device memory pool including garbage collection, memory opt.
class VirtualMemory {};
+class DeviceMemPool {
- public:
- virtual void InitPool() = 0;
- virtual void Malloc(void** ptr, const size_t size) = 0;
- virtual void Free(void* ptr) = 0;
- virtual ~DeviceMemPool(){};
++ public:
++ virtual void InitPool() = 0;
++ virtual void Malloc(void** ptr, const size_t size) = 0;
++ virtual void Free(void* ptr) = 0;
++ virtual ~DeviceMemPool(){};
+};
+
++#ifdef USE_CUDA
+class CnMemPool : public DeviceMemPool {
- public:
- int status = 1;
++ public:
++ int status = 1;
+
- void InitPool();
++ void InitPool();
+
- /// numDevices: total number of available GPU cards.
- /// initSize: all devices will be allocated with this size
- /// manager_flags: pool manager flag (one for all devices)
- /// flag = 0; default flag
- /// flag = 1: Prevent the manager from growing its memory consumption
- /// flag = 2; Prevent the manager from stealing memory.
- void InitPool(int numDevices, size_t initSize, unsigned flag);
++ /// numDevices: total number of available GPU cards.
++ /// initSize: all devices will be allocated with this size
++ /// flag: pool manager flag (one for all devices)
++ /// flag = 0: default flag
++ /// flag = 1: prevent the manager from growing its memory consumption
++ /// flag = 2: prevent the manager from stealing memory
++ void InitPool(int numDevices, size_t initSize, unsigned flag);
+
- void Malloc(void** ptr, const size_t size);
- void Free(void* ptr);
++ void Malloc(void** ptr, const size_t size);
++ void Free(void* ptr);
+
- // release all memory and set cnmem manager to unintialized
- ~CnMemPool();
++ // release all memory and set cnmem manager to uninitialized
++ ~CnMemPool();
+
- private:
- // whether the (global) memory pool has been initialized
- static bool initialized;
- // lock on the initialized variable
- static std::mutex mtx;
++ private:
++ // whether the (global) memory pool has been initialized
++ static bool initialized;
++ // lock on the initialized variable
++ static std::mutex mtx;
+};
+
+class CudaMemPool : public DeviceMemPool {
- public:
- void InitPool(){};
- void Malloc(void** ptr, const size_t size);
- void Free(void* ptr);
- ~CudaMemPool(){};
++ public:
++ void InitPool(){};
++ void Malloc(void** ptr, const size_t size);
++ void Free(void* ptr);
++ ~CudaMemPool(){};
+};
-
++#endif
} // namespace singa
#endif // SINGA_CORE_MEMORY_H_
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --cc include/singa/core/tensor.h
index 8f73047,eb72bd3..a4f42db
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@@ -65,27 -54,28 +54,29 @@@ class Tensor
public:
~Tensor();
Tensor();
- explicit Tensor(Shape &&shape, const DataType dtype = kFloat32);
- explicit Tensor(const Shape &shape, const DataType dtype = kFloat32);
- Tensor(Shape &&shape, Device *dev, const DataType dtype = kFloat32);
- Tensor(const Shape &shape, Device *dev, const DataType dtype = kFloat32);
+ explicit Tensor(Shape &&shape, DataType dtype = kFloat32);
+ explicit Tensor(const Shape &shape, DataType dtype = kFloat32);
+ Tensor(Shape &&shape, std::shared_ptr<Device> dev, DataType dtype = kFloat32);
- Tensor(const Shape &shape, std::shared_ptr<Device> dev, DataType dtype = kFloat32);
++ Tensor(const Shape &shape, std::shared_ptr<Device> dev,
++ DataType dtype = kFloat32);
/// Copy Tensor to share the internal data. No deep copy.
Tensor(const Tensor &from);
/// Copy Tensor to share the internal data. No deep copy.
Tensor(Tensor &&from);
- /// For functions in xx_math.cc to access the blob.
- /// Users should not operate against Blob directly.
- /// blob_ is allocated in constructors.
- Blob *blob() const { return blob_; }
+ /// For functions in xx_math.cc to access the block.
+ /// Users should not operate against Block directly.
+ /// block_ is allocated in constructors.
+ Block *block() const { return block_; }
+ void SetBlock(Block* block);
- Device *device() const { return device_; }
+ std::shared_ptr<Device> device() const { return device_; }
- /// Return immutable Tensor values with given type.
- template <typename DType>
- DType data() const {
- return static_cast<DType>(blob()->data());
+ /// return immutable Tensor values with given type.
+ template <typename SType>
+ const SType* data() const {
+ return static_cast<const SType*>(block()->data());
}
/// data type, including kFloat16, kFloat32, kInt
@@@ -192,13 -179,22 +180,22 @@@
protected:
bool transpose_ = false;
DataType data_type_ = kFloat32;
- Device *device_ = nullptr;
+ std::shared_ptr<Device> device_ = nullptr;
- /// Note: blob_ is allocated in lazy manner to avoid frequent malloc/free.
- /// If you want to get an allocated Blob, use blob() instead of blob_.
- Blob *blob_ = nullptr;
- Shape shape_;
+ /// Note: block_ is allocated in lazy manner to avoid frequent malloc/free.
+ /// If you want to get an allocated Block, use block() instead of block_.
+ Block *block_ = nullptr;
+ Shape shape_ = {};
};
+ typedef Shape::iterator ShapeIter;
+ inline size_t Product(const Shape &shape, int start = 0, size_t len = 0) {
+ if (len == 0) len = shape.size();
+ CHECK_LE(len, shape.size());
+ size_t v = 1;
+ for (unsigned int i = start; i < len; i++) v *= shape[i];
+ return v;
+ }
+
inline void CheckDataTypeAndLang(const Tensor &in1, const Tensor &in2) {
CHECK_EQ(in1.data_type(), in2.data_type());
CHECK_EQ(in1.device()->lang(), in2.device()->lang());
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/include/singa/model/layer.h
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/include/singa/model/loss.h
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/include/singa/utils/timer.h
----------------------------------------------------------------------
diff --cc include/singa/utils/timer.h
index a54829d,a54829d..bdd6c5c
--- a/include/singa/utils/timer.h
+++ b/include/singa/utils/timer.h
@@@ -19,7 -19,7 +19,7 @@@ class Timer
/// Return the duration since last call to Tick() or since the creation of
/// Timer. The template arg must be from Second or Millisecond or Hour.
/// The returned value is the count of the time metric.
-- template <typename T>
++ template <typename T = Milliseconds>
int Elapsed() const {
static_assert(std::is_same<T, Seconds>::value ||
std::is_same<T, Milliseconds>::value ||
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/CMakeLists.txt
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/core/device/cpp_cpu.cc
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/core/device/cuda_gpu.cc
----------------------------------------------------------------------
diff --cc src/core/device/cuda_gpu.cc
index 4da292f,5d4e1ed..5879c58
--- a/src/core/device/cuda_gpu.cc
+++ b/src/core/device/cuda_gpu.cc
@@@ -32,8 -32,8 +32,7 @@@ const cudaMemcpyKind copyKind[] = {cuda
cudaMemcpyDeviceToDevice};
CudaGPU::~CudaGPU() {
-- if (ctx_.cublas_handle)
-- CUBLAS_CHECK(cublasDestroy(ctx_.cublas_handle));
++ if (ctx_.cublas_handle) CUBLAS_CHECK(cublasDestroy(ctx_.cublas_handle));
if (ctx_.curand_generator)
CURAND_CHECK(curandDestroyGenerator(ctx_.curand_generator));
#ifdef USE_CUDNN
@@@ -42,14 -42,13 +41,12 @@@
CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
}
#endif
- delete pool;
++ delete pool;
}
--CudaGPU::CudaGPU(int id, int num_executors,
-- string scheduler, string vm)
++CudaGPU::CudaGPU(int id, int num_executors, string scheduler, string vm)
: Device(id, num_executors, scheduler, vm) {
-- if (id == -1)
-- id = FindDevice(0);
++ if (id == -1) id = FindDevice(0);
lang_ = kCuda;
ctx_.stream = NULL; // use the default sync stream
// TODO(wangwei) create one handle for each steam?
@@@ -68,62 -67,20 +65,57 @@@
auto status = cudnnCreate(&ctx_.cudnn_handle);
CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
#endif // USE_CUDNN
-
- // initialize cnmem memory management as default
- pool = new CnMemPool();
- ((CnMemPool*)pool)->InitPool();
++
++ // initialize cnmem memory management as default
++ pool = new CnMemPool();
++ ((CnMemPool*)pool)->InitPool();
}
- CudaGPU::CudaGPU(const MemPoolConf& mem_conf,int id, int num_executors,
- string scheduler)
-void CudaGPU::SetRandSeed(unsigned seed) {
- CHECK(ctx_.curand_generator);
++CudaGPU::CudaGPU(const MemPoolConf& mem_conf, int id, int num_executors,
++ string scheduler)
+ : Device(id, num_executors, scheduler, "gc-only") {
- if (id == -1)
- id = FindDevice(0);
++ if (id == -1) id = FindDevice(0);
+ lang_ = kCuda;
+ ctx_.stream = NULL; // use the default sync stream
+ // TODO(wangwei) create one handle for each stream?
+ CUDA_CHECK(cudaSetDevice(FindDevice(0)));
+ // use curandCreateGeneratorHost for CudaHost device
CURAND_CHECK(
- curandSetPseudoRandomGeneratorSeed(ctx_.curand_generator, seed));
+ curandCreateGenerator(&ctx_.curand_generator, CURAND_RNG_PSEUDO_DEFAULT));
+ auto seed = std::chrono::system_clock::now().time_since_epoch().count();
+ SetRandSeed(seed);
+ // TODO(wangwei) if one generator per stream, then need diff offset per gen?
+ CURAND_CHECK(curandSetGeneratorOffset(ctx_.curand_generator, 0));
+ CUBLAS_CHECK(cublasCreate(&(ctx_.cublas_handle)));
+
+#ifdef USE_CUDNN
+ // TODO(wangwei) create one handle for each stream?
+ auto status = cudnnCreate(&ctx_.cudnn_handle);
+ CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
+#endif // USE_CUDNN
+
- // initialize memory management for cuda devices
- string memoryPoolType = mem_conf.type();
- if(memoryPoolType.compare("cnmem") == 0) {
- pool = new CnMemPool();
- int num_devices = mem_conf.num_devices();
- size_t alloc_size = mem_conf.alloc_size();
- unsigned flag = mem_conf.cnmemflag();
- ((CnMemPool*)pool)->InitPool(num_devices, alloc_size, flag);
- }
- else {
- pool = new CudaMemPool();
- }
++ // initialize memory management for cuda devices
++ string memoryPoolType = mem_conf.type();
++ if (memoryPoolType.compare("cnmem") == 0) {
++ pool = new CnMemPool();
++ int num_devices = mem_conf.num_devices();
++ size_t alloc_size = mem_conf.alloc_size();
++ unsigned flag = mem_conf.cnmemflag();
++ ((CnMemPool*)pool)->InitPool(num_devices, alloc_size, flag);
++ } else {
++ pool = new CudaMemPool();
++ }
}
-void CudaGPU::DoExec(function<void(Context*)>&& fn, int executor) {
- fn(&ctx_);
+void CudaGPU::SetRandSeed(unsigned seed) {
+ CHECK(ctx_.curand_generator);
- CURAND_CHECK(
- curandSetPseudoRandomGeneratorSeed(ctx_.curand_generator, seed));
++ CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(ctx_.curand_generator, seed));
}
- void CudaGPU::DoExec(function<void(Context*)>&& fn, int executor) {
- fn(&ctx_);
- }
++void CudaGPU::DoExec(function<void(Context*)>&& fn, int executor) { fn(&ctx_); }
+
void CudaGPU::CopyToFrom(void* dst, const void* src, size_t nBytes,
-- CopyDirection direction, Context* ctx) {
++ CopyDirection direction, Context* ctx) {
cudaMemcpy(dst, src, nBytes, copyKind[direction]);
// TODO(wangwei) use async copy
// cudaMemcpyAsync(dst, src, nBytes,cudaMemcpyDefault, ctx_.stream);
@@@ -133,22 -90,19 +125,21 @@@
void* CudaGPU::Malloc(int size) {
void* ptr = nullptr;
if (size > 0) {
- //CUDA_CHECK(cudaMalloc((void**)&ptr,size));
- pool->Malloc((void**)&ptr,size);
- CUDA_CHECK(cudaMalloc(&ptr, size));
++ // CUDA_CHECK(cudaMalloc((void**)&ptr,size));
++ pool->Malloc((void**)&ptr, size);
CUDA_CHECK(cudaMemset(ptr, 0, size));
}
return ptr;
}
-- /// Free cpu memory.
++/// Return device memory to the memory pool.
void CudaGPU::Free(void* ptr) {
- if (ptr != nullptr)
- CUDA_CHECK(cudaFree(ptr));
+ if (ptr != nullptr) {
- //CUDA_CHECK(cudaFree(ptr));
- pool->Free(ptr);
- }
++ // CUDA_CHECK(cudaFree(ptr));
++ pool->Free(ptr);
++ }
}
--
// ==========Following code is from Caffe src/caffe/common.cpp=================
void CudaGPU::DeviceQuery() {
@@@ -169,20 -123,20 +160,18 @@@
LOG(INFO) << "Warp size: " << prop.warpSize;
LOG(INFO) << "Maximum memory pitch: " << prop.memPitch;
LOG(INFO) << "Maximum threads per block: " << prop.maxThreadsPerBlock;
-- LOG(INFO) << "Maximum dimension of block: "
-- << prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", "
-- << prop.maxThreadsDim[2];
-- LOG(INFO) << "Maximum dimension of grid: "
-- << prop.maxGridSize[0] << ", " << prop.maxGridSize[1] << ", "
-- << prop.maxGridSize[2];
++ LOG(INFO) << "Maximum dimension of block: " << prop.maxThreadsDim[0]
++ << ", " << prop.maxThreadsDim[1] << ", " << prop.maxThreadsDim[2];
++ LOG(INFO) << "Maximum dimension of grid: " << prop.maxGridSize[0] << ", "
++ << prop.maxGridSize[1] << ", " << prop.maxGridSize[2];
LOG(INFO) << "Clock rate: " << prop.clockRate;
LOG(INFO) << "Total constant memory: " << prop.totalConstMem;
LOG(INFO) << "Texture alignment: " << prop.textureAlignment;
-- LOG(INFO) << "Concurrent copy and execution: "
-- << (prop.deviceOverlap ? "Yes" : "No");
++ LOG(INFO) << "Concurrent copy and execution: " << (prop.deviceOverlap ? "Yes"
++ : "No");
LOG(INFO) << "Number of multiprocessors: " << prop.multiProcessorCount;
LOG(INFO) << "Kernel execution timeout: "
-- << (prop.kernelExecTimeoutEnabled ? "Yes" : "No");
++ << (prop.kernelExecTimeoutEnabled ? "Yes" : "No");
return;
}
@@@ -203,6 -157,6 +192,5 @@@ int CudaGPU::FindDevice(const int start
return -1;
}
--
} // namespace singa
#endif // USE_CUDA
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/core/device/device.cc
----------------------------------------------------------------------
diff --cc src/core/device/device.cc
index 1889339,36381e4..6775e40
--- a/src/core/device/device.cc
+++ b/src/core/device/device.cc
@@@ -22,11 -22,11 +22,11 @@@ namespace singa
Device::Device(int id, int num_executors, string scheduler, string vm)
: id_(id), num_executors_(num_executors) {
// TODO(wangwei) create scheduler and vm.
- host_ = &defaultDevice;
+ host_ = defaultDevice;
}
- void Device::Exec(function<void(Context*)>&& fn, const vector<Blob*> read_blobs,
- const vector<Blob*> write_blobs, bool use_rand_generator) {
+ void Device::Exec(function<void(Context*)>&& fn, const vector<Block*> read_blocks,
+ const vector<Block*> write_blocks, bool use_rand_generator) {
// TODO(wangwei) execute operations scheduled by the scheduler.
DoExec(std::move(fn), 0);
}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/core/memory/memory.cc
----------------------------------------------------------------------
diff --cc src/core/memory/memory.cc
index 304c101,a1cf5db..7ac6792
--- a/src/core/memory/memory.cc
+++ b/src/core/memory/memory.cc
@@@ -16,71 -16,5 +16,74 @@@
* limitations under the License.
*/
--
#include "singa/core/memory.h"
+#include "singa/utils/logging.h"
++#include "singa/proto/core.pb.h"
+#include <iostream>
+
++#ifdef USE_CUDA
+namespace singa {
-
+bool singa::CnMemPool::initialized = false;
+std::mutex singa::CnMemPool::mtx;
-
+void CnMemPool::InitPool(int numDevices, size_t initSize, unsigned flag) {
- mtx.lock();
- if(!initialized) {
- CHECK_GE(numDevices, 1);
- cnmemDevice_t* settingPtr = new cnmemDevice_t[numDevices];
- for(int i = 0; i < numDevices; i++) {
- settingPtr[i].device = i;
- settingPtr[i].size = initSize;
- settingPtr[i].numStreams = 0;
- settingPtr[i].streams = NULL;
- settingPtr[i].streamSizes = 0;
- }
- cnmemStatus_t status = cnmemInit(numDevices, settingPtr, flag);
- CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS) << " " << cnmemGetErrorString(status);
- delete[] settingPtr;
- initialized = true;
- }
- mtx.unlock();
++ mtx.lock();
++ const size_t kNBytesPerMB = (1u << 20);
++ if (!initialized) {
++ CHECK_GE(numDevices, 1);
++ cnmemDevice_t* settingPtr = new cnmemDevice_t[numDevices];
++ for (int i = 0; i < numDevices; i++) {
++ settingPtr[i].device = i;
++ settingPtr[i].size = initSize * kNBytesPerMB;
++ settingPtr[i].numStreams = 0;
++ settingPtr[i].streams = NULL;
++ settingPtr[i].streamSizes = 0;
++ }
++ cnmemStatus_t status = cnmemInit(numDevices, settingPtr, flag);
++ CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS)
++ << " " << cnmemGetErrorString(status);
++ delete[] settingPtr;
++ initialized = true;
++ }
++ mtx.unlock();
+}
+
+void CnMemPool::InitPool() {
- int defaultNumDevices = 1;
- size_t defaultSize = 1000000U;
- InitPool(defaultNumDevices,defaultSize,cnmemManagerFlags_t::CNMEM_FLAGS_DEFAULT);
++ MemPoolConf conf;
++ InitPool(conf.num_devices(), conf.alloc_size(),
++ cnmemManagerFlags_t::CNMEM_FLAGS_DEFAULT);
+}
+
+CnMemPool::~CnMemPool() {
- mtx.lock();
- if(initialized) {
- cnmemStatus_t status = cnmemFinalize();
- CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS) << " " << cnmemGetErrorString(status);
- initialized = false;
- }
- mtx.unlock();
++ mtx.lock();
++ if (initialized) {
++ cnmemStatus_t status = cnmemFinalize();
++ CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS)
++ << " " << cnmemGetErrorString(status);
++ initialized = false;
++ }
++ mtx.unlock();
+}
+
-
+void CnMemPool::Malloc(void** ptr, const size_t size) {
- cnmemStatus_t status = cnmemMalloc(ptr,size,NULL);
- CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS) << " " << cnmemGetErrorString(status);
++ cnmemStatus_t status = cnmemMalloc(ptr, size, NULL);
++ CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS)
++ << " " << cnmemGetErrorString(status);
+}
+
+void CnMemPool::Free(void* ptr) {
- cnmemStatus_t status = cnmemFree(ptr,NULL);
- CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS) << " " << cnmemGetErrorString(status);
++ cnmemStatus_t status = cnmemFree(ptr, NULL);
++ CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS)
++ << " " << cnmemGetErrorString(status);
+}
+
+void CudaMemPool::Malloc(void** ptr, const size_t size) {
- cudaError_t status = cudaMalloc(ptr,size);
- CHECK_EQ(status, cudaError_t::cudaSuccess);
++ cudaError_t status = cudaMalloc(ptr, size);
++ CHECK_EQ(status, cudaError_t::cudaSuccess);
+}
+
+void CudaMemPool::Free(void* ptr) {
- cudaError_t status = cudaFree(ptr);
- CHECK_EQ(status, cudaError_t::cudaSuccess);
++ cudaError_t status = cudaFree(ptr);
++ CHECK_EQ(status, cudaError_t::cudaSuccess);
+}
-
+}
++#endif
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/core/tensor/math_kernel.cu
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/core/tensor/math_kernel.h
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --cc src/core/tensor/tensor.cc
index a5b43d8,9b3eeff..b852a54
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@@ -25,54 -25,66 +25,65 @@@
namespace singa {
Tensor::~Tensor() {
- if (blob_ != nullptr && blob_->DecRefCount() == 0)
- device_->FreeBlob(blob_);
- blob_ = nullptr;
- // LOG(ERROR) << "~";
+ if (block_ != nullptr && block_->DecRefCount() == 0)
+ device_->FreeBlock(block_);
+ block_ = nullptr;
}
-Tensor::Tensor() { device_ = &defaultDevice; }
+Tensor::Tensor() { device_ = defaultDevice; }
-Tensor::Tensor(const Shape &shape, const DataType dtype)
- : data_type_(dtype), device_(&defaultDevice), shape_(shape) {
- device_ = &defaultDevice;
+Tensor::Tensor(const Shape &shape, DataType dtype)
+ : data_type_(dtype), device_(defaultDevice), shape_(shape) {
+ device_ = defaultDevice;
- blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
+ block_ = device_->NewBlock(Product(shape_) * SizeOf(data_type_));
}
-Tensor::Tensor(Shape &&shape, const DataType dtype)
- : data_type_(dtype), device_(&defaultDevice), shape_(shape) {
- device_ = &defaultDevice;
+Tensor::Tensor(Shape &&shape, DataType dtype)
+ : data_type_(dtype), device_(defaultDevice), shape_(shape) {
+ device_ = defaultDevice;
- blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
+ block_ = device_->NewBlock(Product(shape_) * SizeOf(data_type_));
}
-Tensor::Tensor(const Shape &shape, Device *device, const DataType dtype)
+Tensor::Tensor(const Shape &shape, std::shared_ptr<Device> device, DataType dtype)
: data_type_(dtype), device_(device), shape_(shape) {
- blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
+ block_ = device_->NewBlock(Product(shape_) * SizeOf(data_type_));
}
-Tensor::Tensor(Shape &&shape, Device *device, const DataType dtype)
+Tensor::Tensor(Shape &&shape, std::shared_ptr<Device> device, DataType dtype)
: data_type_(dtype), device_(device), shape_(shape) {
- blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
- }
- Tensor::Tensor(const Tensor &t)
- : transpose_(t.transpose_), data_type_(t.data_type_), device_(t.device_),
- blob_(t.blob()), shape_(t.shape_) {
- blob_->IncRefCount();
- // LOG(ERROR) << "const&";
- }
-
- Tensor::Tensor(Tensor &&t)
- : transpose_(t.transpose_), data_type_(t.data_type_), device_(t.device_),
- shape_(std::move(t.shape_)) {
- blob_ = t.blob_;
- t.blob_ = nullptr;
- // LOG(ERROR) << "&&";
- }
-
- void Tensor::ResetLike(const Tensor &t) {
- if (blob_ == nullptr || device_ != t.device_ || MemSize() != t.MemSize()) {
- if (blob_ != nullptr && blob_->DecRefCount() == 0)
- device_->FreeBlob(blob_);
- shape_ = t.shape_;
- device_ = t.device_;
- data_type_ = t.data_type_;
- blob_ = device_->NewBlob(t.MemSize());
+ block_ = device_->NewBlock(Product(shape_) * SizeOf(data_type_));
+ }
+ Tensor::Tensor(const Tensor &in)  // Copy constructor: shares in's Block instead of copying data.
+ : transpose_(in.transpose_),
+ data_type_(in.data_type_),
+ device_(in.device_),
+ block_(in.block()),  // same Block pointer as `in`; no NewBlock() call is made
+ shape_(in.shape_) {
+ if (block_ != nullptr) block_->IncRefCount();  // `in` may hold no block (e.g. default-constructed), so guard before counting this holder
+ }
+
+ Tensor::Tensor(Tensor &&in)  // Move constructor: takes over in's Block pointer.
+ : transpose_(in.transpose_),
+ data_type_(in.data_type_),
+ device_(in.device_),
+ shape_(std::move(in.shape_)) {
+ block_ = in.block_;  // pointer is transferred as-is; no IncRefCount/DecRefCount call is made
+ in.block_ = nullptr;  // ~Tensor skips a null block_, so `in` will not release the transferred block
+ }
+
+ void Tensor::SetBlock(Block* block) {  // Rebinds this tensor to `block`, dropping its reference on the current block.
+ LOG(WARNING) << "Pls avoid using this function, which may have side-effect.";
+ if (block_ != nullptr)
+ if (block_->DecRefCount() == 0)  // free only when this was the last holder, matching ~Tensor and ResetLike
+ device_->FreeBlock(block_);
+ block_ = block;  // the incoming block's reference count is not modified here
+ }
+
+ void Tensor::ResetLike(const Tensor &in) {
+ if (block_ == nullptr || device_ != in.device_ || MemSize() != in.MemSize()) {
+ if (block_ != nullptr && block_->DecRefCount() == 0)
+ device_->FreeBlock(block_);
+ shape_ = in.shape_;
+ device_ = in.device_;
+ data_type_ = in.data_type_;
+ block_ = device_->NewBlock(in.MemSize());
}
}
@@@ -228,13 -245,13 +244,13 @@@ void CopyDataToFrom(Tensor *dst, const
auto width = SizeOf(src.data_type());
CHECK_EQ(width, SizeOf(dst->data_type()));
size_t nBytes = num * width;
- dst_offset *= width;
- src_offset *= width;
- CHECK_GE(src.MemSize(), src_offset + nBytes);
- CHECK_GE(dst->MemSize(), dst_offset + nBytes);
+ auto d_offset = dst_offset * width;
+ auto s_offset = src_offset * width;
+ CHECK_GE(src.MemSize(), s_offset + nBytes);
+ CHECK_GE(dst->MemSize(), d_offset + nBytes);
- Device *src_dev = src.device(), *dst_dev = dst->device();
+ std::shared_ptr<Device> src_dev = src.device(), dst_dev = dst->device();
- Blob *from = src.blob(), *to = dst->blob();
+ Block *from = src.block(), *to = dst->block();
if (dst_dev->lang() != src_dev->lang()) {
// let the none cpp device conduct copy op
if (dst_dev->lang() == kCpp) {
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/batchnorm.cc
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/batchnorm.h
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_activation.cc
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_activation.h
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_batchnorm.cc
----------------------------------------------------------------------
diff --cc src/model/layer/cudnn_batchnorm.cc
index 8288a41,0e597fe..a1e9e50
--- a/src/model/layer/cudnn_batchnorm.cc
+++ b/src/model/layer/cudnn_batchnorm.cc
@@@ -30,7 -30,7 +30,7 @@@ CudnnBatchNorm::~CudnnBatchNorm()
}
}
--void CudnnBatchNorm::ToDevice(Device* device) {
++void CudnnBatchNorm::ToDevice(std::shared_ptr<Device> device) {
BatchNorm::ToDevice(device);
resultSaveMean_.ToDevice(device);
resultSaveVariance_.ToDevice(device);
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_batchnorm.h
----------------------------------------------------------------------
diff --cc src/model/layer/cudnn_batchnorm.h
index 8598b65,36dbbce..4f46452
--- a/src/model/layer/cudnn_batchnorm.h
+++ b/src/model/layer/cudnn_batchnorm.h
@@@ -29,31 -29,29 +29,29 @@@
namespace singa {
class CudnnBatchNorm : public BatchNorm {
public:
- ~CudnnBatchNorm();
- /// \copy doc Layer::layer_type()
- const std::string layer_type() const override {
- return "CudnnBatchNorm";
- }
+ ~CudnnBatchNorm();
+ /// \copy doc Layer::layer_type()
+ const std::string layer_type() const override { return "CudnnBatchNorm"; }
- void Setup(const LayerConf& conf) override;
+ void Setup(const Shape& in_sample, const LayerConf& conf) override;
- const Tensor Forward(int flag, const Tensor& input)
- override;
- const std::pair<Tensor, vector<Tensor>> Backward(
- int flag, const Tensor& grad) override;
+ const Tensor Forward(int flag, const Tensor& input) override;
+ const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+ const Tensor& grad) override;
- void ToDevice(Device* device) override;
++ void ToDevice(std::shared_ptr<Device> device) override;
- /// Init cudnn related data structures.
- void InitCudnn(const Shape& shape, DataType dtype);
- void ToDevice(Device* device) override;
+ private:
+ /// Init cudnn related data structures.
+ void InitCudnn(const Shape& shape, DataType dtype);
private:
- bool has_init_cudnn_ = false;
- cudnnBatchNormMode_t mode_;
- cudnnLRNDescriptor_t lrn_desc_;
- cudnnTensorDescriptor_t shape_desc_, param_desc_;
- Tensor resultSaveMean_, resultSaveVariance_;
-
- }; // class CudnnBatchNorm
+ bool has_init_cudnn_ = false;
+ cudnnBatchNormMode_t mode_;
+ cudnnLRNDescriptor_t lrn_desc_ = nullptr;
+ cudnnTensorDescriptor_t shape_desc_ = nullptr, param_desc_ = nullptr;
+ Tensor resultSaveMean_, resultSaveVariance_;
+
+ }; // class CudnnBatchNorm
} // namespace
#endif // USE_CUDNN
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_convolution.cc
----------------------------------------------------------------------
diff --cc src/model/layer/cudnn_convolution.cc
index b80c3bd,8cdfc07..d5ac2a3
--- a/src/model/layer/cudnn_convolution.cc
+++ b/src/model/layer/cudnn_convolution.cc
@@@ -46,7 -46,7 +46,7 @@@ void CudnnConvolution::Setup(const Shap
"limited_workspace, no_workspace and autotune";
}
--void CudnnConvolution::ToDevice(Device *device) {
++void CudnnConvolution::ToDevice(std::shared_ptr<Device> device) {
weight_.ToDevice(device);
bias_.ToDevice(device);
workspace_.ToDevice(device);
@@@ -55,7 -55,7 +55,7 @@@
void CudnnConvolution::InitCudnn(const Tensor &input) {
CHECK(!has_init_cudnn_);
DataType dtype = input.data_type();
-- Device *dev = input.device();
++ auto dev = input.device();
Context *ctx = dev->context(0);
size_t batchsize = input.shape(0);
CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
@@@ -161,7 -161,7 +161,7 @@@ const Tensor CudnnConvolution::Forward(
if (flag & kTrain) buf_.push(input); // buffer the input for backward
size_t batchsize = input.shape()[0];
DataType dtype = input.data_type();
-- Device *dev = input.device();
++ auto dev = input.device();
if (!has_init_cudnn_) InitCudnn(input);
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_convolution.h
----------------------------------------------------------------------
diff --cc src/model/layer/cudnn_convolution.h
index 152d797,6c15839..cd0471f
--- a/src/model/layer/cudnn_convolution.h
+++ b/src/model/layer/cudnn_convolution.h
@@@ -41,9 -41,9 +41,9 @@@ class CudnnConvolution : public Convolu
const Tensor &grad) override;
/// \copydoc Layer::Setup(const LayerConf&);
- void Setup(const LayerConf &conf) override;
+ void Setup(const Shape& in_sample, const LayerConf &conf) override;
-- void ToDevice(Device *device) override;
++ void ToDevice(std::shared_ptr<Device> device) override;
size_t workspace_byte_limit() { return workspace_byte_limit_; }
string prefer() { return prefer_; }
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_dropout.cc
----------------------------------------------------------------------
diff --cc src/model/layer/cudnn_dropout.cc
index 64a581b,877dd12..2e2e12b
--- a/src/model/layer/cudnn_dropout.cc
+++ b/src/model/layer/cudnn_dropout.cc
@@@ -34,8 -34,8 +34,8 @@@ CudnnDropout::~CudnnDropout()
if (y_desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(y_desc_));
}
--void CudnnDropout::InitCudnn(int size, DataType dtype, Device* dev,
-- Context* ctx) {
++void CudnnDropout::InitCudnn(int size, DataType dtype,
++ std::shared_ptr<Device> dev, Context* ctx) {
CHECK(!has_init_cudnn_);
CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
@@@ -65,13 -65,11 +65,11 @@@ const Tensor CudnnDropout::Forward(int
if (flag & kTrain) {
auto size = input.Size();
DataType dtype = input.data_type();
-- Device* dev = input.device();
++ auto dev = input.device();
if (!has_init_cudnn_) {
- input.device()->Exec(
- [size, dtype, this, dev](Context* ctx) {
- this->InitCudnn(size, dtype, dev, ctx);
- },
- {}, {this->state_.blob()});
+ input.device()->Exec([size, dtype, this, dev](Context* ctx) {
+ this->InitCudnn(size, dtype, dev, ctx);
+ }, {}, {this->state_.block()});
}
Tensor output;
output.ResetLike(input);
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_dropout.h
----------------------------------------------------------------------
diff --cc src/model/layer/cudnn_dropout.h
index da3d1d2,83572cf..6809653
--- a/src/model/layer/cudnn_dropout.h
+++ b/src/model/layer/cudnn_dropout.h
@@@ -42,8 -42,9 +42,10 @@@ class CudnnDropout : public Dropout
const std::pair<Tensor, vector<Tensor>> Backward(int flag,
const Tensor& grad) override;
+ private:
/// Init cudnn related data structures.
-- void InitCudnn(int size, DataType dtype, Device* dev, Context* ctx);
++ void InitCudnn(int size, DataType dtype, std::shared_ptr<Device> dev,
++ Context* ctx);
private:
bool has_init_cudnn_ = false;
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_lrn.h
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_pooling.cc
----------------------------------------------------------------------
diff --cc src/model/layer/cudnn_pooling.cc
index 842685d,9d288c0..6d7a5b1
--- a/src/model/layer/cudnn_pooling.cc
+++ b/src/model/layer/cudnn_pooling.cc
@@@ -82,7 -82,7 +82,7 @@@ const Tensor CudnnPooling::Forward(int
CHECK_EQ(input.nDim(), 4u);
size_t batchsize = input.shape(0);
DataType dtype = input.data_type();
-- Device *dev = input.device();
++ auto dev = input.device();
if (!has_init_cudnn_) InitCudnn(input);
Shape shape{batchsize, channels_, pooled_height_, pooled_width_};
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_pooling.h
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_softmax.cc
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/dense.cc
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/dense.h
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/dropout.cc
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/dropout.h
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/prelu.cc
----------------------------------------------------------------------
diff --cc src/model/layer/prelu.cc
index 0000000,83a56fa..6eb09d9
mode 000000,100644..100644
--- a/src/model/layer/prelu.cc
+++ b/src/model/layer/prelu.cc
@@@ -1,0 -1,145 +1,145 @@@
+ /**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ #include "singa/model/layer.h"
+ #include "./prelu.h"
+ namespace singa {
+
+ void PReLU::Setup(const Shape& in_sample, const LayerConf &conf) {  // Reads the PReLU options from conf and registers a_ as the layer parameter.
+ Layer::Setup(in_sample, conf);
+ out_sample_shape_ = in_sample;  // output sample shape is the input sample shape
+ channel_shared_ = conf.prelu_conf().channel_shared();
+ format_ = conf.prelu_conf().format();  // compared against "NCHW" / "NHWC" in Forward and Backward
+ // Copy every param spec from conf, then register a_ in param_values_
+ for (const auto &spec : conf.param()) param_specs_.push_back(spec);
+ param_values_.push_back(&a_);  // stores the address of the member a_, not a copy
+ }
+
+ const Tensor PReLU::Forward(int flag, const Tensor &input) {  // Computes input * ((input > 0) + scaled (input <= 0) term); buffers input when flag has kTrain.
+ Tensor output;
+ if (!channel_shared_) {
+ size_t n, c, h, w;  // assigned only inside the 4-D NCHW / NHWC branches below
+ Tensor temp = (input <= 0.f);  // presumably a 0/1 mask of non-positive entries — TODO confirm operator<= semantics
+ if (temp.nDim() == 4) {
+ if (format_ == "NCHW") {
+ n = temp.shape(0);
+ c = temp.shape(1);
+ h = temp.shape(2);
+ w = temp.shape(3);
+ temp.Reshape(Shape{n * c, h * w});  // one row per (sample, channel) pair
+ Tensor temp_a(Shape{n, c}, input.device(), input.data_type());
+ Uniform(1.f, 1.f, &temp_a);  // presumably fills temp_a with 1 (range [1, 1]) — TODO confirm
+ MultRow(a_, &temp_a);  // presumably scales each row of temp_a by a_, giving a_ repeated per sample — TODO confirm
+ temp_a.Reshape(Shape{n * c});
+ MultColumn(temp_a, &temp);  // presumably scales row i of temp by temp_a[i] — TODO confirm
+ } else if (format_ == "NHWC") {
+ n = temp.shape(0);
+ h = temp.shape(1);
+ w = temp.shape(2);
+ c = temp.shape(3);
+ temp.Reshape(Shape{n * h * w, c});  // channel is the last axis, so a_ is applied along the rows directly
+ MultRow(a_, &temp);
+ } else {
+ LOG(FATAL) << "Incorrect input format for prelu layer.";
+ }
+ } else {
+ LOG(FATAL) << "Incorrect input format for prelu layer.";
+ }
+ output = input * ((input > 0.f) + temp);  // NOTE(review): temp is still 2-D here (Backward reshapes back to 4-D) — confirm the ops only need equal sizes
+ } else {
+ // share the first param of Tensor A along all channels
+ LOG(FATAL) << "Not implemented";  // presumably aborts, making the lines below unreachable — TODO confirm
+ // TODO(wangwei) cannot access the data in this way. The data could be on GPU.
+ auto a = a_.data<float>()[0];
+ output = input * ((input > 0.f) + (input <= 0.f) * a);
+ }
+ if (flag & kTrain) buf_.push(input);  // Backward pops this buffered input
+ return output;
+ }
+
+ const std::pair<Tensor, vector<Tensor> > PReLU::Backward(int flag,  // Returns (input_grad, {da}) using the input buffered by Forward.
+ const Tensor &grad) {
+ vector<Tensor> param_grad;
+ CHECK(!buf_.empty());  // requires a preceding Forward call that pushed its input (flag & kTrain)
+ Tensor input_grad, input = buf_.top();
+ buf_.pop();
+ Tensor da;
+ da.ResetLike(a_);  // gradient for a_, given a_'s shape, device and data type
+ if (!channel_shared_) {
+ size_t n, c, h, w;  // assigned only inside the 4-D NCHW / NHWC branches below
+ Tensor temp1 = (input <= 0.f);  // presumably a 0/1 mask of non-positive entries — TODO confirm operator<= semantics
+ if (temp1.nDim() == 4) {
+ if (format_ == "NCHW") {
+ n = temp1.shape(0);
+ c = temp1.shape(1);
+ h = temp1.shape(2);
+ w = temp1.shape(3);
+ temp1.Reshape(Shape{n * c, h * w});  // one row per (sample, channel) pair
+ Tensor temp_a(Shape{n, c}, grad.device(), grad.data_type());
+ Uniform(1.f, 1.f, &temp_a);  // presumably fills temp_a with 1 (range [1, 1]) — TODO confirm
+ MultRow(a_, &temp_a);
+ temp_a.Reshape(Shape{n * c});
+ MultColumn(temp_a, &temp1);
+ temp1.Reshape(Shape{n, c, h, w});  // restore the 4-D shape of input
+ } else if (format_ == "NHWC") {
+ n = temp1.shape(0);
+ h = temp1.shape(1);
+ w = temp1.shape(2);
+ c = temp1.shape(3);
+ temp1.Reshape(Shape{n * h * w, c});
+ MultRow(a_, &temp1);
+ temp1.Reshape(Shape{n, h, w, c});  // restore the 4-D shape of input
+ } else {
+ LOG(FATAL) << "Incorrect input format for prelu layer.";
+ }
+ } else {
+ LOG(FATAL) << "Incorrect input format for prelu layer.";
+ }
+ input_grad = grad * input * ((input > 0.f) + temp1);  // NOTE(review): gradient is also scaled by `input` — confirm this is intended
+ Tensor temp2 = grad * input * (input <= 0.f);  // per-element contribution to da
+ if (format_ == "NCHW") {
+ Tensor temp3(Shape{n * c}, grad.device(), grad.data_type());
+ temp2.Reshape(Shape{n * c, h * w});
+ SumColumns(temp2, &temp3);  // presumably reduces over the h * w axis — TODO confirm
+ temp3.Reshape(Shape{n, c});
+ SumRows(temp3, &da);  // presumably reduces over the n axis, leaving one value per channel — TODO confirm
+ } else if (format_ == "NHWC") {
+ temp2.Reshape(Shape{n * h * w, c});
+ SumRows(temp2, &da);
+ }
+ } else {
+ // share the first param of Tensor A along all channels
+ LOG(FATAL) << "Not Implemented";  // presumably aborts, making the lines below unreachable — TODO confirm
+ // TODO(wangwei) cannot access the data in this way. The data could be on GPU.
+ auto a = a_.data<float>()[0];
+ input_grad = grad * input * ((input > 0.f) + (input <= 0.f) * a);
+ Tensor temp = grad * input * (input <= 0.f);
+ float sum = Sum<float>(temp);
+ Uniform(1.f, 1.f, &da);
+ da *= sum;
+ }
+ param_grad.push_back(da);  // da is the only parameter gradient returned
+ return std::make_pair(input_grad, param_grad);
+ }
+
-void PReLU::ToDevice(Device *device) {
++void PReLU::ToDevice(std::shared_ptr<Device> device) {
+ Layer::ToDevice(device);
+ a_.ToDevice(device);
+ }
+
+ } // namespace singa
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/prelu.h
----------------------------------------------------------------------
diff --cc src/model/layer/prelu.h
index 0000000,ee571e1..70a9dcf
mode 000000,100644..100644
--- a/src/model/layer/prelu.h
+++ b/src/model/layer/prelu.h
@@@ -1,0 -1,66 +1,66 @@@
+ /**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ #ifndef SINGA_MODEL_LAYER_PRELU_H_
+ #define SINGA_MODEL_LAYER_PRELU_H_
+ #include <utility>
+ #include <string>
+ #include <vector>
+ #include "singa/model/layer.h"
-#include "singa_config.h"
++#include "singa/singa_config.h"
+
+ namespace singa {
+ class PReLU : public Layer {
+ public:
+ /// \copydoc Layer::layer_type()
+ const std::string layer_type() const override { return "PReLU"; }
+
+
+ /// \copydoc Layer::Setup(const LayerConf&);
+ void Setup(const Shape& in_sample, const LayerConf& conf) override;
+ const Shape GetOutputSampleShape() const override {
+ CHECK(out_sample_shape_.size()) << "You may haven't call Setup()";
+ return out_sample_shape_;
+ }
+
+ /// \copydoc Layer::Forward(int flag, const Tensor&)
+ const Tensor Forward(int flag, const Tensor &input) override;
+
+ /// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
+ const std::pair<Tensor, vector<Tensor> > Backward(
+ int flag, const Tensor &grad) override;
+
- void ToDevice(Device *device);
++ void ToDevice(std::shared_ptr<Device> device);
+
+ const bool Channel_shared() const { return channel_shared_; }
+ const Tensor A() const { return a_; }
+ const std::string Format() const { return format_; }
+
+ void Set_a(Tensor a) {
+ a_.ResetLike(a);
+ a_.CopyData(a);
+ }
+
+ protected:
+ bool channel_shared_;
+ std::string format_; // format_ has two valid value, i.e. NCHW, NHWC
+ Tensor a_; // shape of a_ is 2D, i.e. (channels, 1)
+ std::stack<Tensor> buf_;
+ Shape out_sample_shape_;
+ };
+ } // namespace singa
+ #endif // SINGA_MODEL_LAYER_PRELU_H_
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/proto/core.proto
----------------------------------------------------------------------
diff --cc src/proto/core.proto
index cf6e193,3031359..b853b30
--- a/src/proto/core.proto
+++ b/src/proto/core.proto
@@@ -44,16 -45,3 +45,16 @@@ enum CopyDirection
kDeviceToDevice = 3;
kNumDirection = 4;
}
+
+// configuration for device memory pool
+message MemPoolConf {
+ optional string type = 1 [default = "cnmem"];
+ optional uint32 num_devices = 2 [default = 1];
- // allocation size for each device
- optional uint32 alloc_size = 3 [default = 10000000];
++ // allocation size for each device, default is 256 MB
++ optional uint32 alloc_size = 3 [default = 256];
+ // memory manager flag for cnmem
+ // cnmemflag = 0: default flag
+ // cnmemflag = 1: prevent the manager from growing its memory consumption
+ // cnmemflag = 2: prevent the manager from stealing memory
+ optional uint32 cnmemflag = 4 [default = 0];
+}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_adagrad.cc
----------------------------------------------------------------------
diff --cc test/singa/test_adagrad.cc
index 0000000,642e929..c45dcef
mode 000000,100644..100644
--- a/test/singa/test_adagrad.cc
+++ b/test/singa/test_adagrad.cc
@@@ -1,0 -1,96 +1,96 @@@
+ /************************************************************
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *************************************************************/
+
+ #include "gtest/gtest.h"
+ #include "singa/model/optimizer.h"
-#include "singa_config.h"
++#include "singa/singa_config.h"
+ #include <cmath>
+
+ TEST(Adagrad, ApplyCPU) {  // Applies Adagrad twice with the same gradient and checks both updates against a host-side formula.
+ singa::Adagrad adagrad;
+ float lr = 0.1f;
+ const float v[4] = {0.1, 0.2, 0.3, 0.4};  // initial parameter values
+ const float g[4] = {0.01, 0.02, 0.03, 0.04};  // gradient used for both steps
+
+ singa::Tensor value(singa::Shape{4}), grad(singa::Shape{4});
+ value.CopyDataFromHostPtr(v, 4);
+ grad.CopyDataFromHostPtr(g, 4);
+
+ singa::OptimizerConf conf;  // default-constructed; only conf.delta() is read by the expectations below
+ adagrad.Setup(conf);
+ adagrad.Apply(0, lr, "xx", grad, &value);
+
+ singa::Tensor v1 = value.Clone();
+ const float* newv1 = v1.data<float>();
+ float history[4];
+ for (int i = 0; i < 4; ++i) history[i] = g[i] * g[i];  // expected squared-gradient accumulator after step 0
+ for (int i = 0; i < 4; ++i)
+ EXPECT_NEAR(newv1[i], v[i] - lr * g[i] / sqrt(history[i] + conf.delta()),
+ 1e-5);
+
+ grad.CopyDataFromHostPtr(g, 4);  // reload g before the second step
+ adagrad.Apply(1, lr, "xx", grad, &value);
+ singa::Tensor v2 = value.Clone();
+ const float* newv2 = v2.data<float>();
+ for (int i = 0; i < 4; ++i) history[i] += g[i] * g[i];  // accumulator now holds both steps
+
+ for (int i = 0; i < 4; ++i)
+ EXPECT_NEAR(newv2[i],
+ newv1[i] - lr * g[i] / sqrt(history[i] + conf.delta()), 1e-5);
+ }
+
+ #ifdef USE_CUDA
+ TEST(Adagrad, ApplyCUDA) {
+ singa::Adagrad adagrad;
+ float lr = 0.1f;
+ const float v[4] = {0.1, 0.2, 0.3, 0.4};
+ const float g[4] = {0.01, 0.02, 0.03, 0.04};
+
- singa::CudaGPU dev;
- singa::Tensor value(singa::Shape{4}, &dev), grad(singa::Shape{4}, &dev);
++ auto dev = std::make_shared<singa::CudaGPU>();
++ singa::Tensor value(singa::Shape{4}, dev), grad(singa::Shape{4}, dev);
+ value.CopyDataFromHostPtr(v, 4);
+ grad.CopyDataFromHostPtr(g, 4);
+
+ singa::OptimizerConf conf;
+ adagrad.Setup(conf);
+ adagrad.Apply(0, lr, "xx", grad, &value);
+
+ singa::Tensor v1 = value.Clone();
+ v1.ToHost();
+ const float* newv1 = v1.data<float>();
+ float history[4];
+ for (int i = 0; i < 4; ++i) history[i] = g[i] * g[i];
+ for (int i = 0; i < 4; ++i)
+ EXPECT_NEAR(newv1[i], v[i] - lr * g[i] / sqrt(history[i] + conf.delta()),
+ 1e-5);
+
+ grad.CopyDataFromHostPtr(g, 4);
+ adagrad.Apply(1, lr, "xx", grad, &value);
+ singa::Tensor v2 = value.Clone();
+ v2.ToHost();
+ const float* newv2 = v2.data<float>();
+ for (int i = 0; i < 4; ++i) history[i] += g[i] * g[i];
+
+ for (int i = 0; i < 4; ++i)
+ EXPECT_FLOAT_EQ(newv2[i],
+ newv1[i] - lr * g[i] / sqrt(history[i] + conf.delta()));
+ }
+ #endif
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_cross_entropy.cc
----------------------------------------------------------------------
diff --cc test/singa/test_cross_entropy.cc
index 0000000,ce60f7c..d73591f
mode 000000,100644..100644
--- a/test/singa/test_cross_entropy.cc
+++ b/test/singa/test_cross_entropy.cc
@@@ -1,0 -1,116 +1,116 @@@
+ /************************************************************
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *************************************************************/
+
+ #include "gtest/gtest.h"
+ #include "singa/core/tensor.h"
+ #include "singa/core/device.h"
+ #include "singa/model/loss.h"
-#include "singa_config.h"
++#include "singa/singa_config.h"
+
+ using singa::Tensor;
+ class TestSoftmaxCrossEntropy : public ::testing::Test {
+ protected:
+ virtual void SetUp() {
+ p.Reshape(singa::Shape{2, 4});
+ t.Reshape(singa::Shape{2, 1});
+ }
+ const float pdat[8] = {0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1};
+ const int tdat[2] = {0, 2};
+
+ singa::Tensor p, t;
+ };
+
+ TEST_F(TestSoftmaxCrossEntropy, CppForward) {
+ p.CopyDataFromHostPtr(pdat, 8);
+ t.AsType(singa::kInt);
+ t.CopyDataFromHostPtr(tdat, 2);
+
+ singa::SoftmaxCrossEntropy cross_entropy;
+ const Tensor& loss = cross_entropy.Forward(p, t);
+ auto ldat = loss.data<float>();
+
+ const float result_test = -log(0.25);
+ EXPECT_FLOAT_EQ(ldat[0], result_test);
+ EXPECT_FLOAT_EQ(ldat[1], result_test);
+ }
+
+ TEST_F(TestSoftmaxCrossEntropy, CppBackward) {
+ p.CopyDataFromHostPtr(pdat, 8);
+ t.AsType(singa::kInt);
+ t.CopyDataFromHostPtr(tdat, 2);
+
+ singa::SoftmaxCrossEntropy cross_entropy;
+ cross_entropy.Forward(p, t);
+ const Tensor& grad = cross_entropy.Backward();
+
+ auto gdat = grad.data<float>();
+ EXPECT_FLOAT_EQ(gdat[0], -0.75);
+ EXPECT_FLOAT_EQ(gdat[1], 0.25);
+ EXPECT_FLOAT_EQ(gdat[2], 0.25);
+ EXPECT_FLOAT_EQ(gdat[3], 0.25);
+ EXPECT_FLOAT_EQ(gdat[4], 0.25);
+ EXPECT_FLOAT_EQ(gdat[5], 0.25);
+ EXPECT_FLOAT_EQ(gdat[6], -0.75);
+ EXPECT_FLOAT_EQ(gdat[7], 0.25);
+ }
+
+ #ifdef USE_CUDA
+
+ TEST_F(TestSoftmaxCrossEntropy, CudaForward) {
+ singa::SoftmaxCrossEntropy cross_entropy;
- singa::CudaGPU dev;
- p.ToDevice(&dev);
- t.ToDevice(&dev);
++ auto dev = std::make_shared<singa::CudaGPU>();
++ p.ToDevice(dev);
++ t.ToDevice(dev);
+ p.CopyDataFromHostPtr(pdat, 8);
+ t.CopyDataFromHostPtr(tdat, 2);
+
+ Tensor loss = cross_entropy.Forward(p, t);
+ loss.ToHost();
+ auto ldat = loss.data<float>();
+
+ const float result_test = -log(0.25);
+ EXPECT_FLOAT_EQ(ldat[0], result_test);
+ EXPECT_FLOAT_EQ(ldat[1], result_test);
+ }
+
+ TEST_F(TestSoftmaxCrossEntropy, CudaBackward) {
+ singa::SoftmaxCrossEntropy cross_entropy;
- singa::CudaGPU dev;
- p.ToDevice(&dev);
- t.ToDevice(&dev);
++ auto dev = std::make_shared<singa::CudaGPU>();
++ p.ToDevice(dev);
++ t.ToDevice(dev);
+ p.CopyDataFromHostPtr(pdat, 8);
+ t.CopyDataFromHostPtr(tdat, 2);
+
+ cross_entropy.Forward(p, t);
+ Tensor grad = cross_entropy.Backward();
+
+ grad.ToHost();
+ auto gdat = grad.data<float>();
+ EXPECT_FLOAT_EQ(gdat[0], -0.75);
+ EXPECT_FLOAT_EQ(gdat[1], 0.25);
+ EXPECT_FLOAT_EQ(gdat[2], 0.25);
+ EXPECT_FLOAT_EQ(gdat[3], 0.25);
+ EXPECT_FLOAT_EQ(gdat[4], 0.25);
+ EXPECT_FLOAT_EQ(gdat[5], 0.25);
+ EXPECT_FLOAT_EQ(gdat[6], -0.75);
+ EXPECT_FLOAT_EQ(gdat[7], 0.25);
+ }
+ #endif // USE_CUDA
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_cudnn_activation.cc
----------------------------------------------------------------------
diff --cc test/singa/test_cudnn_activation.cc
index bed7715,940c6b9..1a619e7
--- a/test/singa/test_cudnn_activation.cc
+++ b/test/singa/test_cudnn_activation.cc
@@@ -46,8 -46,8 +46,8 @@@ TEST(TCudnnActivation, Setup)
TEST(TCudnnActivation, Forward) {
const float x[] = {1.0f, 2.0f, 3.0f, -2.0f, -3.0f, -4.0};
size_t n = sizeof(x) / sizeof(float);
-- singa::CudaGPU cuda(0, 1);
-- singa::Tensor in(singa::Shape{n}, &cuda);
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor in(singa::Shape{n}, cuda);
in.CopyDataFromHostPtr<float>(x, n);
float neg_slope = 0.5f;
@@@ -66,9 -65,9 +65,8 @@@
singa::Tensor out = acti.Forward(singa::kTrain, in);
EXPECT_EQ(n, out.Size());
-- singa::CppCPU host(0, 1);
-- out.ToDevice(&host);
- const float* yptr = out.data<const float*>();
++ out.ToHost();
+ const float* yptr = out.data<float>();
float* y = new float[n];
if (acti.Mode() == "SIGMOID") {
for (size_t i = 0; i < n; i++) y[i] = 1.f / (1.f + exp(-x[i]));
@@@ -87,8 -86,8 +85,8 @@@
TEST(TCudnnActivation, Backward) {
const float x[] = {2.0f, 3.0f, 3.0f, 7.f, 0.0f, 5.0, 1.5, 2.5, -2.5, 1.5};
size_t n = sizeof(x) / sizeof(float);
-- singa::CudaGPU cuda(0, 1);
-- singa::Tensor in(singa::Shape{n}, &cuda);
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor in(singa::Shape{n}, cuda);
in.CopyDataFromHostPtr<float>(x, n);
float neg_slope = 0.5f;
std::string types[] = {"SIGMOID", "TANH", "RELU"};
@@@ -101,22 -100,21 +99,20 @@@
singa::ReLUConf* reluconf = conf.mutable_relu_conf();
reluconf->set_negative_slope(neg_slope);
}
- acti.Setup(conf);
- acti.InitCudnn(n, singa::kFloat32);
+ acti.Setup(Shape{n}, conf);
singa::Tensor out = acti.Forward(singa::kTrain, in);
EXPECT_EQ(n, out.Size());
-- singa::CppCPU host(0, 1);
-- out.ToDevice(&host);
- const float* yptr = out.data<const float*>();
++ out.ToHost();
+ const float* yptr = out.data<float>();
const float grad[] = {2.0f, 1.0f, 2.0f, 0.0f, -2.0f,
-1.0, 1.5, 2.5, -1.5, -2.5};
-- singa::Tensor out_diff(singa::Shape{n}, &cuda);
++ singa::Tensor out_diff(singa::Shape{n}, cuda);
out_diff.CopyDataFromHostPtr<float>(grad, n);
const auto ret = acti.Backward(singa::kTrain, out_diff);
singa::Tensor in_diff = ret.first;
-- in_diff.ToDevice(&host);
- const float* xptr = in_diff.data<const float*>();
++ in_diff.ToHost();
+ const float* xptr = in_diff.data<float>();
float* dx = new float[n];
if (acti.Mode() == "SIGMOID") {
for (size_t i = 0; i < n; i++) dx[i] = grad[i] * yptr[i] * (1. - yptr[i]);
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_cudnn_batchnorm.cc
----------------------------------------------------------------------
diff --cc test/singa/test_cudnn_batchnorm.cc
index d38fdaa,b3b6477..7067b16
--- a/test/singa/test_cudnn_batchnorm.cc
+++ b/test/singa/test_cudnn_batchnorm.cc
@@@ -56,34 -53,31 +53,30 @@@ TEST(CudnnBatchNorm, Forward)
0.150676, 0.153442, -0.0929899, -0.148675,
-0.112459, -0.106284, -0.103074, -0.0668811
};
-- singa::CudaGPU cuda(0, 1);
-- singa::Tensor in(singa::Shape{1,2,4,4}, &cuda);
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor in(singa::Shape{1,2,4,4}, cuda);
in.CopyDataFromHostPtr(x, 1*2*4*4);
const float alpha_[] = {1, 1};
-- singa::Tensor alpha(singa::Shape{1,2,1,1}, &cuda);
++ singa::Tensor alpha(singa::Shape{1,2,1,1}, cuda);
alpha.CopyDataFromHostPtr(alpha_, 1*2*1*1);
const float beta_[] = {0, 0};
-- singa::Tensor beta(singa::Shape{1,2,1,1}, &cuda);
++ singa::Tensor beta(singa::Shape{1,2,1,1}, cuda);
beta.CopyDataFromHostPtr(beta_, 1*2*1*1);
singa::LayerConf conf;
singa::BatchNormConf *batchnorm_conf = conf.mutable_batchnorm_conf();
batchnorm_conf->set_factor(0.9);
- batchnorm_conf->set_channels(2);
- batchnorm_conf->set_height(4);
- batchnorm_conf->set_width(4);
- batchnorm.Setup(conf);
+ batchnorm.Setup(Shape{2, 4, 4}, conf);
-- batchnorm.ToDevice(&cuda);
++ batchnorm.ToDevice(cuda);
batchnorm.set_bnScale(alpha);
batchnorm.set_bnBias(beta);
batchnorm.set_runningMean(beta);
batchnorm.set_runningVariance(beta);
singa::Tensor out = batchnorm.Forward(singa::kTrain, in);
-- singa::CppCPU host(0, 1);
out.ToHost();
- const float *outptr = out.data<const float *>();
+ const float *outptr = out.data<float>();
const auto & shape = out.shape();
EXPECT_EQ(4u, shape.size());
EXPECT_EQ(1u, shape[0]);
@@@ -136,8 -130,8 +129,8 @@@ TEST(CudnnBatchNorm, Backward)
0.150676, 0.153442, -0.0929899, -0.148675,
-0.112459, -0.106284, -0.103074, -0.0668811
};
-- singa::CudaGPU cuda(0, 1);
-- singa::Tensor x_tensor(singa::Shape{1,2,4,4}, &cuda);
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor x_tensor(singa::Shape{1,2,4,4}, cuda);
x_tensor.CopyDataFromHostPtr(x, 1*2*4*4);
singa::LayerConf conf;
@@@ -159,35 -150,35 +149,34 @@@
0.00468428, 0.00735506, -0.00682525, 0.00342023
};
-- singa::Tensor dy_tensor(singa::Shape{1,2,4,4}, &cuda);
++ singa::Tensor dy_tensor(singa::Shape{1,2,4,4}, cuda);
dy_tensor.CopyDataFromHostPtr(dy, 1*2*4*4);
const float alpha_[] = {1, 1};
-- singa::Tensor alpha(singa::Shape{1,2,1,1}, &cuda);
++ singa::Tensor alpha(singa::Shape{1,2,1,1}, cuda);
alpha.CopyDataFromHostPtr(alpha_, 1*2*1*1);
const float beta_[] = {0, 0};
-- singa::Tensor beta(singa::Shape{1,2,1,1}, &cuda);
++ singa::Tensor beta(singa::Shape{1,2,1,1}, cuda);
beta.CopyDataFromHostPtr(beta_, 1*2*1*1);
const float mean_[] = {0.0123405, -0.0622333};
-- singa::Tensor mean(singa::Shape{1,2,1,1}, &cuda);
++ singa::Tensor mean(singa::Shape{1,2,1,1}, cuda);
mean.CopyDataFromHostPtr(mean_, 1*2*1*1);
const float var_[] = {15.9948, 8.68198};
-- singa::Tensor var(singa::Shape{1,2,1,1}, &cuda);
++ singa::Tensor var(singa::Shape{1,2,1,1}, cuda);
var.CopyDataFromHostPtr(var_, 1*2*1*1);
-- batchnorm.ToDevice(&cuda);
++ batchnorm.ToDevice(cuda);
batchnorm.set_bnScale(alpha);
batchnorm.set_bnBias(beta);
batchnorm.set_runningMean(beta);
batchnorm.set_runningVariance(beta);
batchnorm.Forward(singa::kTrain, x_tensor);
const auto ret = batchnorm.Backward(singa::kTrain, dy_tensor);
-- singa::CppCPU host(0, 1);
singa::Tensor dx = ret.first;
-- dx.ToDevice(&host);
- const float *dxptr = dx.data<const float *>();
++ dx.ToHost();
+ const float *dxptr = dx.data<float>();
const auto & shape = dx.shape();
EXPECT_EQ(4u, shape.size());
EXPECT_EQ(1u, shape[0]);
@@@ -228,8 -219,8 +217,8 @@@
EXPECT_NEAR(0.0217477, dxptr[31], 1e-4f);
singa::Tensor dbnScale = ret.second.at(0);
-- dbnScale.ToDevice(&host);
- const float *dbnScaleptr = dbnScale.data<const float *>();
++ dbnScale.ToHost();
+ const float *dbnScaleptr = dbnScale.data<float>();
const auto & dbnScaleShape = dbnScale.shape();
EXPECT_EQ(4u, dbnScaleShape.size());
EXPECT_EQ(1u, dbnScaleShape[0]);
@@@ -241,8 -232,8 +230,8 @@@
EXPECT_NEAR(-0.00219431f, dbnScaleptr[1], 1e-4f);
singa::Tensor dbnBias = ret.second.at(1);
-- dbnBias.ToDevice(&host);
- const float *dbnBiasptr = dbnBias.data<const float *>();
++ dbnBias.ToHost();
+ const float *dbnBiasptr = dbnBias.data<float>();
const auto & dbnBiasShape = dbnBias.shape();
EXPECT_EQ(4u, dbnBiasShape.size());
EXPECT_EQ(1u, dbnBiasShape[0]);
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_cudnn_convolution.cc
----------------------------------------------------------------------
diff --cc test/singa/test_cudnn_convolution.cc
index 2a17da2,44077b7..3b84645
--- a/test/singa/test_cudnn_convolution.cc
+++ b/test/singa/test_cudnn_convolution.cc
@@@ -65,18 -63,18 +63,18 @@@ TEST(CudnnConvolution, Forward)
const size_t batchsize = 1, c = 1, h = 3, w = 3;
const float x[batchsize * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
6.0f, 7.0f, 8.0f, 9.0f};
-- singa::CudaGPU cuda(0, 1);
-- singa::Tensor in(singa::Shape{batchsize, c, h, w}, &cuda);
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor in(singa::Shape{batchsize, c, h, w}, cuda);
in.CopyDataFromHostPtr(x, batchsize * c * h * w);
// Set weight and bias manually
const size_t num_filters = 1;
const float we[num_filters * batchsize * h * w] = {
1.0f, 1.0f, 0.0f, 0.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f};
-- singa::Tensor weight(singa::Shape{num_filters, batchsize * h * w}, &cuda);
++ singa::Tensor weight(singa::Shape{num_filters, batchsize * h * w}, cuda);
weight.CopyDataFromHostPtr(we, batchsize * h * w);
const float b[num_filters] = {1.0f};
-- singa::Tensor bias(singa::Shape{num_filters}, &cuda);
++ singa::Tensor bias(singa::Shape{num_filters}, cuda);
bias.CopyDataFromHostPtr(b, num_filters);
CudnnConvolution conv;
conv.set_weight(weight);
@@@ -102,9 -97,9 +97,8 @@@
// Parameter "flag" does not influence convolution
singa::Tensor out1 = conv.Forward(singa::kTrain, in);
-- singa::CppCPU host(0, 1);
-- out1.ToDevice(&host);
- const float *outptr1 = out1.data<const float *>();
++ out1.ToHost();
+ const float *outptr1 = out1.data<float>();
// Input: 3*3; kernel: 3*3; stride: 2*2; padding: 1*1.
EXPECT_EQ(4u, out1.Size());
@@@ -119,8 -114,8 +113,8 @@@ TEST(CudnnConvolution, Backward)
const size_t batchsize = 1, c = 1, src_h = 3, src_w = 3;
const float x[batchsize * c * src_h * src_w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
6.0f, 7.0f, 8.0f, 9.0f};
-- singa::CudaGPU cuda(0, 1);
-- singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, &cuda);
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, cuda);
in.CopyDataFromHostPtr(x, batchsize * c * src_h * src_w);
// Set weight_ and bias_ manually
@@@ -128,10 -123,10 +122,10 @@@
const float we[num_filters * batchsize * src_h * src_w] = {
1.0f, 1.0f, 0.0f, 0.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f};
singa::Tensor weight(singa::Shape{num_filters, batchsize * src_h * src_w},
-- &cuda);
++ cuda);
weight.CopyDataFromHostPtr(we, batchsize * src_h * src_w);
const float b[num_filters] = {1.0f};
-- singa::Tensor bias(singa::Shape{num_filters}, &cuda);
++ singa::Tensor bias(singa::Shape{num_filters}, cuda);
bias.CopyDataFromHostPtr(b, num_filters);
CudnnConvolution conv;
conv.set_weight(weight);
@@@ -162,14 -154,14 +153,13 @@@
const float dy[batchsize * num_filters * grad_h * grad_w] = {0.1f, 0.2f, 0.3f,
0.4f};
singa::Tensor grad(singa::Shape{batchsize, num_filters, grad_h, grad_w},
-- &cuda);
++ cuda);
grad.CopyDataFromHostPtr(dy, batchsize * num_filters * grad_h * grad_w);
const auto ret = conv.Backward(singa::kTrain, grad);
-- singa::CppCPU host(0, 1);
singa::Tensor in_grad = ret.first;
-- in_grad.ToDevice(&host);
- const float *dx = in_grad.data<const float *>();
++ in_grad.ToHost();
+ const float *dx = in_grad.data<float>();
const float *wptr = we;
EXPECT_EQ(9u, in_grad.Size());
EXPECT_EQ(dy[0] * wptr[4], dx[0]);
@@@ -186,12 -178,12 +176,12 @@@
singa::Tensor dw = ret.second[0];
singa::Tensor db = ret.second[1];
-- dw.ToDevice(&host);
-- db.ToDevice(&host);
- const float *dbptr = db.data<const float *>();
++ dw.ToHost();
++ db.ToHost();
+ const float *dbptr = db.data<float>();
EXPECT_EQ(dy[0] + dy[1] + dy[2] + dy[3], dbptr[0]);
- const float *dwptr = dw.data<const float *>();
+ const float *dwptr = dw.data<float>();
EXPECT_EQ(9u, dw.Size());
EXPECT_EQ(dy[3] * x[4], dwptr[0]);
EXPECT_EQ(dy[3] * x[5] + dy[2] * x[3], dwptr[1]);
@@@ -246,18 -235,18 +233,19 @@@ TEST(CudnnConvolution_AT, Forward)
const size_t batchsize = 1, c = 1, h = 3, w = 3;
const float x[batchsize * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
6.0f, 7.0f, 8.0f, 9.0f};
-- singa::CudaGPU cuda(0, 1);
-- singa::Tensor in(singa::Shape{batchsize, c, h, w}, &cuda);
++
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor in(singa::Shape{batchsize, c, h, w}, cuda);
in.CopyDataFromHostPtr(x, batchsize * c * h * w);
// Set weight and bias manually
const size_t num_filters = 1;
const float we[num_filters * batchsize * h * w] = {
1.0f, 1.0f, 0.0f, 0.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f};
-- singa::Tensor weight(singa::Shape{num_filters, batchsize * h * w}, &cuda);
++ singa::Tensor weight(singa::Shape{num_filters, batchsize * h * w}, cuda);
weight.CopyDataFromHostPtr(we, batchsize * h * w);
const float b[num_filters] = {1.0f};
-- singa::Tensor bias(singa::Shape{num_filters}, &cuda);
++ singa::Tensor bias(singa::Shape{num_filters}, cuda);
bias.CopyDataFromHostPtr(b, num_filters);
CudnnConvolution conv;
conv.set_weight(weight);
@@@ -283,9 -269,9 +268,8 @@@
// Parameter "flag" does not influence convolution
singa::Tensor out1 = conv.Forward(singa::kTrain, in);
-- singa::CppCPU host(0, 1);
-- out1.ToDevice(&host);
- const float *outptr1 = out1.data<const float *>();
++ out1.ToHost();
+ const float *outptr1 = out1.data<float>();
// Input: 3*3; kernel: 3*3; stride: 2*2; padding: 1*1.
EXPECT_EQ(4u, out1.Size());
@@@ -300,8 -286,8 +284,9 @@@ TEST(CudnnConvolution_AT, Backward)
const size_t batchsize = 1, c = 1, src_h = 3, src_w = 3;
const float x[batchsize * c * src_h * src_w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
6.0f, 7.0f, 8.0f, 9.0f};
-- singa::CudaGPU cuda(0, 1);
-- singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, &cuda);
++
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, cuda);
in.CopyDataFromHostPtr(x, batchsize * c * src_h * src_w);
// Set weight_ and bias_ manually
@@@ -309,10 -295,10 +294,10 @@@
const float we[num_filters * batchsize * src_h * src_w] = {
1.0f, 1.0f, 0.0f, 0.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f};
singa::Tensor weight(singa::Shape{num_filters, batchsize * src_h * src_w},
-- &cuda);
++ cuda);
weight.CopyDataFromHostPtr(we, batchsize * src_h * src_w);
const float b[num_filters] = {1.0f};
-- singa::Tensor bias(singa::Shape{num_filters}, &cuda);
++ singa::Tensor bias(singa::Shape{num_filters}, cuda);
bias.CopyDataFromHostPtr(b, num_filters);
CudnnConvolution conv;
conv.set_weight(weight);
@@@ -343,14 -326,14 +325,13 @@@
const float dy[batchsize * num_filters * grad_h * grad_w] = {0.1f, 0.2f, 0.3f,
0.4f};
singa::Tensor grad(singa::Shape{batchsize, num_filters, grad_h, grad_w},
-- &cuda);
++ cuda);
grad.CopyDataFromHostPtr(dy, batchsize * num_filters * grad_h * grad_w);
const auto ret = conv.Backward(singa::kTrain, grad);
-- singa::CppCPU host(0, 1);
singa::Tensor in_grad = ret.first;
-- in_grad.ToDevice(&host);
- const float *dx = in_grad.data<const float *>();
++ in_grad.ToHost();
+ const float *dx = in_grad.data<float>();
const float *wptr = we;
EXPECT_EQ(9u, in_grad.Size());
EXPECT_EQ(dy[0] * wptr[4], dx[0]);
@@@ -367,12 -350,12 +348,12 @@@
singa::Tensor dw = ret.second[0];
singa::Tensor db = ret.second[1];
-- dw.ToDevice(&host);
-- db.ToDevice(&host);
- const float *dbptr = db.data<const float *>();
++ dw.ToHost();
++ db.ToHost();
+ const float *dbptr = db.data<float>();
EXPECT_EQ(dy[0] + dy[1] + dy[2] + dy[3], dbptr[0]);
- const float *dwptr = dw.data<const float *>();
+ const float *dwptr = dw.data<float>();
EXPECT_EQ(9u, dw.Size());
EXPECT_EQ(dy[3] * x[4], dwptr[0]);
EXPECT_EQ(dy[3] * x[5] + dy[2] * x[3], dwptr[1]);
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_cudnn_dropout.cc
----------------------------------------------------------------------
diff --cc test/singa/test_cudnn_dropout.cc
index 32572d0,419dd0c..d06a254
--- a/test/singa/test_cudnn_dropout.cc
+++ b/test/singa/test_cudnn_dropout.cc
@@@ -48,8 -49,8 +49,8 @@@ TEST(CudnnDropout, Setup)
TEST(CudnnDropout, Forward) {
const float x[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
size_t n = sizeof(x) / sizeof(float);
-- singa::CudaGPU cuda(0, 1);
-- singa::Tensor in(singa::Shape{n}, &cuda);
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor in(singa::Shape{n}, cuda);
in.CopyDataFromHostPtr(x, n);
float pdrop = 0.5;
@@@ -67,9 -68,9 +68,8 @@@
for (size_t i = 0; i < n; i++)
EXPECT_FLOAT_EQ(0, GetBitValue(mptr, i) * (GetBitValue(mptr, i) - 1));
-- singa::CppCPU host(0, 1);
-- out1.ToDevice(&host);
- const float* outptr1 = out1.data<const float*>();
++ out1.ToHost();
+ const float* outptr1 = out1.data<float>();
EXPECT_EQ(n, out1.Size());
float scale = 1.0f / (1.0f - pdrop);
// the output value should be 0 or the same as the input
@@@ -78,9 -79,9 +78,9 @@@
EXPECT_EQ(0.f, outptr1[7] * (outptr1[7] - scale * x[7]));
singa::Tensor out2 = drop.Forward(singa::kEval, in);
-- out2.ToDevice(&host);
++ out2.ToHost();
EXPECT_EQ(n, out2.Size());
- const float* outptr2 = out2.data<const float*>();
+ const float* outptr2 = out2.data<float>();
// the output value should be the same as the input
EXPECT_EQ(x[0], outptr2[0]);
EXPECT_EQ(x[1], outptr2[1]);
@@@ -90,8 -91,8 +90,8 @@@
TEST(CudnnDropout, Backward) {
const float x[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
size_t n = sizeof(x) / sizeof(float);
-- singa::CudaGPU cuda(0, 1);
-- singa::Tensor in(singa::Shape{n}, &cuda);
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor in(singa::Shape{n}, cuda);
in.CopyDataFromHostPtr(x, n);
float pdrop = 0.5;
@@@ -105,14 -106,14 +105,13 @@@
singa::Tensor out1 = drop.Forward(singa::kTrain, in);
const float dy[] = {4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 1.0f, 2.0f, 3.0f};
-- singa::Tensor grad(singa::Shape{n}, &cuda);
++ singa::Tensor grad(singa::Shape{n}, cuda);
grad.CopyDataFromHostPtr(dy, n);
const auto ret = drop.Backward(singa::kTrain, grad);
-- singa::CppCPU host(0, 1);
singa::Tensor in_grad = ret.first;
-- in_grad.ToDevice(&host);
- const float* dx = in_grad.data<const float*>();
++ in_grad.ToHost();
+ const float* dx = in_grad.data<float>();
singa::Tensor mask(drop.mask().shape(), drop.mask().data_type());
mask.CopyData(drop.mask());
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_cudnn_lrn.cc
----------------------------------------------------------------------
diff --cc test/singa/test_cudnn_lrn.cc
index 390c588,f7ec046..4ee0c54
--- a/test/singa/test_cudnn_lrn.cc
+++ b/test/singa/test_cudnn_lrn.cc
@@@ -58,8 -58,8 +58,8 @@@ TEST(CudnnLRN, Forward)
0.0597329, -0.0530868, 0.0124246, 0.108429,
0.0451175, 0.0247055, 0.0304345, 0.0179575
};
-- singa::CudaGPU cuda(0, 1);
-- singa::Tensor in(singa::Shape{1,2,4,4}, &cuda);
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor in(singa::Shape{1,2,4,4}, cuda);
in.CopyDataFromHostPtr(x, 1*2*4*4);
singa::LayerConf conf;
@@@ -68,12 -68,12 +68,11 @@@
lrn_conf->set_local_size(3);
lrn_conf->set_alpha(0.1);
lrn_conf->set_beta(0.75);
- lrn.Setup(conf);
+ lrn.Setup(Shape{2, 4, 4}, conf);
singa::Tensor out = lrn.Forward(singa::kTrain, in);
-- singa::CppCPU host(0, 1);
-- out.ToDevice(&host);
- const float *outptr = out.data<const float *>();
++ out.ToHost();
+ const float *outptr = out.data<float>();
const auto & shape = out.shape();
EXPECT_EQ(4u, shape.size());
EXPECT_EQ(1u, shape[0]);
@@@ -128,8 -128,8 +127,8 @@@ TEST(CudnnLRN, Backward)
0.0597329, -0.0530868, 0.0124246, 0.108429,
0.0451175, 0.0247055, 0.0304345, 0.0179575
};
-- singa::CudaGPU cuda(0, 1);
-- singa::Tensor x_tensor(singa::Shape{1,2,4,4}, &cuda);
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor x_tensor(singa::Shape{1,2,4,4}, cuda);
x_tensor.CopyDataFromHostPtr(x, 1*2*4*4);
const float dy[] = {
@@@ -143,7 -143,7 +142,7 @@@
0.177807, 0.000892812, -0.00113197, 0.00327798
};
-- singa::Tensor dy_tensor(singa::Shape{1,2,4,4}, &cuda);
++ singa::Tensor dy_tensor(singa::Shape{1,2,4,4}, cuda);
dy_tensor.CopyDataFromHostPtr(dy, 1*2*4*4);
singa::LayerConf conf;
@@@ -156,10 -156,10 +155,9 @@@
lrn.Forward(singa::kTrain, x_tensor);
const auto ret = lrn.Backward(singa::kTrain, dy_tensor);
-- singa::CppCPU host(0, 1);
singa::Tensor dx = ret.first;
-- dx.ToDevice(&host);
- const float *dxptr = dx.data<const float *>();
++ dx.ToHost();
+ const float *dxptr = dx.data<float>();
const auto & shape = dx.shape();
EXPECT_EQ(4u, shape.size());
EXPECT_EQ(1u, shape[0]);
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_cudnn_pooling.cc
----------------------------------------------------------------------
diff --cc test/singa/test_cudnn_pooling.cc
index e66f212,2a98ab4..79051a3
--- a/test/singa/test_cudnn_pooling.cc
+++ b/test/singa/test_cudnn_pooling.cc
@@@ -58,8 -56,8 +56,8 @@@ TEST(CudnnPooling, Forward)
const size_t batchsize = 1, c = 1, h = 3, w = 3;
const float x[batchsize * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
6.0f, 7.0f, 8.0f, 9.0f};
-- singa::CudaGPU cuda(0, 1);
-- singa::Tensor in(singa::Shape{batchsize, c, h, w}, &cuda);
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor in(singa::Shape{batchsize, c, h, w}, cuda);
in.CopyDataFromHostPtr(x, batchsize * c * h * w);
CudnnPooling pool;
@@@ -79,9 -74,9 +74,8 @@@
// Parameter "flag" does not influence pooling
singa::Tensor out1 = pool.Forward(singa::kTrain, in);
-- singa::CppCPU host(0, 1);
-- out1.ToDevice(&host);
- const float *outptr1 = out1.data<const float *>();
++ out1.ToHost();
+ const float *outptr1 = out1.data<float>();
// Input: 3*3; kernel: 2*2; stride: 1*1; no padding.
EXPECT_EQ(4u, out1.Size());
EXPECT_EQ(5.0f, outptr1[0]);
@@@ -95,8 -90,8 +89,8 @@@ TEST(CudnnPooling, Backward)
const size_t batchsize = 1, c = 1, src_h = 3, src_w = 3;
const float x[batchsize * src_h * src_w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
6.0f, 7.0f, 8.0f, 9.0f};
-- singa::CudaGPU cuda(0, 1);
-- singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, &cuda);
++ auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++ singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, cuda);
in.CopyDataFromHostPtr(x, batchsize * c * src_h * src_w);
CudnnPooling pool;
@@@ -119,14 -111,14 +110,13 @@@
// grad
const size_t grad_h = 2, grad_w = 2;
const float dy[batchsize * c * grad_h * grad_w] = {0.1f, 0.2f, 0.3f, 0.4f};
-- singa::Tensor grad(singa::Shape{batchsize, c, grad_h, grad_w}, &cuda);
++ singa::Tensor grad(singa::Shape{batchsize, c, grad_h, grad_w}, cuda);
grad.CopyDataFromHostPtr(dy, batchsize * c * grad_h * grad_w);
const auto ret = pool.Backward(singa::kTrain, grad);
-- singa::CppCPU host(0, 1);
singa::Tensor in_grad = ret.first;
-- in_grad.ToDevice(&host);
- const float *dx = in_grad.data<const float *>();
++ in_grad.ToHost();
+ const float *dx = in_grad.data<float>();
EXPECT_EQ(9u, in_grad.Size());
EXPECT_EQ(0.0f, dx[0]);
EXPECT_EQ(0.0f, dx[1]);