You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@singa.apache.org by wa...@apache.org on 2016/06/24 06:51:34 UTC

[1/6] incubator-singa git commit: SINGA-175 Add memory management APIs and implement a subclass using CNMeM

Repository: incubator-singa
Updated Branches:
  refs/heads/dev 9abd7910d -> dd08f4130


SINGA-175 Add memory management APIs and implement a subclass using CNMeM

Add CNMem as a submodule in lib/


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/683b3a76
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/683b3a76
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/683b3a76

Branch: refs/heads/dev
Commit: 683b3a76ec1bf3d44f32cefdd86123efa8b9c188
Parents: 01aaf49
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Fri Jun 10 15:02:13 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Fri Jun 10 15:02:13 2016 +0800

----------------------------------------------------------------------
 .gitmodules | 3 +++
 lib/cnmem   | 1 +
 2 files changed, 4 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/683b3a76/.gitmodules
----------------------------------------------------------------------
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..cd0a9d2
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "lib/cnmem"]
+	path = lib/cnmem
+	url = https://github.com/NVIDIA/cnmem.git

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/683b3a76/lib/cnmem
----------------------------------------------------------------------
diff --git a/lib/cnmem b/lib/cnmem
new file mode 160000
index 0000000..28a182d
--- /dev/null
+++ b/lib/cnmem
@@ -0,0 +1 @@
+Subproject commit 28a182d49529da49f4ac4e3941cec3edf16b3540

[3/6] incubator-singa git commit: SINGA-175 Add memory management APIs and implement a subclass using CNMeM

Posted by wa...@apache.org.

SINGA-175 Add memory management APIs and implement a subclass using CNMeM

Add base memory pool class.
Implement two subclasses, CnMemPool and CudaMemPool.
Add test for the memory pools.

TODO replace Device* to std::shared_ptr<Device> to avoid memory error because
the order of destructing device and tensor are dynamic (device may be freed
before tensors)


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/077d13e8
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/077d13e8
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/077d13e8

Branch: refs/heads/dev
Commit: 077d13e8052aa92679909b619966481a383a651f
Parents: ce3e6dc
Author: liyuchenmike@gmail.com <li...@gmail.com>
Authored: Wed Jun 22 20:26:41 2016 +0800
Committer: liyuchenmike@gmail.com <li...@gmail.com>
Committed: Wed Jun 22 20:26:41 2016 +0800

----------------------------------------------------------------------
 CMakeLists.txt                 |   6 +-
 include/singa/core/device.h    |   5 ++
 include/singa/core/memory.h    |  46 +++++++++++++++
 include/singa/model/loss.h     |   2 +-
 src/core/device/cuda_gpu.cc    |  59 +++++++++++++++++--
 src/core/memory/memory.cc      |  69 ++++++++++++++++++++++
 src/proto/core.proto           |  13 +++++
 test/singa/test_memory.cc      | 111 ++++++++++++++++++++++++++++++++++++
 test/singa/test_mse.cc         |  13 ++++-
 test/singa/test_tensor_math.cc |   4 ++
 10 files changed, 319 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/077d13e8/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f6240d2..c34b6ce 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,14 +13,15 @@ ENDIF()
 
 #message(STATUS "${CMAKE_CXX_FLAGS}")
 SET(SINGA_INCLUDE_DIR
-    "${CMAKE_SOURCE_DIR}/include;${CMAKE_SOURCE_DIR}/lib/cnmem/lib;${PROJECT_BINARY_DIR}")
+    #"${CMAKE_SOURCE_DIR}/include;${CMAKE_SOURCE_DIR}/lib/cnmem/lib;${CMAKE_SOURCE_DIR}/lib/cnmen/include;${PROJECT_BINARY_DIR}")
+    "${CMAKE_SOURCE_DIR}/include;${CMAKE_SOURCE_DIR}/lib/cnmem/include;${PROJECT_BINARY_DIR}")
 #message(STATUS "include path: ${SINGA_INCLUDE_DIR}")
 INCLUDE_DIRECTORIES(${SINGA_INCLUDE_DIR})
 
 #OPTION(CPU_ONLY "use GPU libs" OFF)
 OPTION(USE_CBLAS "Use CBlas libs" ON)
 OPTION(USE_CUDA "Use Cuda libs" ON)
-OPTION(USE_CUDNN "Use Cudnn libs" ON)
+OPTION(USE_CUDNN "Use Cudnn libs" OFF)
 OPTION(USE_OPENCV "Use opencv" OFF)
 OPTION(USE_LMDB "Use LMDB libs" OFF)
 
@@ -38,5 +39,6 @@ SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
 SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
 
 ADD_SUBDIRECTORY(lib/cnmem)
+LIST(APPEND SINGA_LINKER_LIBS cnmem)
 ADD_SUBDIRECTORY(src)
 ADD_SUBDIRECTORY(test)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/077d13e8/include/singa/core/device.h
----------------------------------------------------------------------
diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index 8c95dc7..fc98a23 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -147,6 +147,8 @@ class CudaGPU : public Device {
   ~CudaGPU();
   CudaGPU(int id = 0, int num_executors = 1, string scheduler = "sync",
          string vm = "gc-only");
+	CudaGPU(const MemPoolConf& mem_conf, 
+					int id = 0, int num_executors = 1, string scheduler = "sync");
 
   void SetRandSeed(unsigned seed) override;
   static void DeviceQuery();
@@ -180,6 +182,9 @@ class CudaGPU : public Device {
 
   /// Free cpu memory.
   void Free(void* ptr) override;
+	
+	private:
+	DeviceMemPool* pool;
 };
 
 /// CudaCPU which uses cudaMallocHost to allocate pinned memory for host.

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/077d13e8/include/singa/core/memory.h
----------------------------------------------------------------------
diff --git a/include/singa/core/memory.h b/include/singa/core/memory.h
index db09043..e4e1e63 100644
--- a/include/singa/core/memory.h
+++ b/include/singa/core/memory.h
@@ -19,10 +19,56 @@
 #ifndef SINGA_CORE_MEMORY_H_
 #define SINGA_CORE_MEMORY_H_
 
+#include "cnmem.h"
+#include <mutex>
+
 namespace singa {
 
 /// Manage device memory pool including garbage collection, memory opt.
 class VirtualMemory {};
 
+class DeviceMemPool {
+	public:
+	virtual void InitPool() = 0;
+	virtual void Malloc(void** ptr, const size_t size) = 0;
+	virtual void Free(void* ptr) = 0;
+	virtual ~DeviceMemPool(){};
+};
+
+class CnMemPool : public DeviceMemPool {
+	public:
+	int status = 1;
+
+	void InitPool();
+
+	/// numDevices: total number of available GPU cards.
+	/// initSize: all devices will be allocated with this size 
+	/// manager_flags: pool manager flag (one for all devices)
+	/// flag = 0; default flag
+	/// flag = 1: Prevent the manager from growing its memory consumption
+	/// flag = 2; Prevent the manager from stealing memory.
+	void InitPool(int numDevices, size_t initSize, unsigned flag);
+
+	void Malloc(void** ptr, const size_t size);
+	void Free(void* ptr);
+
+	// release all memory and set cnmem manager to unintialized 
+	~CnMemPool();
+
+	private:
+	// whether the (global) memory pool has been initialized
+	static bool initialized;
+	// lock on the initialized variable
+	static std::mutex mtx;
+};
+
+class CudaMemPool : public DeviceMemPool {
+	public:
+	void InitPool(){};
+	void Malloc(void** ptr, const size_t size);
+	void Free(void* ptr);
+	~CudaMemPool(){};
+};
+
 }  // namespace singa
 #endif  // SINGA_CORE_MEMORY_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/077d13e8/include/singa/model/loss.h
----------------------------------------------------------------------
diff --git a/include/singa/model/loss.h b/include/singa/model/loss.h
index 6a23067..dcf0da4 100644
--- a/include/singa/model/loss.h
+++ b/include/singa/model/loss.h
@@ -35,7 +35,7 @@ class Loss {
     loss.ParseFromString(conf);
     Setup(loss);
   }
-
+	virtual ~Loss(){};
   /// Set meta fields from user configurations.
   virtual void Setup(const LossConf& conf) {}
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/077d13e8/src/core/device/cuda_gpu.cc
----------------------------------------------------------------------
diff --git a/src/core/device/cuda_gpu.cc b/src/core/device/cuda_gpu.cc
index a47f6fe..d9a0985 100644
--- a/src/core/device/cuda_gpu.cc
+++ b/src/core/device/cuda_gpu.cc
@@ -22,7 +22,7 @@
 #include <cuda_runtime.h>
 #include <curand.h>
 #include <chrono>
-
+#include <iostream>
 #include "singa/core/device.h"
 #include "singa/utils/cuda_utils.h"
 namespace singa {
@@ -42,6 +42,8 @@ CudaGPU::~CudaGPU() {
     CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
   }
 #endif
+	delete pool;
+	LOG(INFO) << "device has been deleted";
 }
 
 CudaGPU::CudaGPU(int id, int num_executors,
@@ -67,6 +69,48 @@ CudaGPU::CudaGPU(int id, int num_executors,
   auto status = cudnnCreate(&ctx_.cudnn_handle);
   CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
 #endif  // USE_CUDNN
+	
+	// initialize cnmem memory management as default
+	pool = new CnMemPool();
+	((CnMemPool*)pool)->InitPool();
+}
+
+CudaGPU::CudaGPU(const MemPoolConf& mem_conf,int id, int num_executors,
+                       string scheduler)
+    : Device(id, num_executors, scheduler, "gc-only") {
+  if (id == -1)
+    id = FindDevice(0);
+  lang_ = kCuda;
+  ctx_.stream = NULL;  // use the default sync stream
+  // TODO(wangwei) create one handle for each steam?
+  CUDA_CHECK(cudaSetDevice(FindDevice(0)));
+  // use curandCreateGeneratorHost for CudaHost device
+  CURAND_CHECK(
+      curandCreateGenerator(&ctx_.curand_generator, CURAND_RNG_PSEUDO_DEFAULT));
+  auto seed = std::chrono::system_clock::now().time_since_epoch().count();
+  SetRandSeed(seed);
+  // TODO(wangwei) if one generator per stream, then need diff offset per gen?
+  CURAND_CHECK(curandSetGeneratorOffset(ctx_.curand_generator, 0));
+  CUBLAS_CHECK(cublasCreate(&(ctx_.cublas_handle)));
+
+#ifdef USE_CUDNN
+  // TODO(wangwei) create one handle for each stream?
+  auto status = cudnnCreate(&ctx_.cudnn_handle);
+  CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
+#endif  // USE_CUDNN
+
+	// initialize memory management for cuda devices
+	string memoryPoolType = mem_conf.type();
+	if(memoryPoolType.compare("cnmem") == 0) {
+		pool = new CnMemPool();
+		int num_devices = mem_conf.num_devices();
+		size_t alloc_size = mem_conf.alloc_size();
+		unsigned flag = mem_conf.cnmemflag();
+		((CnMemPool*)pool)->InitPool(num_devices, alloc_size, flag);
+	}
+	else {
+		pool = new CudaMemPool();
+	}
 }
 
 void CudaGPU::SetRandSeed(unsigned seed) {
@@ -90,7 +134,8 @@ void CudaGPU::CopyToFrom(void* dst, const void* src, size_t nBytes,
 void* CudaGPU::Malloc(int size) {
   void* ptr = nullptr;
   if (size > 0) {
-    CUDA_CHECK(cudaMalloc(&ptr, size));
+		//CUDA_CHECK(cudaMalloc((void**)&ptr,size));
+		pool->Malloc((void**)&ptr,size);
     CUDA_CHECK(cudaMemset(ptr, 0, size));
   }
   return ptr;
@@ -98,8 +143,14 @@ void* CudaGPU::Malloc(int size) {
 
   /// Free cpu memory.
 void CudaGPU::Free(void* ptr) {
-  if (ptr != nullptr)
-    CUDA_CHECK(cudaFree(ptr));
+	LOG(INFO) << "Cuda free is called";
+	LOG(INFO) << "pool pointer" << pool << "\n";
+	LOG(INFO) << "pool status:" << ((CnMemPool*)pool)->status;
+  if (ptr != nullptr) {
+		//CUDA_CHECK(cudaFree(ptr));
+		pool->Free(ptr);
+	}
+	LOG(INFO) << "free memory is successed";
 }
 
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/077d13e8/src/core/memory/memory.cc
----------------------------------------------------------------------
diff --git a/src/core/memory/memory.cc b/src/core/memory/memory.cc
index a1cf5db..c5878a6 100644
--- a/src/core/memory/memory.cc
+++ b/src/core/memory/memory.cc
@@ -18,3 +18,72 @@
 
 
 #include "singa/core/memory.h"
+#include "singa/utils/logging.h"
+#include <iostream>
+
+namespace singa {
+
+bool singa::CnMemPool::initialized = false;
+std::mutex singa::CnMemPool::mtx;
+
+void CnMemPool::InitPool(int numDevices, size_t initSize, unsigned flag) {
+	mtx.lock();
+	if(!initialized) {
+		CHECK_GE(numDevices, 1);
+		cnmemDevice_t* settingPtr = new cnmemDevice_t[numDevices];
+		for(int i = 0; i < numDevices; i++) {
+			settingPtr[i].device = i;
+			settingPtr[i].size = initSize;
+			settingPtr[i].numStreams = 0;
+			settingPtr[i].streams = NULL;
+			settingPtr[i].streamSizes = 0;
+		}
+		cnmemStatus_t status = cnmemInit(numDevices, settingPtr, flag);
+		CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS) << " " << cnmemGetErrorString(status);
+		delete[] settingPtr;
+		initialized = true;
+	}
+	mtx.unlock();
+}
+
+void CnMemPool::InitPool() {
+		int defaultNumDevices = 1;
+		size_t defaultSize = 1000000U;
+		InitPool(defaultNumDevices,defaultSize,cnmemManagerFlags_t::CNMEM_FLAGS_DEFAULT);
+}
+
+CnMemPool::~CnMemPool() {
+	mtx.lock();
+	if(initialized) {
+		cnmemStatus_t status = cnmemFinalize();
+		CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS) << " " << cnmemGetErrorString(status);
+		initialized = false;
+	}
+	mtx.unlock();
+	LOG(INFO) << "cnmem has been freed";
+}
+
+
+void CnMemPool::Malloc(void** ptr, const size_t size) {
+	cnmemStatus_t status = cnmemMalloc(ptr,size,NULL);
+	CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS) << " " << cnmemGetErrorString(status);
+}
+
+void CnMemPool::Free(void* ptr) {
+	LOG(INFO) << "cnmem free is called !!!!!!!!!!!";
+	cnmemStatus_t status = cnmemFree(ptr,NULL);
+	CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS) << " " << cnmemGetErrorString(status);
+	LOG(INFO) << "cnmem free is terminated";
+}
+
+void CudaMemPool::Malloc(void** ptr, const size_t size) {
+	cudaError_t status = cudaMalloc(ptr,size);
+	CHECK_EQ(status, cudaError_t::cudaSuccess);
+}
+
+void CudaMemPool::Free(void* ptr) {
+	cudaError_t status = cudaFree(ptr);
+	CHECK_EQ(status, cudaError_t::cudaSuccess);
+}
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/077d13e8/src/proto/core.proto
----------------------------------------------------------------------
diff --git a/src/proto/core.proto b/src/proto/core.proto
index 88d7f12..cf6e193 100644
--- a/src/proto/core.proto
+++ b/src/proto/core.proto
@@ -44,3 +44,16 @@ enum CopyDirection {
   kDeviceToDevice = 3;
   kNumDirection = 4;
 }
+
+// configuration for device memory pool
+message MemPoolConf {
+	optional string type = 1 [default = "cnmem"];
+	optional uint32 num_devices = 2 [default = 1];
+	// allocation size for each device
+	optional uint32 alloc_size = 3 [default = 10000000];
+	// memory manager flag for cnmem
+	// cnmemflag = 0: default flag
+	// cnmemflag = 1: prevent the manager from growing its memory consumption
+	// cnmemflag = 2: prevent the manager from stealing memory
+	optional uint32 cnmemflag = 4 [default = 0];
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/077d13e8/test/singa/test_memory.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_memory.cc b/test/singa/test_memory.cc
new file mode 100644
index 0000000..f5e464d
--- /dev/null
+++ b/test/singa/test_memory.cc
@@ -0,0 +1,111 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+* 
+*   http://www.apache.org/licenses/LICENSE-2.0
+* 
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include "singa/utils/logging.h"
+#include "singa/core/memory.h"
+#include "singa/singa_config.h"
+#include <sys/time.h>
+
+#ifdef USE_CUDA
+TEST(CnmemPool, PoolInit) {
+	singa::CnMemPool pool;
+	pool.InitPool();
+}
+
+TEST(CnmemPool, PoolInitAll) {
+	singa::CnMemPool pool;
+	int nDevices;
+	cudaGetDeviceCount(&nDevices);
+	CHECK_GE(nDevices,1);
+	pool.InitPool(nDevices,1000000U,0);
+}
+
+TEST(CnmemPool, UsePool) {
+	singa::CnMemPool pool;
+	pool.InitPool();
+	int numOfTests = 10;
+	int numOfWriteVsRead = 3;
+	int allocSize = 1000000U;
+	for(int i = 0; i < numOfTests; i++) {
+		int** memPtrs = new int*[numOfWriteVsRead];
+		for(int j = 0; j < numOfWriteVsRead; j++) {
+			pool.Malloc((void**)(&memPtrs[j]), allocSize); 
+		}
+		pool.Free(memPtrs[0]);
+		delete[] memPtrs;
+	}
+}
+
+TEST(CudaMemPool, UsePool) {
+	singa::CudaMemPool pool;
+	int numOfTests = 10;
+	int numOfWriteVsRead = 3;
+	int allocSize = 1000000U;
+	for(int i = 0; i < numOfTests; i++) {
+		int** memPtrs = new int*[numOfWriteVsRead];
+		for(int j = 0; j < numOfWriteVsRead; j++) {
+			pool.Malloc((void**)(&memPtrs[j]), allocSize); 
+		}
+		pool.Free(memPtrs[0]);
+		delete[] memPtrs;
+	}
+}
+
+TEST(MemPool, CompareCudaCnmem) {
+	singa::CudaMemPool cudaPool;
+	singa::CnMemPool cnPool;
+	cnPool.InitPool();
+
+	int numOfTests = 10000;
+	int allocSize = 1000000U;
+	struct timeval start,end;
+	double t1,t2;
+
+	singa::DeviceMemPool* pool = NULL;
+	pool = &cnPool;
+	
+	gettimeofday(&start,NULL);
+	for(int i = 0; i < numOfTests; i++) {
+		int* memPtrs = NULL;
+		pool->Malloc((void**)&memPtrs, allocSize); 	
+		pool->Free(memPtrs);
+	}
+	gettimeofday(&end,NULL);
+	
+	t1 = start.tv_sec * 1000 + start.tv_usec/1000;
+	t2 = end.tv_sec * 1000 + end.tv_usec/1000;
+	LOG(INFO) << "cnmem time: " << t2-t1 << " ms" << std::endl;
+
+	pool = &cudaPool;
+	gettimeofday(&start,NULL);
+	for(int i = 0; i < numOfTests; i++) {
+		int* memPtrs = NULL;
+		pool->Malloc((void**)&memPtrs, allocSize); 
+		pool->Free(memPtrs);
+	}
+	gettimeofday(&end,NULL);
+	
+	t1 = start.tv_sec * 1000 + start.tv_usec/1000;
+	t2 = end.tv_sec * 1000 + end.tv_usec/1000;
+	LOG(INFO) << "cuda time: " << t2-t1 << " ms" << std::endl;
+}
+#endif // USE_CUDA

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/077d13e8/test/singa/test_mse.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_mse.cc b/test/singa/test_mse.cc
index 2c02273..7c6066e 100644
--- a/test/singa/test_mse.cc
+++ b/test/singa/test_mse.cc
@@ -68,11 +68,11 @@ TEST_F(TestMSE, CppBackward) {
 #endif
 #ifdef USE_CUDA
 TEST_F(TestMSE, CudaForward) {
-  singa::MSE mse;
+  singa::MSE* mse = new singa::MSE();
   singa::CudaGPU dev;
   p.ToDevice(&dev);
   t.ToDevice(&dev);
-  Tensor loss = mse.Forward(p, t);
+  Tensor loss = mse->Forward(p, t);
 
   loss.ToHost();
   auto ldat = loss.data<const float*>();
@@ -85,6 +85,12 @@ TEST_F(TestMSE, CudaForward) {
     }
     EXPECT_FLOAT_EQ(ldat[i], 0.5 * l);
   }
+	LOG(INFO) << "Before delete pxxxxxxxxxxxxxxxxxxxxxxxx";
+	p.ToHost();
+	LOG(INFO) << "Before delete tyyyyyyyyyyyyyyyyyyyyyyy";
+	t.ToHost();
+	LOG(INFO) << "terminate-xxxxxxxxxxxxxxxxxx-";
+	delete mse;
 }
 TEST_F(TestMSE, CudaBackward) {
   singa::MSE mse;
@@ -98,5 +104,8 @@ TEST_F(TestMSE, CudaBackward) {
 
   for (size_t i = 0; i < grad.Size(); i++)
     EXPECT_FLOAT_EQ(gdat[i], (1.0f / p.shape().at(0)) * (pdat[i] - tdat[i]));
+	p.ToHost();
+	t.ToHost();
+
 }
 #endif

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/077d13e8/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor_math.cc b/test/singa/test_tensor_math.cc
index 170b96c..b18e465 100644
--- a/test/singa/test_tensor_math.cc
+++ b/test/singa/test_tensor_math.cc
@@ -302,6 +302,8 @@ TEST_F(TestTensorMath, MultCuda) {
       EXPECT_FLOAT_EQ(oPtr[i * 4 + j], x[i]);
     }
   }
+	d.ToHost();
+	p.ToHost();
 }
 
 TEST_F(TestTensorMath, AddColumnCuda) {
@@ -479,6 +481,7 @@ TEST_F(TestTensorMath, SumRowsCuda) {
     }
     EXPECT_FLOAT_EQ(tptr[i], tmp);
   }
+	d.ToHost();
 }
 TEST_F(TestTensorMath, SumColumnCuda) {
   singa::CudaGPU dev;
@@ -495,5 +498,6 @@ TEST_F(TestTensorMath, SumColumnCuda) {
     }
     EXPECT_FLOAT_EQ(tptr[i], tmp);
   }
+	d.ToHost();
 }
 #endif

[4/6] incubator-singa git commit: changed all device pointer to shared pointer

Posted by wa...@apache.org.

changed all device pointer to shared pointer


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/5651383f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/5651383f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/5651383f

Branch: refs/heads/dev
Commit: 5651383f5dbe0ab17eeda70f491d837a24bcb4ab
Parents: 077d13e
Author: liyuchenmike@gmail.com <li...@gmail.com>
Authored: Wed Jun 22 21:06:38 2016 +0800
Committer: liyuchenmike@gmail.com <li...@gmail.com>
Committed: Wed Jun 22 21:06:38 2016 +0800

----------------------------------------------------------------------
 include/singa/core/device.h    |  7 ++--
 include/singa/core/tensor.h    | 10 ++---
 include/singa/model/layer.h    |  2 +-
 src/core/device/cpp_cpu.cc     |  2 +-
 src/core/device/cuda_gpu.cc    |  5 ---
 src/core/device/device.cc      |  2 +-
 src/core/memory/memory.cc      |  3 --
 src/core/tensor/tensor.cc      | 19 +++++-----
 src/model/layer/batchnorm.cc   |  2 +-
 src/model/layer/batchnorm.h    |  2 +-
 src/model/layer/dense.cc       |  2 +-
 src/model/layer/dense.h        |  2 +-
 src/model/layer/dropout.cc     |  2 +-
 src/model/layer/dropout.h      |  2 +-
 test/singa/test_dense.cc       | 33 +++++++----------
 test/singa/test_memory.cc      |  6 +--
 test/singa/test_mse.cc         | 17 ++++-----
 test/singa/test_sgd.cc         |  8 ++--
 test/singa/test_tensor.cc      |  6 +--
 test/singa/test_tensor_math.cc | 74 ++++++++++++++++++-------------------
 20 files changed, 94 insertions(+), 112 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/include/singa/core/device.h
----------------------------------------------------------------------
diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index fc98a23..d2b5b12 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -23,6 +23,7 @@
 #include <vector>
 #include <string>
 #include <functional>
+#include <memory>
 #include "singa/singa_config.h"
 #include "singa/core/common.h"
 #include "singa/core/memory.h"
@@ -75,7 +76,7 @@ class Device {
     return lang_;
   }
 
-  Device* host() const { return host_;}
+  std::shared_ptr<Device> host() const { return host_;}
 
   Context* context(int k) {
     return &ctx_;
@@ -107,7 +108,7 @@ class Device {
   // SafeQueue<Operation> op_queue_;
   // SafeQueue<Operation> op_log_;
   /// The host device
-  Device* host_;
+  std::shared_ptr<Device> host_;
   // TODO(wangwei) define multiple contexts, one per executor
   Context ctx_;
 };
@@ -134,7 +135,7 @@ class CppCPU : public Device {
 };
 
 /// a singleton CppDevice as the host for all devices.
-extern CppCPU defaultDevice;
+extern std::shared_ptr<Device> defaultDevice;
 
 
 // Implement Device using OpenCL libs.

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index bb8d7f8..8f73047 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -67,8 +67,8 @@ class Tensor {
   Tensor();
   explicit Tensor(Shape &&shape, DataType dtype = kFloat32);
   explicit Tensor(const Shape &shape, DataType dtype = kFloat32);
-  Tensor(Shape &&shape, Device *dev, DataType dtype = kFloat32);
-  Tensor(const Shape &shape, Device *dev, DataType dtype = kFloat32);
+  Tensor(Shape &&shape, std::shared_ptr<Device> dev, DataType dtype = kFloat32);
+  Tensor(const Shape &shape, std::shared_ptr<Device> dev, DataType dtype = kFloat32);
 
   /// Copy Tensor to share the internal data.  No deep copy.
   Tensor(const Tensor &from);
@@ -80,7 +80,7 @@ class Tensor {
   /// blob_ is allocated in constructors.
   Blob *blob() const { return blob_; }
 
-  Device *device() const { return device_; }
+  std::shared_ptr<Device> device() const { return device_; }
 
   /// Return immutable Tensor values with given type.
   template <typename DType>
@@ -125,7 +125,7 @@ class Tensor {
 
   /// Reset the device.
   /// If the target device is a diff device, then do deep data copy.
-  void ToDevice(Device *dev);
+  void ToDevice(std::shared_ptr<Device> dev);
 
   /// Equivalent to ToDevice(host_dev).
   void ToHost();
@@ -192,7 +192,7 @@ class Tensor {
  protected:
   bool transpose_ = false;
   DataType data_type_ = kFloat32;
-  Device *device_ = nullptr;
+  std::shared_ptr<Device> device_ = nullptr;
   /// Note: blob_ is allocated in lazy manner to avoid frequent malloc/free.
   /// If you want to get an allocated Blob, use blob() instead of blob_.
   Blob *blob_ = nullptr;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/include/singa/model/layer.h
----------------------------------------------------------------------
diff --git a/include/singa/model/layer.h b/include/singa/model/layer.h
index 82c8edc..ee2b42b 100644
--- a/include/singa/model/layer.h
+++ b/include/singa/model/layer.h
@@ -125,7 +125,7 @@ class Layer {
 
   /// Move the layer (including its parameters and other internal Tensor) onto
   /// the given device
-  virtual void ToDevice(Device* device) {
+  virtual void ToDevice(std::shared_ptr<Device> device) {
     //for (auto p : param_values_) p->ToDevice(device);
   }
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/core/device/cpp_cpu.cc
----------------------------------------------------------------------
diff --git a/src/core/device/cpp_cpu.cc b/src/core/device/cpp_cpu.cc
index 44f614a..6884e35 100644
--- a/src/core/device/cpp_cpu.cc
+++ b/src/core/device/cpp_cpu.cc
@@ -17,7 +17,7 @@
  */
 #include "singa/core/device.h"
 namespace singa {
-CppCPU defaultDevice(-1, 1);
+std::shared_ptr<Device> defaultDevice=std::make_shared<CppCPU>(-1, 1);
 CppCPU::CppCPU(int id, int num_executors, string scheduler,
          string vm) : Device(id, num_executors, scheduler, vm) {
   lang_ = kCpp;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/core/device/cuda_gpu.cc
----------------------------------------------------------------------
diff --git a/src/core/device/cuda_gpu.cc b/src/core/device/cuda_gpu.cc
index d9a0985..4da292f 100644
--- a/src/core/device/cuda_gpu.cc
+++ b/src/core/device/cuda_gpu.cc
@@ -43,7 +43,6 @@ CudaGPU::~CudaGPU() {
   }
 #endif
 	delete pool;
-	LOG(INFO) << "device has been deleted";
 }
 
 CudaGPU::CudaGPU(int id, int num_executors,
@@ -143,14 +142,10 @@ void* CudaGPU::Malloc(int size) {
 
   /// Free cpu memory.
 void CudaGPU::Free(void* ptr) {
-	LOG(INFO) << "Cuda free is called";
-	LOG(INFO) << "pool pointer" << pool << "\n";
-	LOG(INFO) << "pool status:" << ((CnMemPool*)pool)->status;
   if (ptr != nullptr) {
 		//CUDA_CHECK(cudaFree(ptr));
 		pool->Free(ptr);
 	}
-	LOG(INFO) << "free memory is successed";
 }
 
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/core/device/device.cc
----------------------------------------------------------------------
diff --git a/src/core/device/device.cc b/src/core/device/device.cc
index 1d3c446..1889339 100644
--- a/src/core/device/device.cc
+++ b/src/core/device/device.cc
@@ -22,7 +22,7 @@ namespace singa {
 Device::Device(int id, int num_executors, string scheduler, string vm)
     : id_(id), num_executors_(num_executors) {
       // TODO(wangwei) create scheduler and vm.
-  host_ = &defaultDevice;
+  host_ = defaultDevice;
 }
 
 void Device::Exec(function<void(Context*)>&& fn, const vector<Blob*> read_blobs,

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/core/memory/memory.cc
----------------------------------------------------------------------
diff --git a/src/core/memory/memory.cc b/src/core/memory/memory.cc
index c5878a6..304c101 100644
--- a/src/core/memory/memory.cc
+++ b/src/core/memory/memory.cc
@@ -60,7 +60,6 @@ CnMemPool::~CnMemPool() {
 		initialized = false;
 	}
 	mtx.unlock();
-	LOG(INFO) << "cnmem has been freed";
 }
 
 
@@ -70,10 +69,8 @@ void CnMemPool::Malloc(void** ptr, const size_t size) {
 }
 
 void CnMemPool::Free(void* ptr) {
-	LOG(INFO) << "cnmem free is called !!!!!!!!!!!";
 	cnmemStatus_t status = cnmemFree(ptr,NULL);
 	CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS) << " " << cnmemGetErrorString(status);
-	LOG(INFO) << "cnmem free is terminated";
 }
 
 void CudaMemPool::Malloc(void** ptr, const size_t size) {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index 5ae375c..a5b43d8 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -25,29 +25,28 @@
 namespace singa {
 
 Tensor::~Tensor() {
-  // LOG(ERROR) << "~";
   if (blob_ != nullptr && blob_->DecRefCount() == 0)
     device_->FreeBlob(blob_);
   blob_ = nullptr;
 }
 
-Tensor::Tensor() { device_ = &defaultDevice; }
+Tensor::Tensor() { device_ = defaultDevice; }
 
 Tensor::Tensor(const Shape &shape, DataType dtype)
-    : data_type_(dtype), device_(&defaultDevice), shape_(shape) {
-  device_ = &defaultDevice;
+    : data_type_(dtype), device_(defaultDevice), shape_(shape) {
+  device_ = defaultDevice;
   blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
 }
 Tensor::Tensor(Shape &&shape, DataType dtype)
-    : data_type_(dtype), device_(&defaultDevice), shape_(shape) {
-  device_ = &defaultDevice;
+    : data_type_(dtype), device_(defaultDevice), shape_(shape) {
+  device_ = defaultDevice;
   blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
 }
-Tensor::Tensor(const Shape &shape, Device *device, DataType dtype)
+Tensor::Tensor(const Shape &shape, std::shared_ptr<Device> device, DataType dtype)
     : data_type_(dtype), device_(device), shape_(shape) {
   blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
 }
-Tensor::Tensor(Shape &&shape, Device *device, DataType dtype)
+Tensor::Tensor(Shape &&shape, std::shared_ptr<Device> device, DataType dtype)
     : data_type_(dtype), device_(device), shape_(shape) {
   blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
 }
@@ -104,7 +103,7 @@ void Tensor::AsType(DataType type) {
   }
 }
 
-void Tensor::ToDevice(Device *dst) {
+void Tensor::ToDevice(std::shared_ptr<Device> dst) {
   // TODO(wangwei) the comparison is very strict. May compare against device ID?
   if (device_ != dst) {
     Tensor tmp(shape_, dst, data_type_);
@@ -234,7 +233,7 @@ void CopyDataToFrom(Tensor *dst, const Tensor &src, size_t num,
   CHECK_GE(src.MemSize(), src_offset + nBytes);
   CHECK_GE(dst->MemSize(), dst_offset + nBytes);
 
-  Device *src_dev = src.device(), *dst_dev = dst->device();
+  std::shared_ptr<Device> src_dev = src.device(), dst_dev = dst->device();
   Blob *from = src.blob(), *to = dst->blob();
   if (dst_dev->lang() != src_dev->lang()) {
     // let the none cpp device conduct copy op

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/model/layer/batchnorm.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/batchnorm.cc b/src/model/layer/batchnorm.cc
index bcd0870..1e6c39b 100644
--- a/src/model/layer/batchnorm.cc
+++ b/src/model/layer/batchnorm.cc
@@ -44,7 +44,7 @@ void BatchNorm::Setup(const LayerConf& conf) {
   param_values_.push_back(&runningVariance_);
 }
 
-void BatchNorm::ToDevice(Device* device) {
+void BatchNorm::ToDevice(std::shared_ptr<Device> device) {
   bnScale_.ToDevice(device);
   bnBias_.ToDevice(device);
   dbnScale_.ToDevice(device);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/model/layer/batchnorm.h
----------------------------------------------------------------------
diff --git a/src/model/layer/batchnorm.h b/src/model/layer/batchnorm.h
index 0255179..83f143d 100644
--- a/src/model/layer/batchnorm.h
+++ b/src/model/layer/batchnorm.h
@@ -67,7 +67,7 @@ class BatchNorm : public Layer {
     runningVariance_.ResetLike(x);
     runningVariance_.CopyData(x);
   }
-  virtual void ToDevice(Device* device) override;
+  virtual void ToDevice(std::shared_ptr<Device> device) override;
 
  protected:
   float factor_;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/model/layer/dense.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/dense.cc b/src/model/layer/dense.cc
index b349787..d47c1db 100644
--- a/src/model/layer/dense.cc
+++ b/src/model/layer/dense.cc
@@ -79,7 +79,7 @@ const std::pair<Tensor, vector<Tensor>> Dense::Backward(int flag,
   return std::make_pair(dx, param_grad);
 }
 
-void Dense::ToDevice(Device *device) {
+void Dense::ToDevice(std::shared_ptr<Device> device) {
   weight_.ToDevice(device);
   bias_.ToDevice(device);
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/model/layer/dense.h
----------------------------------------------------------------------
diff --git a/src/model/layer/dense.h b/src/model/layer/dense.h
index a5a6f66..49cb986 100644
--- a/src/model/layer/dense.h
+++ b/src/model/layer/dense.h
@@ -40,7 +40,7 @@ class Dense : public Layer {
   const std::pair<Tensor, vector<Tensor>> Backward(int flag,
                                                    const Tensor& grad) override;
 
-  void ToDevice(Device* device) override;
+  void ToDevice(std::shared_ptr<Device> device) override;
 
   size_t num_output() const { return hdim_; }
   size_t num_input() const { return vdim_; }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/model/layer/dropout.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/dropout.cc b/src/model/layer/dropout.cc
index c2c97be..695008e 100644
--- a/src/model/layer/dropout.cc
+++ b/src/model/layer/dropout.cc
@@ -52,7 +52,7 @@ const std::pair<Tensor, vector<Tensor>> Dropout::Backward(int flag,
   return std::make_pair(input_grad, param_grad);
 }
 
-void Dropout::ToDevice(Device* device) {
+void Dropout::ToDevice(std::shared_ptr<Device> device) {
   Layer::ToDevice(device);
   mask_.ToDevice(device);
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/src/model/layer/dropout.h
----------------------------------------------------------------------
diff --git a/src/model/layer/dropout.h b/src/model/layer/dropout.h
index 5efaf6a..d5da79c 100644
--- a/src/model/layer/dropout.h
+++ b/src/model/layer/dropout.h
@@ -43,7 +43,7 @@ class Dropout : public Layer {
   const std::pair<Tensor, vector<Tensor>> Backward(int flag,
                                                    const Tensor& grad) override;
 
-  void ToDevice(Device* device) override;
+  void ToDevice(std::shared_ptr<Device> device) override;
 
   float dropout_ratio() const {
     return dropout_ratio_;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/test/singa/test_dense.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_dense.cc b/test/singa/test_dense.cc
index 052d0e8..7ed4d33 100644
--- a/test/singa/test_dense.cc
+++ b/test/singa/test_dense.cc
@@ -66,7 +66,6 @@ TEST(Dense, ForwardCpp) {
   dense.set_bias(bias);
 
   singa::Tensor out1 = dense.Forward(singa::kTrain, in);
-  singa::CppCPU host(0, 1);
   const float *outptr1 = out1.data<const float *>();
   EXPECT_EQ(9u, out1.Size());
   for (int i = 0; i < 3; i++)
@@ -76,7 +75,6 @@ TEST(Dense, ForwardCpp) {
                       outptr1[i * 3 + j]);
 }
 #endif  // USE_CBLAS
-#ifdef USE_CUDA
 TEST(Dense, BackwardCpp) {
   Dense dense;
 
@@ -89,7 +87,6 @@ TEST(Dense, BackwardCpp) {
 
   const size_t batchsize = 3, vdim = 2, hdim = 3;
   const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
-  singa::CudaGPU cuda(0, 1);
   singa::Tensor in(singa::Shape{batchsize, vdim});
   in.CopyDataFromHostPtr(x, batchsize * vdim);
 
@@ -114,7 +111,6 @@ TEST(Dense, BackwardCpp) {
   grad.CopyDataFromHostPtr(dy, batchsize * hdim);
 
   const auto ret = dense.Backward(singa::kTrain, grad);
-  singa::CppCPU host(0, 1);
   singa::Tensor in_grad = ret.first;
   singa::Tensor dweight = ret.second.at(0);
   singa::Tensor dbias = ret.second.at(1);
@@ -139,7 +135,6 @@ TEST(Dense, BackwardCpp) {
   for (int i = 0; i < 3; i++)
     EXPECT_FLOAT_EQ((dy[0 * 3 + i] + dy[1 * 3 + i] + dy[2 * 3 + i]), dbiasx[i]);
 }
-#endif
 
 #ifdef USE_CUDA
 TEST(Dense, ForwardCuda) {
@@ -154,25 +149,24 @@ TEST(Dense, ForwardCuda) {
 
   const size_t batchsize = 3, vdim = 2, hdim = 3;
   const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
-  singa::CudaGPU cuda(0, 1);
-  singa::Tensor in(singa::Shape{batchsize, vdim}, &cuda);
+  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
+  singa::Tensor in(singa::Shape{batchsize, vdim}, cuda);
   in.CopyDataFromHostPtr(x, batchsize * vdim);
 
   // set weight
   const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
-  singa::Tensor weight(singa::Shape{hdim, vdim}, &cuda);
+  singa::Tensor weight(singa::Shape{hdim, vdim}, cuda);
   weight.CopyDataFromHostPtr(we, hdim * vdim);
 
   const float bia[hdim] = {1.0f, 1.0f, 1.0f};
-  singa::Tensor bias(singa::Shape{hdim}, &cuda);
+  singa::Tensor bias(singa::Shape{hdim}, cuda);
   bias.CopyDataFromHostPtr(bia, hdim);
 
   dense.set_weight(weight);
   dense.set_bias(bias);
 
   singa::Tensor out1 = dense.Forward(singa::kTrain, in);
-  singa::CppCPU host(0, 1);
-  out1.ToDevice(&host);
+  out1.ToHost();
   const float *outptr1 = out1.data<const float *>();
   EXPECT_EQ(9u, out1.Size());
   for (int i = 0; i < 3; i++)
@@ -193,17 +187,17 @@ TEST(Dense, BackwardCuda) {
 
   const size_t batchsize = 3, vdim = 2, hdim = 3;
   const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
-  singa::CudaGPU cuda(0, 1);
-  singa::Tensor in(singa::Shape{batchsize, vdim}, &cuda);
+  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
+  singa::Tensor in(singa::Shape{batchsize, vdim}, cuda);
   in.CopyDataFromHostPtr(x, batchsize * vdim);
 
   // set weight
   const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
-  singa::Tensor weight(singa::Shape{hdim, vdim}, &cuda);
+  singa::Tensor weight(singa::Shape{hdim, vdim}, cuda);
   weight.CopyDataFromHostPtr(we, hdim * vdim);
 
   const float bia[hdim] = {1.0f, 1.0f, 1.0f};
-  singa::Tensor bias(singa::Shape{hdim}, &cuda);
+  singa::Tensor bias(singa::Shape{hdim}, cuda);
   bias.CopyDataFromHostPtr(bia, hdim);
 
   dense.set_weight(weight);
@@ -214,15 +208,14 @@ TEST(Dense, BackwardCuda) {
   // grad
   const float dy[batchsize * hdim] = {1.0f, 1.0f, 1.0f, 2.0f, 2.0f,
                                       2.0f, 3.0f, 3.0f, 3.0f};
-  singa::Tensor grad(singa::Shape{batchsize, hdim}, &cuda);
+  singa::Tensor grad(singa::Shape{batchsize, hdim}, cuda);
   grad.CopyDataFromHostPtr(dy, batchsize * hdim);
 
   const auto ret = dense.Backward(singa::kTrain, grad);
-  singa::CppCPU host(0, 1);
   singa::Tensor in_grad = ret.first;
   singa::Tensor dweight = ret.second.at(0);
   singa::Tensor dbias = ret.second.at(1);
-  in_grad.ToDevice(&host);
+  in_grad.ToHost();
   const float *dx = in_grad.data<const float *>();
   EXPECT_EQ(6u, in_grad.Size());
   for (int i = 0; i < 3; i++)
@@ -231,7 +224,7 @@ TEST(Dense, BackwardCuda) {
           (dy[i * 3 + 0] * we[0 * 2 + j] + dy[i * 3 + 1] * we[1 * 2 + j] +
            dy[i * 3 + 2] * we[2 * 2 + j]),
           dx[i * 2 + j]);
-  dweight.ToDevice(&host);
+  dweight.ToHost();
   const float *dweightx = dweight.data<const float *>();
   EXPECT_EQ(6u, dweight.Size());
   for (int i = 0; i < 3; i++)
@@ -240,7 +233,7 @@ TEST(Dense, BackwardCuda) {
           (dy[0 * 3 + i] * x[0 * 2 + j] + dy[1 * 3 + i] * x[1 * 2 + j] +
            dy[2 * 3 + i] * x[2 * 2 + j]),
           dweightx[i * 2 + j]);
-  dbias.ToDevice(&host);
+  dbias.ToHost();
   const float *dbiasx = dbias.data<const float *>();
   EXPECT_EQ(3u, dbias.Size());
   for (int i = 0; i < 3; i++)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/test/singa/test_memory.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_memory.cc b/test/singa/test_memory.cc
index f5e464d..90fc99a 100644
--- a/test/singa/test_memory.cc
+++ b/test/singa/test_memory.cc
@@ -75,7 +75,7 @@ TEST(MemPool, CompareCudaCnmem) {
 	singa::CnMemPool cnPool;
 	cnPool.InitPool();
 
-	int numOfTests = 10000;
+	int numOfTests = 5000;
 	int allocSize = 1000000U;
 	struct timeval start,end;
 	double t1,t2;
@@ -93,7 +93,7 @@ TEST(MemPool, CompareCudaCnmem) {
 	
 	t1 = start.tv_sec * 1000 + start.tv_usec/1000;
 	t2 = end.tv_sec * 1000 + end.tv_usec/1000;
-	LOG(INFO) << "cnmem time: " << t2-t1 << " ms" << std::endl;
+	LOG(INFO) << "cnmem memory time: " << t2-t1 << " ms" << std::endl;
 
 	pool = &cudaPool;
 	gettimeofday(&start,NULL);
@@ -106,6 +106,6 @@ TEST(MemPool, CompareCudaCnmem) {
 	
 	t1 = start.tv_sec * 1000 + start.tv_usec/1000;
 	t2 = end.tv_sec * 1000 + end.tv_usec/1000;
-	LOG(INFO) << "cuda time: " << t2-t1 << " ms" << std::endl;
+	LOG(INFO) << "cuda memory time: " << t2-t1 << " ms" << std::endl;
 }
 #endif // USE_CUDA

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/test/singa/test_mse.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_mse.cc b/test/singa/test_mse.cc
index 7c6066e..d2c5125 100644
--- a/test/singa/test_mse.cc
+++ b/test/singa/test_mse.cc
@@ -69,9 +69,9 @@ TEST_F(TestMSE, CppBackward) {
 #ifdef USE_CUDA
 TEST_F(TestMSE, CudaForward) {
   singa::MSE* mse = new singa::MSE();
-  singa::CudaGPU dev;
-  p.ToDevice(&dev);
-  t.ToDevice(&dev);
+  auto dev = std::make_shared<singa::CudaGPU>();
+  p.ToDevice(dev);
+  t.ToDevice(dev);
   Tensor loss = mse->Forward(p, t);
 
   loss.ToHost();
@@ -85,18 +85,15 @@ TEST_F(TestMSE, CudaForward) {
     }
     EXPECT_FLOAT_EQ(ldat[i], 0.5 * l);
   }
-	LOG(INFO) << "Before delete pxxxxxxxxxxxxxxxxxxxxxxxx";
 	p.ToHost();
-	LOG(INFO) << "Before delete tyyyyyyyyyyyyyyyyyyyyyyy";
 	t.ToHost();
-	LOG(INFO) << "terminate-xxxxxxxxxxxxxxxxxx-";
-	delete mse;
 }
+
 TEST_F(TestMSE, CudaBackward) {
   singa::MSE mse;
-  singa::CudaGPU dev;
-  p.ToDevice(&dev);
-  t.ToDevice(&dev);
+  auto dev = std::make_shared<singa::CudaGPU>();
+  p.ToDevice(dev);
+  t.ToDevice(dev);
   mse.Forward(p, t);
   Tensor grad = mse.Backward();
   grad.ToHost();

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/test/singa/test_sgd.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_sgd.cc b/test/singa/test_sgd.cc
index 71ab15e..3b04ab6 100644
--- a/test/singa/test_sgd.cc
+++ b/test/singa/test_sgd.cc
@@ -88,8 +88,8 @@ TEST(SGD, ApplyWithoutMomentumCuda) {
   const float v[4] = {0.1, 0.2, 0.3, 0.4};
   const float g[4] = {0.1, 0.1, 0.1, 0.1};
 
-  singa::CudaGPU dev;
-  singa::Tensor value(singa::Shape{4}, &dev), grad(singa::Shape{4}, &dev);
+	auto dev = std::make_shared<singa::CudaGPU>();
+  singa::Tensor value(singa::Shape{4}, dev), grad(singa::Shape{4}, dev);
   value.CopyDataFromHostPtr(v, 4);
   grad.CopyDataFromHostPtr(g, 4);
 
@@ -124,8 +124,8 @@ TEST(SGD, ApplyWithMomentumCuda) {
   const float v[4] = {0.1, 0.2, 0.3, 0.4};
   const float g[4] = {0.01, 0.02, 0.03, 0.04};
 
-  singa::CudaGPU dev;
-  singa::Tensor value(singa::Shape{4}, &dev), grad(singa::Shape{4}, &dev);
+	auto dev = std::make_shared<singa::CudaGPU>();
+  singa::Tensor value(singa::Shape{4}, dev), grad(singa::Shape{4}, dev);
   value.CopyDataFromHostPtr(v, 4);
   grad.CopyDataFromHostPtr(g, 4);
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/test/singa/test_tensor.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor.cc b/test/singa/test_tensor.cc
index bd039ad..c351174 100644
--- a/test/singa/test_tensor.cc
+++ b/test/singa/test_tensor.cc
@@ -59,10 +59,10 @@ TEST(TensorClass, AsType) {
 
 TEST(TensorClass, ToDevice) {
   Tensor t(Shape{2,3});
-  EXPECT_EQ(static_cast<Device*>(&singa::defaultDevice), t.device());
-  singa::CppCPU *dev = new singa::CppCPU(0, 1);
+  EXPECT_EQ(singa::defaultDevice, t.device());
+  auto dev = std::make_shared<singa::CppCPU>(0, 1);
   t.ToDevice(dev);
-  EXPECT_NE(static_cast<Device*>(&singa::defaultDevice), t.device());
+  EXPECT_NE(singa::defaultDevice, t.device());
 }
 
 TEST(TensorClass, CopyDataFromHostPtr) {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5651383f/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor_math.cc b/test/singa/test_tensor_math.cc
index b18e465..0f998c0 100644
--- a/test/singa/test_tensor_math.cc
+++ b/test/singa/test_tensor_math.cc
@@ -255,10 +255,10 @@ TEST_F(TestTensorMath, SumColumnsCpp) {
 #ifdef USE_CUDA
 TEST_F(TestTensorMath, MultCuda) {
   const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
-  singa::CudaGPU dev;
-  Tensor t(Shape{2, 2}, &dev);
+  auto dev = std::make_shared<singa::CudaGPU>();
+  Tensor t(Shape{2, 2}, dev);
   t.CopyDataFromHostPtr(x, 4);
-  d.ToDevice(&dev);
+  d.ToDevice(dev);
   d.CopyDataFromHostPtr(dat1, 6);
   Tensor C = Mult(d, t);
   C.ToHost();
@@ -274,7 +274,7 @@ TEST_F(TestTensorMath, MultCuda) {
   }
 
   const float y[8] = {1.0f, 2.0f, 3.0f, 4.0f, 1.1f, 2.1f, 3.1f, 4.1f};
-  Tensor s(Shape{4, 2}, &dev);
+  Tensor s(Shape{4, 2}, dev);
   s.CopyDataFromHostPtr(y, 8);
   Tensor D = Mult(d, s.T());
   D.ToHost();
@@ -288,11 +288,11 @@ TEST_F(TestTensorMath, MultCuda) {
       EXPECT_FLOAT_EQ(DPtr[i * 4 + j], tmp);
     }
   }
-  Tensor p(Shape{4, 1}, &dev);
+  Tensor p(Shape{4, 1}, dev);
   p.CopyDataFromHostPtr(x, 4);
-  Tensor q(Shape{1, 4}, &dev);
+  Tensor q(Shape{1, 4}, dev);
   q.SetValue(1.0f);
-  Tensor o(Shape{4, 4}, &dev);
+  Tensor o(Shape{4, 4}, dev);
 
   Mult(p, q, &o);
   o.ToHost();
@@ -308,11 +308,11 @@ TEST_F(TestTensorMath, MultCuda) {
 
 TEST_F(TestTensorMath, AddColumnCuda) {
   const float x[3] = {1.0f, 2.0f, 3.0f};
-  singa::CudaGPU dev;
-  Tensor t(Shape{3}, &dev);
+	auto dev = std::make_shared<singa::CudaGPU>();
+  Tensor t(Shape{3}, dev);
   t.CopyDataFromHostPtr(x, 3);
   d.CopyDataFromHostPtr(dat1, 6);
-  d.ToDevice(&dev);
+  d.ToDevice(dev);
   AddColumn(t, &d);
   d.ToHost();
   const float *xptr = d.data<const float *>();
@@ -326,11 +326,11 @@ TEST_F(TestTensorMath, AddColumnCuda) {
 
 TEST_F(TestTensorMath, SubColumnCuda) {
   const float x[3] = {1.0f, 2.0f, 3.0f};
-  singa::CudaGPU dev;
-  Tensor t(Shape{3}, &dev);
+	auto dev = std::make_shared<singa::CudaGPU>();
+  Tensor t(Shape{3}, dev);
   t.CopyDataFromHostPtr(x, 3);
   d.CopyDataFromHostPtr(dat1, 6);
-  d.ToDevice(&dev);
+  d.ToDevice(dev);
   SubColumn(t, &d);
   d.ToHost();
   const float *xptr = d.data<const float *>();
@@ -357,11 +357,11 @@ TEST_F(TestTensorMath, MultColumnCpp) {
 #ifdef USE_CUDA
 TEST_F(TestTensorMath, MultColumnCuda) {
   const float x[3] = {1.0f, 2.0f, 3.0f};
-  singa::CudaGPU dev;
-  Tensor t(Shape{3}, &dev);
+	auto dev = std::make_shared<singa::CudaGPU>();
+  Tensor t(Shape{3}, dev);
   t.CopyDataFromHostPtr(x, 3);
   d.CopyDataFromHostPtr(dat1, 6);
-  d.ToDevice(&dev);
+  d.ToDevice(dev);
   MultColumn(t, &d);
   d.ToHost();
   const float *xptr = d.data<const float *>();
@@ -373,11 +373,11 @@ TEST_F(TestTensorMath, MultColumnCuda) {
 }
 TEST_F(TestTensorMath, DivColumnCuda) {
   const float x[3] = {1.0f, 2.0f, 3.0f};
-  singa::CudaGPU dev;
-  Tensor t(Shape{3}, &dev);
+	auto dev = std::make_shared<singa::CudaGPU>();
+  Tensor t(Shape{3}, dev);
   t.CopyDataFromHostPtr(x, 3);
   d.CopyDataFromHostPtr(dat1, 6);
-  d.ToDevice(&dev);
+  d.ToDevice(dev);
   DivColumn(t, &d);
   d.ToHost();
   const float *xptr = d.data<const float *>();
@@ -389,11 +389,11 @@ TEST_F(TestTensorMath, DivColumnCuda) {
 }
 TEST_F(TestTensorMath, AddRowCuda) {
   const float x[2] = {1.1f, 2.1f};
-  singa::CudaGPU dev;
-  Tensor t(Shape{2}, &dev);
+	auto dev = std::make_shared<singa::CudaGPU>();
+  Tensor t(Shape{2}, dev);
   t.CopyDataFromHostPtr(x, 2);
   d.CopyDataFromHostPtr(dat1, 6);
-  d.ToDevice(&dev);
+  d.ToDevice(dev);
   AddRow(t, &d);
   d.ToHost();
   const float *xptr = d.data<const float *>();
@@ -405,11 +405,11 @@ TEST_F(TestTensorMath, AddRowCuda) {
 }
 TEST_F(TestTensorMath, SubRowCuda) {
   const float x[2] = {1.1f, 2.1f};
-  singa::CudaGPU dev;
-  Tensor t(Shape{2}, &dev);
+	auto dev = std::make_shared<singa::CudaGPU>();
+  Tensor t(Shape{2}, dev);
   t.CopyDataFromHostPtr(x, 2);
   d.CopyDataFromHostPtr(dat1, 6);
-  d.ToDevice(&dev);
+  d.ToDevice(dev);
   SubRow(t, &d);
   d.ToHost();
   const float *xptr = d.data<const float *>();
@@ -421,11 +421,11 @@ TEST_F(TestTensorMath, SubRowCuda) {
 }
 TEST_F(TestTensorMath, MultRowCuda) {
   const float x[2] = {1.1f, 2.1f};
-  singa::CudaGPU dev;
-  Tensor t(Shape{2}, &dev);
+	auto dev = std::make_shared<singa::CudaGPU>();
+  Tensor t(Shape{2}, dev);
   t.CopyDataFromHostPtr(x, 2);
   d.CopyDataFromHostPtr(dat1, 6);
-  d.ToDevice(&dev);
+  d.ToDevice(dev);
   MultRow(t, &d);
   d.ToHost();
   const float *xptr = d.data<const float *>();
@@ -452,11 +452,11 @@ TEST_F(TestTensorMath, DivRowCpp) {
 #ifdef USE_CUDA
 TEST_F(TestTensorMath, DivRowCuda) {
   const float x[2] = {1.1f, 2.1f};
-  singa::CudaGPU dev;
-  Tensor t(Shape{2}, &dev);
+	auto dev = std::make_shared<singa::CudaGPU>();
+  Tensor t(Shape{2}, dev);
   t.CopyDataFromHostPtr(x, 2);
   d.CopyDataFromHostPtr(dat1, 6);
-  d.ToDevice(&dev);
+  d.ToDevice(dev);
   DivRow(t, &d);
   d.ToHost();
   const float *xptr = d.data<const float *>();
@@ -467,10 +467,10 @@ TEST_F(TestTensorMath, DivRowCuda) {
   }
 }
 TEST_F(TestTensorMath, SumRowsCuda) {
-  singa::CudaGPU dev;
-  Tensor t(Shape{2}, &dev);
+	auto dev = std::make_shared<singa::CudaGPU>();
+  Tensor t(Shape{2}, dev);
   d.CopyDataFromHostPtr(dat1, 6);
-  d.ToDevice(&dev);
+  d.ToDevice(dev);
   SumRows(d, &t);
   t.ToHost();
   const float *tptr = t.data<const float *>();
@@ -484,10 +484,10 @@ TEST_F(TestTensorMath, SumRowsCuda) {
 	d.ToHost();
 }
 TEST_F(TestTensorMath, SumColumnCuda) {
-  singa::CudaGPU dev;
-  Tensor t(Shape{3}, &dev);
+	auto dev = std::make_shared<singa::CudaGPU>();
+  Tensor t(Shape{3}, dev);
   d.CopyDataFromHostPtr(dat1, 6);
-  d.ToDevice(&dev);
+  d.ToDevice(dev);
   SumColumns(d, &t);
   t.ToHost();
   const float *tptr = t.data<const float *>();

[2/6] incubator-singa git commit: SINGA-197 Add CNMem as a submodule in lib/

Posted by wa...@apache.org.

SINGA-197 Add CNMem as a submodule in lib/

Compile CNMem library before building singa. A "libcnmem.so" file is generated in BUILD_PATH/lib.
Now it is compatible with low version of cmake.
The cuda sources will not be compiled when USE_CUDA is OFF.
Move singa_config.h to BUILD_PATH/include/singa.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/ce3e6dc1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/ce3e6dc1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/ce3e6dc1

Branch: refs/heads/dev
Commit: ce3e6dc102964ec6ed782cbdd6cb6ca30b41a4db
Parents: 683b3a7
Author: xiezl <xi...@comp.nus.edu.sg>
Authored: Tue Jun 14 15:45:45 2016 +0800
Committer: xiezl <xi...@comp.nus.edu.sg>
Committed: Tue Jun 14 15:45:45 2016 +0800

----------------------------------------------------------------------
 CMakeLists.txt                      |  6 ++++--
 cmake/Cuda.cmake                    |  1 -
 cmake/Dependencies.cmake            |  4 ++--
 cmake/Protobuf.cmake                | 28 ++++++++++++++++++++++++++++
 include/singa/core/common.h         |  2 +-
 include/singa/core/device.h         |  2 +-
 include/singa/utils/cuda_utils.h    |  2 +-
 src/CMakeLists.txt                  | 19 +++++++++++++------
 src/core/device/cuda_gpu.cc         |  2 +-
 src/core/tensor/math_kernel.cu      |  2 +-
 src/core/tensor/math_kernel.h       |  2 +-
 src/core/tensor/tensor_math_cuda.h  |  2 +-
 src/model/layer/cudnn_activation.cc |  2 +-
 src/model/layer/cudnn_activation.h  |  2 +-
 src/model/layer/cudnn_batchnorm.h   |  2 +-
 src/model/layer/cudnn_convolution.h |  2 +-
 src/model/layer/cudnn_dropout.h     |  2 +-
 src/model/layer/cudnn_lrn.h         |  2 +-
 src/model/layer/cudnn_pooling.h     |  2 +-
 src/model/layer/cudnn_softmax.cc    |  2 +-
 src/model/layer/cudnn_utils.h       |  2 +-
 test/singa/test_cudnn_activation.cc |  2 +-
 test/singa/test_cudnn_softmax.cc    |  2 +-
 test/singa/test_dense.cc            |  2 +-
 test/singa/test_mse.cc              |  2 +-
 test/singa/test_sgd.cc              |  2 +-
 26 files changed, 68 insertions(+), 32 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index fbe3adc..f6240d2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,7 +12,8 @@ IF(UNIX OR APPLE)
 ENDIF()
 
 #message(STATUS "${CMAKE_CXX_FLAGS}")
-SET(SINGA_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/include;${PROJECT_BINARY_DIR}")
+SET(SINGA_INCLUDE_DIR
+    "${CMAKE_SOURCE_DIR}/include;${CMAKE_SOURCE_DIR}/lib/cnmem/lib;${PROJECT_BINARY_DIR}")
 #message(STATUS "include path: ${SINGA_INCLUDE_DIR}")
 INCLUDE_DIRECTORIES(${SINGA_INCLUDE_DIR})
 
@@ -28,7 +29,7 @@ ADD_DEFINITIONS(-DUSE_CMAKE)
 
 CONFIGURE_FILE (
     "${PROJECT_SOURCE_DIR}/cmake/Templates/singa_config.h.in"
-    "${PROJECT_BINARY_DIR}/singa_config.h")
+    "${PROJECT_BINARY_DIR}/include/singa/singa_config.h")
 
 #set(SINGA_CONFIGURE_SRC "${PROJECT_BINARY_DIR}/singa_config.h")
 #LIST(APPEND SRCS ${SINGA_CONFIGURE_SRCS} ${PROJECT_BINARY_DIR}/singa_config.h)
@@ -36,5 +37,6 @@ CONFIGURE_FILE (
 SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
 SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
 
+ADD_SUBDIRECTORY(lib/cnmem)
 ADD_SUBDIRECTORY(src)
 ADD_SUBDIRECTORY(test)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/cmake/Cuda.cmake
----------------------------------------------------------------------
diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake
index a74c82b..e7af7c9 100644
--- a/cmake/Cuda.cmake
+++ b/cmake/Cuda.cmake
@@ -10,7 +10,6 @@ MESSAGE(STATUS "Found cuda_v${CUDA_VERSION}")
 #ADD_DEFINITIONS(-DUSE_CUDA)
 #message(STATUS "linking: ${CUDA_CUDART_LIBRARY} ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}")
 
-
 IF(USE_CUDNN)
 #include(cmake/Modules/Cudnn.cmake)
     FIND_PACKAGE(CUDNN REQUIRED)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/cmake/Dependencies.cmake
----------------------------------------------------------------------
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index d3f0b00..fe178d6 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -6,6 +6,7 @@ FIND_PACKAGE( Protobuf REQUIRED )
 INCLUDE_DIRECTORIES(SYSTEM ${PROTOBUF_INCLUDE_DIR})
 MESSAGE(STATUS "proto libs " ${PROTOBUF_LIBRARIES})
 LIST(APPEND singa_linker_libs ${PROTOBUF_LIBRARIES})
+INCLUDE("cmake/Protobuf.cmake")
 
 IF(USE_LMDB)
     FIND_PACKAGE(LMDB REQUIRED)
@@ -14,10 +15,9 @@ IF(USE_LMDB)
     MESSAGE(STATUS "FOUND lmdb at ${LMDB_INCLUDE_DIR}")
 ENDIF()
 
-IF(NOT CPU_ONLY)
+IF(USE_CUDA)
     INCLUDE("cmake/Cuda.cmake")
 ELSE()
-    SET(USE_CUDA FALSE)
     SET(USE_CUDNN FALSE)
 ENDIF()
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/cmake/Protobuf.cmake
----------------------------------------------------------------------
diff --git a/cmake/Protobuf.cmake b/cmake/Protobuf.cmake
new file mode 100644
index 0000000..c72b2c0
--- /dev/null
+++ b/cmake/Protobuf.cmake
@@ -0,0 +1,28 @@
+# copy from cmake source code
+function(PROTOBUF_GENERATE_PYTHON OUTPUT)
+    if(NOT ARGN)
+        message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called 
+        without any proto files")
+        return()
+    endif(NOT ARGN)
+
+    set(${OUTPUT})
+    foreach(FIL ${ARGN})
+        get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
+        get_filename_component(FIL_WE ${FIL} NAME_WE)
+        get_filename_component(PATH ${FIL} PATH)
+
+        list(APPEND ${OUTPUT} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py")
+
+        add_custom_command(
+            OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py"
+            COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
+            ARGS --python_out ${CMAKE_CURRENT_BINARY_DIR}
+                 --proto_path ${PATH} ${ABS_FIL}
+            DEPENDS ${ABS_FIL}
+            COMMENT "Running Python protocol buffer compiler on ${FIL}" VERBATIM)
+    endforeach()
+    
+    set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE)
+    set(${OUTPUT} ${${OUTPUT}} PARENT_SCOPE)
+endfunction()

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/include/singa/core/common.h
----------------------------------------------------------------------
diff --git a/include/singa/core/common.h b/include/singa/core/common.h
index e6f4c90..e19022e 100644
--- a/include/singa/core/common.h
+++ b/include/singa/core/common.h
@@ -20,7 +20,7 @@
 #define SINGA_CORE_COMMON_H_
 #include <random>
 #include <chrono>
-#include "./singa_config.h"
+#include "./singa/singa_config.h"
 #include "singa/utils/logging.h"
 
 #ifdef USE_CUDA

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/include/singa/core/device.h
----------------------------------------------------------------------
diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index 56eda70..8c95dc7 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -23,7 +23,7 @@
 #include <vector>
 #include <string>
 #include <functional>
-#include "singa_config.h"
+#include "singa/singa_config.h"
 #include "singa/core/common.h"
 #include "singa/core/memory.h"
 #include "singa/core/scheduler.h"

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/include/singa/utils/cuda_utils.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/cuda_utils.h b/include/singa/utils/cuda_utils.h
index 17eb683..24f3eb9 100644
--- a/include/singa/utils/cuda_utils.h
+++ b/include/singa/utils/cuda_utils.h
@@ -2,7 +2,7 @@
 #ifndef SINGA_UTILS_CUDA_UTILS_H_
 #define SINGA_UTILS_CUDA_UTILS_H_
 
-#include "singa_config.h"
+#include "singa/singa_config.h"
 #ifdef USE_CUDA
 #include <cublas_v2.h>
 #include <cuda.h>

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/src/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 952f7ee..4949236 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -31,15 +31,22 @@ AUX_SOURCE_DIRECTORY(core/device core_source)
 AUX_SOURCE_DIRECTORY(core/memory core_source)
 AUX_SOURCE_DIRECTORY(core/scheduler core_source)
 AUX_SOURCE_DIRECTORY(core/tensor core_source)
-FILE(GLOB_RECURSE cuda_source core "*.cu")
-SET(FLAGS_BACKUP ${CMAKE_CXX_FLAGS})
-SET(CMAKE_CXX_FLAGS "")
-CUDA_COMPILE(cuda_objs SHARED ${cuda_source} OPTIONS "-Xcompiler -fPIC")
+IF (USE_CUDA)
+    FILE(GLOB_RECURSE cuda_source core "*.cu")
+    SET(FLAGS_BACKUP ${CMAKE_CXX_FLAGS})
+    SET(CMAKE_CXX_FLAGS "")
+    IF (CMAKE_BUILD_TYPE MATCHES DEBUG)
+        CUDA_COMPILE(cuda_objs SHARED ${cuda_source} 
+            OPTIONS "-Xcompiler -fPIC -G -g")
+    ELSE (CMAKE_BUILD_TYPE MATCHES  DEBUG)
+        CUDA_COMPILE(cuda_objs SHARED ${cuda_source} OPTIONS "-Xcompiler -fPIC")
+    ENDIF (CMAKE_BUILD_TYPE MATCHES DEBUG)
+    include_directories("${CMAKE_CURRENT_SOURCE_DIR}/core/tensor")
+    SET(CMAKE_CXX_FLAGS ${FLAGS_BACKUP})
+ENDIF (USE_CUDA)
 #message(STATUS "FLAGS ${CMAKE_CXX_FLAGS}")
 #message(STATUS "CORE ${cuda_source}")
 #message(STATUS "OBJ ${cuda_objs}")
-include_directories("${CMAKE_CURRENT_SOURCE_DIR}/core/tensor")
-SET(CMAKE_CXX_FLAGS ${FLAGS_BACKUP})
 ADD_LIBRARY(singa_core SHARED ${core_source} ${cuda_objs})
 TARGET_LINK_LIBRARIES(singa_core ${SINGA_LINKER_LIBS})
 LIST(APPEND SINGA_LINKER_LIBS singa_core)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/src/core/device/cuda_gpu.cc
----------------------------------------------------------------------
diff --git a/src/core/device/cuda_gpu.cc b/src/core/device/cuda_gpu.cc
index 5d4e1ed..a47f6fe 100644
--- a/src/core/device/cuda_gpu.cc
+++ b/src/core/device/cuda_gpu.cc
@@ -15,7 +15,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "singa_config.h"
+#include "singa/singa_config.h"
 #ifdef USE_CUDA
 #include <cublas_v2.h>
 #include <cuda.h>

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/src/core/tensor/math_kernel.cu
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.cu b/src/core/tensor/math_kernel.cu
index aed6add..cc84e5c 100644
--- a/src/core/tensor/math_kernel.cu
+++ b/src/core/tensor/math_kernel.cu
@@ -19,7 +19,7 @@
 *
 *************************************************************/
 
-#include "singa_config.h"
+#include "singa/singa_config.h"
 #ifdef USE_CUDA
 #include <cmath>
 #include <algorithm>

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/src/core/tensor/math_kernel.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.h b/src/core/tensor/math_kernel.h
index 5c906a9..4f13a5b 100644
--- a/src/core/tensor/math_kernel.h
+++ b/src/core/tensor/math_kernel.h
@@ -22,7 +22,7 @@
 #define SRC_CORE_TENSOR__MATH_KERNEL_H_
 
 
-#include "singa_config.h"
+#include "singa/singa_config.h"
 #ifdef USE_CUDA
 
 /// TODO(wangwei) Clean the function APIs as commented in tensor_math.h

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index 4a2ba66..6693644 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -18,7 +18,7 @@
 
 #ifndef  SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_
 #define  SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_
-#include "singa_config.h"
+#include "singa/singa_config.h"
 #ifdef USE_CUDA
 #include "./tensor_math.h"
 #include "./math_kernel.h"

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/src/model/layer/cudnn_activation.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_activation.cc b/src/model/layer/cudnn_activation.cc
index 8ecbbc7..b924494 100644
--- a/src/model/layer/cudnn_activation.cc
+++ b/src/model/layer/cudnn_activation.cc
@@ -15,7 +15,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "singa_config.h"
+#include "singa/singa_config.h"
 #ifdef USE_CUDNN
 #include "./cudnn_activation.h"
 #include <cudnn.h>

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/src/model/layer/cudnn_activation.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_activation.h b/src/model/layer/cudnn_activation.h
index b572db7..1483e48 100644
--- a/src/model/layer/cudnn_activation.h
+++ b/src/model/layer/cudnn_activation.h
@@ -18,7 +18,7 @@
 
 #ifndef SINGA_MODEL_LAYER_CUDNN_ACTIVATION_H_
 #define SINGA_MODEL_LAYER_CUDNN_ACTIVATION_H_
-#include "singa_config.h"
+#include "singa/singa_config.h"
 #ifdef USE_CUDNN
 #include <cudnn.h>
 #include <utility>

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/src/model/layer/cudnn_batchnorm.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_batchnorm.h b/src/model/layer/cudnn_batchnorm.h
index 83258d2..8598b65 100644
--- a/src/model/layer/cudnn_batchnorm.h
+++ b/src/model/layer/cudnn_batchnorm.h
@@ -20,7 +20,7 @@
 ************************************************************/
 #ifndef SINGA_MODEL_LAYER_CUDNN_BATCHNORM_H
 #define SINGA_MODEL_LAYER_CUDNN_BATCHNORM_H
-#include "singa_config.h"
+#include "singa/singa_config.h"
 #ifdef USE_CUDNN
 
 #include "batchnorm.h"

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/src/model/layer/cudnn_convolution.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_convolution.h b/src/model/layer/cudnn_convolution.h
index b86c576..152d797 100644
--- a/src/model/layer/cudnn_convolution.h
+++ b/src/model/layer/cudnn_convolution.h
@@ -18,7 +18,7 @@
 
 #ifndef SRC_MODEL_LAYER_CUDNN_CONVOLUTION_H_
 #define SRC_MODEL_LAYER_CUDNN_CONVOLUTION_H_
-#include "singa_config.h"
+#include "singa/singa_config.h"
 #ifdef USE_CUDNN
 #include <string>
 #include <utility>

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/src/model/layer/cudnn_dropout.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_dropout.h b/src/model/layer/cudnn_dropout.h
index 7cb185b..da3d1d2 100644
--- a/src/model/layer/cudnn_dropout.h
+++ b/src/model/layer/cudnn_dropout.h
@@ -18,7 +18,7 @@
 
 #ifndef SRC_MODEL_LAYER_CUDNN_DROPOUT_H_
 #define SRC_MODEL_LAYER_CUDNN_DROPOUT_H_
-#include "singa_config.h"
+#include "singa/singa_config.h"
 #ifdef USE_CUDNN
 // cudnn dropout is added in cudnn 5
 #if CUDNN_VERSION_MAJOR >= 5

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/src/model/layer/cudnn_lrn.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_lrn.h b/src/model/layer/cudnn_lrn.h
index 0f650fe..cd3bcf1 100644
--- a/src/model/layer/cudnn_lrn.h
+++ b/src/model/layer/cudnn_lrn.h
@@ -20,7 +20,7 @@
 ************************************************************/
 #ifndef SINGA_MODEL_LAYER_CUDNN_LRN_H_
 #define SINGA_MODEL_LAYER_CUDNN_LRN_H_
-#include "singa_config.h"
+#include "singa/singa_config.h"
 #ifdef USE_CUDNN
 
 #include "lrn.h"

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/src/model/layer/cudnn_pooling.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_pooling.h b/src/model/layer/cudnn_pooling.h
index 1a38cd5..a5936ad 100644
--- a/src/model/layer/cudnn_pooling.h
+++ b/src/model/layer/cudnn_pooling.h
@@ -18,7 +18,7 @@
 
 #ifndef SRC_MODEL_LAYER_CUDNN_POOLING_H_
 #define SRC_MODEL_LAYER_CUDNN_POOLING_H_
-#include "singa_config.h"
+#include "singa/singa_config.h"
 #ifdef USE_CUDNN
 #include <cudnn.h>
 #include <string>

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/src/model/layer/cudnn_softmax.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_softmax.cc b/src/model/layer/cudnn_softmax.cc
index 85b0c3d..d557062 100644
--- a/src/model/layer/cudnn_softmax.cc
+++ b/src/model/layer/cudnn_softmax.cc
@@ -15,7 +15,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "singa_config.h"
+#include "singa/singa_config.h"
 #include "./cudnn_softmax.h"
 #ifdef USE_CUDNN
 #include <cudnn.h>

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/src/model/layer/cudnn_utils.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_utils.h b/src/model/layer/cudnn_utils.h
index 039a1ac..19c72ec 100644
--- a/src/model/layer/cudnn_utils.h
+++ b/src/model/layer/cudnn_utils.h
@@ -18,7 +18,7 @@
 #ifndef SRC_MODEL_LAYER_CUDNN_UTILS_H_
 #define SRC_MODEL_LAYER_CUDNN_UTILS_H_
 
-#include "singa_config.h"
+#include "singa/singa_config.h"
 #ifdef USE_CUDNN
 
 #include <cudnn.h>

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/test/singa/test_cudnn_activation.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cudnn_activation.cc b/test/singa/test_cudnn_activation.cc
index 892b80b..bed7715 100644
--- a/test/singa/test_cudnn_activation.cc
+++ b/test/singa/test_cudnn_activation.cc
@@ -18,7 +18,7 @@
 * under the License.
 *
 *************************************************************/
-#include "singa_config.h"
+#include "singa/singa_config.h"
 #ifdef USE_CUDNN
 
 #include "singa/proto/core.pb.h"

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/test/singa/test_cudnn_softmax.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cudnn_softmax.cc b/test/singa/test_cudnn_softmax.cc
index 05783e2..e11be87 100644
--- a/test/singa/test_cudnn_softmax.cc
+++ b/test/singa/test_cudnn_softmax.cc
@@ -18,7 +18,7 @@
 * under the License.
 *
 *************************************************************/
-#include "singa_config.h"
+#include "singa/singa_config.h"
 #ifdef USE_CUDNN
 
 #include "../src/model/layer/cudnn_softmax.h"

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/test/singa/test_dense.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_dense.cc b/test/singa/test_dense.cc
index 5050d7e..052d0e8 100644
--- a/test/singa/test_dense.cc
+++ b/test/singa/test_dense.cc
@@ -20,7 +20,7 @@
 *************************************************************/
 #include "../src/model/layer/dense.h"
 #include "gtest/gtest.h"
-#include "singa_config.h"
+#include "singa/singa_config.h"
 
 using singa::Dense;
 TEST(Dense, Setup) {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/test/singa/test_mse.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_mse.cc b/test/singa/test_mse.cc
index ccaab7a..2c02273 100644
--- a/test/singa/test_mse.cc
+++ b/test/singa/test_mse.cc
@@ -23,7 +23,7 @@
 #include "singa/core/tensor.h"
 #include "singa/core/device.h"
 #include "../src/model/loss/mse.h"
-#include "singa_config.h"
+#include "singa/singa_config.h"
 using singa::Tensor;
 class TestMSE : public ::testing::Test {
  protected:

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ce3e6dc1/test/singa/test_sgd.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_sgd.cc b/test/singa/test_sgd.cc
index c0b6e2b..71ab15e 100644
--- a/test/singa/test_sgd.cc
+++ b/test/singa/test_sgd.cc
@@ -21,7 +21,7 @@
 
 #include "gtest/gtest.h"
 #include "singa/model/optimizer.h"
-#include "singa_config.h"
+#include "singa/singa_config.h"
 
 TEST(SGD, ApplyWithoutMomentum) {
   singa::SGD sgd;

[5/6] incubator-singa git commit: Merge PR #165 for CnMeM

Posted by wa...@apache.org.

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_cudnn_softmax.cc
----------------------------------------------------------------------
diff --cc test/singa/test_cudnn_softmax.cc
index e11be87,53ecb2b..d715b33
--- a/test/singa/test_cudnn_softmax.cc
+++ b/test/singa/test_cudnn_softmax.cc
@@@ -33,75 -35,133 +35,129 @@@ TEST(CudnnSoftmax, Setup) 
  
    singa::LayerConf conf;
    singa::SoftmaxConf* softmaxconf = conf.mutable_softmax_conf();
-   softmaxconf->set_axis(2);
- 
-   sft.Setup(conf);
-   sft.InitCudnn(1, singa::kFloat32);
-   EXPECT_EQ(2, sft.Axis());
+   softmaxconf->set_algorithm("fast");
+   sft.Setup(Shape{1}, conf);
+   EXPECT_EQ(CUDNN_SOFTMAX_FAST, sft.Algorithm());
  }
  
- TEST(CudnnSoftmax, Forward) {
-   const float x[] = {1.0f, 2.0f, 0.0f, -2.0f, -3.0f, -1.0};
+ TEST(CudnnSoftmax, Forward1D) {
+   const float x[] = {1.f, 2.f, 0.f, -2.f, -3.f, -1.f};
    size_t n = sizeof(x) / sizeof(float);
--  singa::CudaGPU cuda(0, 1);
-   singa::Tensor in(singa::Shape{n}, &cuda);
++  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
+   singa::Shape shape = {n};
 -  singa::Tensor in(shape, &cuda);
++  singa::Tensor in(shape, cuda);
    in.CopyDataFromHostPtr<float>(x, n);
  
-   int axis = 1;
    CudnnSoftmax sft;
    singa::LayerConf conf;
    singa::SoftmaxConf* softmaxconf = conf.mutable_softmax_conf();
-   softmaxconf->set_axis(axis);
-   sft.Setup(conf);
-   sft.InitCudnn(n, singa::kFloat32);
- 
+   softmaxconf->set_algorithm("accurate");
+   sft.Setup(Shape{1}, conf);
    singa::Tensor out = sft.Forward(singa::kTrain, in);
--  singa::CppCPU host(0, 1);
--  out.ToDevice(&host);
-   const float* yptr = out.data<const float*>();
++  out.ToHost();
+   const float* yptr = out.data<float>();
    EXPECT_EQ(n, out.Size());
  
    float* y = new float[n];
    float sigma = 0.f;
    for (size_t i = 0; i < n; i++) sigma += exp(x[i]);
    for (size_t i = 0; i < n; i++) y[i] = exp(x[i]) / sigma;
-   EXPECT_FLOAT_EQ(y[0], yptr[0]);
-   EXPECT_FLOAT_EQ(y[4], yptr[4]);
-   EXPECT_FLOAT_EQ(y[5], yptr[5]);
+   for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(y[i], yptr[i]);
  }
  
- TEST(CudnnSoftmax, Backward) {
-   const float x[] = {1.0f, 2.0f, 3.0f, -2.0f, -3.0f, -1.0};
+ TEST(CudnnSoftmax, Backward1D) {
+   const float x[] = {1.f, 2.f, 3.f, -2.f, -3.f, -1.f};
    size_t n = sizeof(x) / sizeof(float);
--  singa::CudaGPU cuda(0, 1);
-   singa::Tensor in(singa::Shape{n}, &cuda);
+   singa::Shape shape = {n};
 -  singa::Tensor in(shape, &cuda);
++  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++  singa::Tensor in(shape, cuda);
    in.CopyDataFromHostPtr<float>(x, n);
  
-   int axis = 1;
    CudnnSoftmax sft;
    singa::LayerConf conf;
    singa::SoftmaxConf* softmaxconf = conf.mutable_softmax_conf();
-   softmaxconf->set_axis(axis);
-   sft.Setup(conf);
+   softmaxconf->set_algorithm("accurate");
+   sft.Setup(Shape{1}, conf);
+ 
    singa::Tensor out = sft.Forward(singa::kTrain, in);
--  singa::CppCPU host(0, 1);
--  out.ToDevice(&host);
-   const float* yptr = out.data<const float*>();
++  out.ToHost();
+   const float* yptr = out.data<float>();
  
-   const float grad[] = {2.0f, -3.0f, 1.0f, 3.0f, -1.0f, -2.0};
-   singa::Tensor out_diff(singa::Shape{n}, &cuda);
+   const float grad[] = {2.f, -3.f, 1.f, 3.f, -1.f, -2.f};
 -  singa::Tensor out_diff(shape, &cuda);
++  singa::Tensor out_diff(shape, cuda);
    out_diff.CopyDataFromHostPtr<float>(grad, n);
    const auto ret = sft.Backward(singa::kTrain, out_diff);
    singa::Tensor in_diff = ret.first;
--  in_diff.ToDevice(&host);
-   const float* xptr = in_diff.data<const float*>();
++  in_diff.ToHost();
+   const float* xptr = in_diff.data<float>();
  
    float* dx = new float[n];
    float sigma = 0.f;
    for (size_t i = 0; i < n; i++) sigma += grad[i] * yptr[i];
    for (size_t i = 0; i < n; i++) dx[i] = (grad[i] - sigma) * yptr[i];
-   EXPECT_FLOAT_EQ(dx[0], xptr[0]);
-   EXPECT_FLOAT_EQ(dx[4], xptr[4]);
-   EXPECT_FLOAT_EQ(dx[5], xptr[5]);
+   for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(dx[i], xptr[i]);
+ }
+ 
+ TEST(CudnnSoftmax, Forward2D) {
+   const float x[] = {1.f, 2.f, 0.f, -2.f, -3.f, -1.f};
+   size_t n = sizeof(x) / sizeof(float);
+   size_t batch = 2, c = 3;
 -  singa::CudaGPU cuda(0, 1);
+   singa::Shape shape = {batch, c};
 -  singa::Tensor in(shape, &cuda);
++  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++  singa::Tensor in(shape, cuda);
+   in.CopyDataFromHostPtr<float>(x, n);
+ 
+   CudnnSoftmax sft;
+   singa::LayerConf conf;
+   singa::SoftmaxConf* softmaxconf = conf.mutable_softmax_conf();
+   softmaxconf->set_algorithm("accurate");
+   sft.Setup(Shape{c}, conf);
+ 
+   singa::Tensor out = sft.Forward(singa::kTrain, in);
 -  singa::CppCPU host(0, 1);
 -  out.ToDevice(&host);
++  out.ToHost();
+   const float* yptr = out.data<float>();
+   EXPECT_EQ(n, out.Size());
+ 
+   float* y = new float[n];
+   float* sigma = new float[batch];
+   for (size_t i = 0; i < batch; i++) sigma[i] = 0.f;
+   for (size_t i = 0; i < n; i++) sigma[i / c] += exp(x[i]);
+   for (size_t i = 0; i < n; i++) y[i] = exp(x[i]) / sigma[i / c];
+   for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(y[i], yptr[i]);
+ }
+ 
+ TEST(CudnnSoftmax, Backward2D) {
+   const float x[] = {1.f, 2.f, 3.f, -2.f, -3.f, -1.f};
+   size_t n = sizeof(x) / sizeof(float);
+   size_t batch = 2, c = 3;
 -  singa::CudaGPU cuda(0, 1);
++  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
+   singa::Shape shape = {batch, c};
 -  singa::Tensor in(shape, &cuda);
++  singa::Tensor in(shape, cuda);
+   in.CopyDataFromHostPtr<float>(x, n);
+ 
+   CudnnSoftmax sft;
+   singa::LayerConf conf;
+   singa::SoftmaxConf* softmaxconf = conf.mutable_softmax_conf();
+   softmaxconf->set_algorithm("accurate");
+   sft.Setup(Shape{c}, conf);
+ 
+   singa::Tensor out = sft.Forward(singa::kTrain, in);
 -  singa::CppCPU host(0, 1);
 -  out.ToDevice(&host);
++  out.ToHost();
+   const float* yptr = out.data<float>();
+ 
+   const float grad[] = {2.f, -3.f, 1.f, 3.f, -1.f, -2.f};
 -  singa::Tensor out_diff(shape, &cuda);
++  singa::Tensor out_diff(shape, cuda);
+   out_diff.CopyDataFromHostPtr<float>(grad, n);
+   const auto ret = sft.Backward(singa::kTrain, out_diff);
+   singa::Tensor in_diff = ret.first;
 -  in_diff.ToDevice(&host);
++  in_diff.ToHost();
+   const float* xptr = in_diff.data<float>();
+ 
+   float* dx = new float[n];
+   float* sigma = new float[batch];
+   for (size_t i = 0; i < batch; i++) sigma[i] = 0.f;
+   for (size_t i = 0; i < n; i++) sigma[i / c] += grad[i] * yptr[i];
+   for (size_t i = 0; i < n; i++) dx[i] = (grad[i] - sigma[i / c]) * yptr[i];
+   for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(dx[i], xptr[i]);
  }
  #endif  // USE_CUDNN

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_dense.cc
----------------------------------------------------------------------
diff --cc test/singa/test_dense.cc
index 7ed4d33,a5fd960..363fb6e
--- a/test/singa/test_dense.cc
+++ b/test/singa/test_dense.cc
@@@ -1,242 -1,245 +1,238 @@@
--/************************************************************
--*
--* Licensed to the Apache Software Foundation (ASF) under one
--* or more contributor license agreements.  See the NOTICE file
--* distributed with this work for additional information
--* regarding copyright ownership.  The ASF licenses this file
--* to you under the Apache License, Version 2.0 (the
--* "License"); you may not use this file except in compliance
--* with the License.  You may obtain a copy of the License at
--*
--*   http://www.apache.org/licenses/LICENSE-2.0
--*
--* Unless required by applicable law or agreed to in writing,
--* software distributed under the License is distributed on an
--* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
--* KIND, either express or implied.  See the License for the
--* specific language governing permissions and limitations
--* under the License.
--*
--*************************************************************/
--#include "../src/model/layer/dense.h"
--#include "gtest/gtest.h"
- #include "singa/singa_config.h"
 -#include "singa_config.h"
--
--using singa::Dense;
 -using singa::Shape;
--TEST(Dense, Setup) {
--  Dense dense;
--  EXPECT_EQ("Dense", dense.layer_type());
--
--  singa::LayerConf conf;
--  singa::DenseConf *denseconf = conf.mutable_dense_conf();
-   denseconf->set_num_input(2);
--  denseconf->set_num_output(3);
--  denseconf->set_transpose(false);
-   dense.Setup(conf);
 -  dense.Setup(Shape{2}, conf);
--
--  EXPECT_EQ(3u, dense.num_output());
--  EXPECT_EQ(2u, dense.num_input());
--}
--#ifdef USE_CBLAS
--TEST(Dense, ForwardCpp) {
--  Dense dense;
--
--  singa::LayerConf conf;
--  singa::DenseConf *denseconf = conf.mutable_dense_conf();
-   denseconf->set_num_input(2);
--  denseconf->set_num_output(3);
--  denseconf->set_transpose(false);
-   dense.Setup(conf);
 -  dense.Setup(Shape{2}, conf);
--
--  const size_t batchsize = 3, vdim = 2, hdim = 3;
--  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
--  singa::Tensor in(singa::Shape{batchsize, vdim});
--  in.CopyDataFromHostPtr(x, batchsize * vdim);
--
--  // set weight
--  const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
--  singa::Tensor weight(singa::Shape{hdim, vdim});
--  weight.CopyDataFromHostPtr(we, hdim * vdim);
--
--  const float bia[hdim] = {1.0f, 1.0f, 1.0f};
--  singa::Tensor bias(singa::Shape{hdim});
--  bias.CopyDataFromHostPtr(bia, hdim);
--
--  dense.set_weight(weight);
--  dense.set_bias(bias);
--
--  singa::Tensor out1 = dense.Forward(singa::kTrain, in);
-   const float *outptr1 = out1.data<const float *>();
 -  singa::CppCPU host(0, 1);
 -  const float *outptr1 = out1.data<float>();
--  EXPECT_EQ(9u, out1.Size());
--  for (int i = 0; i < 3; i++)
--    for (int j = 0; j < 3; j++)
--      EXPECT_FLOAT_EQ((x[i * 2 + 0] * we[j * 2 + 0] +
--                       x[i * 2 + 1] * we[j * 2 + 1] + bia[j]),
--                      outptr1[i * 3 + j]);
--}
--#endif  // USE_CBLAS
 -#ifdef USE_CUDA
--TEST(Dense, BackwardCpp) {
--  Dense dense;
--
--  singa::LayerConf conf;
--  singa::DenseConf *denseconf = conf.mutable_dense_conf();
-   denseconf->set_num_input(2);
--  denseconf->set_num_output(3);
--  denseconf->set_transpose(false);
-   dense.Setup(conf);
 -  dense.Setup(Shape{2}, conf);
--
--  const size_t batchsize = 3, vdim = 2, hdim = 3;
--  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
 -  singa::CudaGPU cuda(0, 1);
--  singa::Tensor in(singa::Shape{batchsize, vdim});
--  in.CopyDataFromHostPtr(x, batchsize * vdim);
--
--  // set weight
--  const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
--  singa::Tensor weight(singa::Shape{hdim, vdim});
--  weight.CopyDataFromHostPtr(we, hdim * vdim);
--
--  const float bia[hdim] = {1.0f, 1.0f, 1.0f};
--  singa::Tensor bias(singa::Shape{hdim});
--  bias.CopyDataFromHostPtr(bia, hdim);
--
--  dense.set_weight(weight);
--  dense.set_bias(bias);
--
--  singa::Tensor out1 = dense.Forward(singa::kTrain, in);
--
--  // grad
--  const float dy[batchsize * hdim] = {1.0f, 1.0f, 1.0f, 2.0f, 2.0f,
--                                      2.0f, 3.0f, 3.0f, 3.0f};
--  singa::Tensor grad(singa::Shape{batchsize, hdim});
--  grad.CopyDataFromHostPtr(dy, batchsize * hdim);
--
--  const auto ret = dense.Backward(singa::kTrain, grad);
 -  singa::CppCPU host(0, 1);
--  singa::Tensor in_grad = ret.first;
--  singa::Tensor dweight = ret.second.at(0);
--  singa::Tensor dbias = ret.second.at(1);
-   const float *dx = in_grad.data<const float *>();
 -  const float *dx = in_grad.data<float>();
--  EXPECT_EQ(6u, in_grad.Size());
--  for (int i = 0; i < 3; i++)
--    for (int j = 0; j < 2; j++)
--      EXPECT_FLOAT_EQ(
--          (dy[i * 3 + 0] * we[0 * 2 + j] + dy[i * 3 + 1] * we[1 * 2 + j] +
--           dy[i * 3 + 2] * we[2 * 2 + j]),
--          dx[i * 2 + j]);
-   const float *dweightx = dweight.data<const float *>();
 -  const float *dweightx = dweight.data<float>();
--  EXPECT_EQ(6u, dweight.Size());
--  for (int i = 0; i < 3; i++)
--    for (int j = 0; j < 2; j++)
--      EXPECT_FLOAT_EQ(
--          (dy[0 * 3 + i] * x[0 * 2 + j] + dy[1 * 3 + i] * x[1 * 2 + j] +
--           dy[2 * 3 + i] * x[2 * 2 + j]),
--          dweightx[i * 2 + j]);
-   const float *dbiasx = dbias.data<const float *>();
 -  const float *dbiasx = dbias.data<float>();
--  EXPECT_EQ(3u, dbias.Size());
--  for (int i = 0; i < 3; i++)
--    EXPECT_FLOAT_EQ((dy[0 * 3 + i] + dy[1 * 3 + i] + dy[2 * 3 + i]), dbiasx[i]);
--}
 -#endif
--
--#ifdef USE_CUDA
--TEST(Dense, ForwardCuda) {
--  Dense dense;
--
--  singa::LayerConf conf;
--  singa::DenseConf *denseconf = conf.mutable_dense_conf();
-   denseconf->set_num_input(2);
--  denseconf->set_num_output(3);
--  denseconf->set_transpose(false);
-   dense.Setup(conf);
 -  dense.Setup(Shape{2}, conf);
--
--  const size_t batchsize = 3, vdim = 2, hdim = 3;
--  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
-   auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
-   singa::Tensor in(singa::Shape{batchsize, vdim}, cuda);
 -  singa::CudaGPU cuda(0, 1);
 -  singa::Tensor in(singa::Shape{batchsize, vdim}, &cuda);
--  in.CopyDataFromHostPtr(x, batchsize * vdim);
--
--  // set weight
--  const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
-   singa::Tensor weight(singa::Shape{hdim, vdim}, cuda);
 -  singa::Tensor weight(singa::Shape{hdim, vdim}, &cuda);
--  weight.CopyDataFromHostPtr(we, hdim * vdim);
--
--  const float bia[hdim] = {1.0f, 1.0f, 1.0f};
-   singa::Tensor bias(singa::Shape{hdim}, cuda);
 -  singa::Tensor bias(singa::Shape{hdim}, &cuda);
--  bias.CopyDataFromHostPtr(bia, hdim);
--
--  dense.set_weight(weight);
--  dense.set_bias(bias);
--
--  singa::Tensor out1 = dense.Forward(singa::kTrain, in);
-   out1.ToHost();
-   const float *outptr1 = out1.data<const float *>();
 -  singa::CppCPU host(0, 1);
 -  out1.ToDevice(&host);
 -  const float *outptr1 = out1.data<float>();
--  EXPECT_EQ(9u, out1.Size());
--  for (int i = 0; i < 3; i++)
--    for (int j = 0; j < 3; j++)
--      EXPECT_FLOAT_EQ((x[i * 2 + 0] * we[j * 2 + 0] +
--                       x[i * 2 + 1] * we[j * 2 + 1] + bia[j]),
--                      outptr1[i * 3 + j]);
--}
--TEST(Dense, BackwardCuda) {
--  Dense dense;
--
--  singa::LayerConf conf;
--  singa::DenseConf *denseconf = conf.mutable_dense_conf();
-   denseconf->set_num_input(2);
--  denseconf->set_num_output(3);
--  denseconf->set_transpose(false);
-   dense.Setup(conf);
 -  dense.Setup(Shape{2}, conf);
--
--  const size_t batchsize = 3, vdim = 2, hdim = 3;
--  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
-   auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
-   singa::Tensor in(singa::Shape{batchsize, vdim}, cuda);
 -  singa::CudaGPU cuda(0, 1);
 -  singa::Tensor in(singa::Shape{batchsize, vdim}, &cuda);
--  in.CopyDataFromHostPtr(x, batchsize * vdim);
--
--  // set weight
--  const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
-   singa::Tensor weight(singa::Shape{hdim, vdim}, cuda);
 -  singa::Tensor weight(singa::Shape{hdim, vdim}, &cuda);
--  weight.CopyDataFromHostPtr(we, hdim * vdim);
--
--  const float bia[hdim] = {1.0f, 1.0f, 1.0f};
-   singa::Tensor bias(singa::Shape{hdim}, cuda);
 -  singa::Tensor bias(singa::Shape{hdim}, &cuda);
--  bias.CopyDataFromHostPtr(bia, hdim);
--
--  dense.set_weight(weight);
--  dense.set_bias(bias);
--
--  singa::Tensor out1 = dense.Forward(singa::kTrain, in);
--
--  // grad
--  const float dy[batchsize * hdim] = {1.0f, 1.0f, 1.0f, 2.0f, 2.0f,
--                                      2.0f, 3.0f, 3.0f, 3.0f};
-   singa::Tensor grad(singa::Shape{batchsize, hdim}, cuda);
 -  singa::Tensor grad(singa::Shape{batchsize, hdim}, &cuda);
--  grad.CopyDataFromHostPtr(dy, batchsize * hdim);
--
--  const auto ret = dense.Backward(singa::kTrain, grad);
 -  singa::CppCPU host(0, 1);
--  singa::Tensor in_grad = ret.first;
--  singa::Tensor dweight = ret.second.at(0);
--  singa::Tensor dbias = ret.second.at(1);
-   in_grad.ToHost();
-   const float *dx = in_grad.data<const float *>();
 -  in_grad.ToDevice(&host);
 -  const float *dx = in_grad.data<float>();
--  EXPECT_EQ(6u, in_grad.Size());
--  for (int i = 0; i < 3; i++)
--    for (int j = 0; j < 2; j++)
--      EXPECT_FLOAT_EQ(
--          (dy[i * 3 + 0] * we[0 * 2 + j] + dy[i * 3 + 1] * we[1 * 2 + j] +
--           dy[i * 3 + 2] * we[2 * 2 + j]),
--          dx[i * 2 + j]);
-   dweight.ToHost();
-   const float *dweightx = dweight.data<const float *>();
 -  dweight.ToDevice(&host);
 -  const float *dweightx = dweight.data<float>();
--  EXPECT_EQ(6u, dweight.Size());
--  for (int i = 0; i < 3; i++)
--    for (int j = 0; j < 2; j++)
--      EXPECT_FLOAT_EQ(
--          (dy[0 * 3 + i] * x[0 * 2 + j] + dy[1 * 3 + i] * x[1 * 2 + j] +
--           dy[2 * 3 + i] * x[2 * 2 + j]),
--          dweightx[i * 2 + j]);
-   dbias.ToHost();
-   const float *dbiasx = dbias.data<const float *>();
 -  dbias.ToDevice(&host);
 -  const float *dbiasx = dbias.data<float>();
--  EXPECT_EQ(3u, dbias.Size());
--  for (int i = 0; i < 3; i++)
--    EXPECT_FLOAT_EQ((dy[0 * 3 + i] + dy[1 * 3 + i] + dy[2 * 3 + i]), dbiasx[i]);
--}
--#endif
++/************************************************************
++*
++* Licensed to the Apache Software Foundation (ASF) under one
++* or more contributor license agreements.  See the NOTICE file
++* distributed with this work for additional information
++* regarding copyright ownership.  The ASF licenses this file
++* to you under the Apache License, Version 2.0 (the
++* "License"); you may not use this file except in compliance
++* with the License.  You may obtain a copy of the License at
++*
++*   http://www.apache.org/licenses/LICENSE-2.0
++*
++* Unless required by applicable law or agreed to in writing,
++* software distributed under the License is distributed on an
++* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
++* KIND, either express or implied.  See the License for the
++* specific language governing permissions and limitations
++* under the License.
++*
++*************************************************************/
++#include "../src/model/layer/dense.h"
++#include "gtest/gtest.h"
++#include "singa/singa_config.h"
++
++using singa::Dense;
++using singa::Shape;
++TEST(Dense, Setup) {
++  Dense dense;
++  EXPECT_EQ("Dense", dense.layer_type());
++
++  singa::LayerConf conf;
++  singa::DenseConf *denseconf = conf.mutable_dense_conf();
++  denseconf->set_num_output(3);
++  denseconf->set_transpose(false);
++  dense.Setup(Shape{2}, conf);
++
++  EXPECT_EQ(3u, dense.num_output());
++  EXPECT_EQ(2u, dense.num_input());
++}
++#ifdef USE_CBLAS
++TEST(Dense, ForwardCpp) {
++  Dense dense;
++
++  singa::LayerConf conf;
++  singa::DenseConf *denseconf = conf.mutable_dense_conf();
++  denseconf->set_num_output(3);
++  denseconf->set_transpose(false);
++  dense.Setup(Shape{2}, conf);
++
++  const size_t batchsize = 3, vdim = 2, hdim = 3;
++  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
++  singa::Tensor in(singa::Shape{batchsize, vdim});
++  in.CopyDataFromHostPtr(x, batchsize * vdim);
++
++  // set weight
++  const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
++  singa::Tensor weight(singa::Shape{hdim, vdim});
++  weight.CopyDataFromHostPtr(we, hdim * vdim);
++
++  const float bia[hdim] = {1.0f, 1.0f, 1.0f};
++  singa::Tensor bias(singa::Shape{hdim});
++  bias.CopyDataFromHostPtr(bia, hdim);
++
++  dense.set_weight(weight);
++  dense.set_bias(bias);
++
++  singa::Tensor out1 = dense.Forward(singa::kTrain, in);
++  const float *outptr1 = out1.data<float>();
++  EXPECT_EQ(9u, out1.Size());
++  for (int i = 0; i < 3; i++)
++    for (int j = 0; j < 3; j++)
++      EXPECT_FLOAT_EQ((x[i * 2 + 0] * we[j * 2 + 0] +
++                       x[i * 2 + 1] * we[j * 2 + 1] + bia[j]),
++                      outptr1[i * 3 + j]);
++}
++TEST(Dense, BackwardCpp) {
++  Dense dense;
++
++  singa::LayerConf conf;
++  singa::DenseConf *denseconf = conf.mutable_dense_conf();
++  denseconf->set_num_output(3);
++  denseconf->set_transpose(false);
++  dense.Setup(Shape{2}, conf);
++
++  const size_t batchsize = 3, vdim = 2, hdim = 3;
++  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
++  singa::Tensor in(singa::Shape{batchsize, vdim});
++  in.CopyDataFromHostPtr(x, batchsize * vdim);
++
++  // set weight
++  const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
++  singa::Tensor weight(singa::Shape{hdim, vdim});
++  weight.CopyDataFromHostPtr(we, hdim * vdim);
++
++  const float bia[hdim] = {1.0f, 1.0f, 1.0f};
++  singa::Tensor bias(singa::Shape{hdim});
++  bias.CopyDataFromHostPtr(bia, hdim);
++
++  dense.set_weight(weight);
++  dense.set_bias(bias);
++
++  singa::Tensor out1 = dense.Forward(singa::kTrain, in);
++
++  // grad
++  const float dy[batchsize * hdim] = {1.0f, 1.0f, 1.0f, 2.0f, 2.0f,
++                                      2.0f, 3.0f, 3.0f, 3.0f};
++  singa::Tensor grad(singa::Shape{batchsize, hdim});
++  grad.CopyDataFromHostPtr(dy, batchsize * hdim);
++
++  const auto ret = dense.Backward(singa::kTrain, grad);
++  singa::Tensor in_grad = ret.first;
++  singa::Tensor dweight = ret.second.at(0);
++  singa::Tensor dbias = ret.second.at(1);
++  const float *dx = in_grad.data<float>();
++  EXPECT_EQ(6u, in_grad.Size());
++  for (int i = 0; i < 3; i++)
++    for (int j = 0; j < 2; j++)
++      EXPECT_FLOAT_EQ(
++          (dy[i * 3 + 0] * we[0 * 2 + j] + dy[i * 3 + 1] * we[1 * 2 + j] +
++           dy[i * 3 + 2] * we[2 * 2 + j]),
++          dx[i * 2 + j]);
++  const float *dweightx = dweight.data<float>();
++  EXPECT_EQ(6u, dweight.Size());
++  for (int i = 0; i < 3; i++)
++    for (int j = 0; j < 2; j++)
++      EXPECT_FLOAT_EQ(
++          (dy[0 * 3 + i] * x[0 * 2 + j] + dy[1 * 3 + i] * x[1 * 2 + j] +
++           dy[2 * 3 + i] * x[2 * 2 + j]),
++          dweightx[i * 2 + j]);
++  const float *dbiasx = dbias.data<float>();
++  EXPECT_EQ(3u, dbias.Size());
++  for (int i = 0; i < 3; i++)
++    EXPECT_FLOAT_EQ((dy[0 * 3 + i] + dy[1 * 3 + i] + dy[2 * 3 + i]), dbiasx[i]);
++}
++#endif  // USE_CBLAS
++
++#ifdef USE_CUDA
++TEST(Dense, ForwardCuda) {
++  Dense dense;
++
++  singa::LayerConf conf;
++  singa::DenseConf *denseconf = conf.mutable_dense_conf();
++  denseconf->set_num_output(3);
++  denseconf->set_transpose(false);
++  dense.Setup(Shape{2}, conf);
++
++  const size_t batchsize = 3, vdim = 2, hdim = 3;
++  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
++  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++  singa::Tensor in(singa::Shape{batchsize, vdim}, cuda);
++  in.CopyDataFromHostPtr(x, batchsize * vdim);
++
++  // set weight
++  const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
++  singa::Tensor weight(singa::Shape{hdim, vdim}, cuda);
++  weight.CopyDataFromHostPtr(we, hdim * vdim);
++
++  const float bia[hdim] = {1.0f, 1.0f, 1.0f};
++  singa::Tensor bias(singa::Shape{hdim}, cuda);
++  bias.CopyDataFromHostPtr(bia, hdim);
++
++  dense.set_weight(weight);
++  dense.set_bias(bias);
++
++  singa::Tensor out1 = dense.Forward(singa::kTrain, in);
++  out1.ToHost();
++  const float *outptr1 = out1.data<float>();
++  EXPECT_EQ(9u, out1.Size());
++  for (int i = 0; i < 3; i++)
++    for (int j = 0; j < 3; j++)
++      EXPECT_FLOAT_EQ((x[i * 2 + 0] * we[j * 2 + 0] +
++                       x[i * 2 + 1] * we[j * 2 + 1] + bia[j]),
++                      outptr1[i * 3 + j]);
++}
++TEST(Dense, BackwardCuda) {
++  Dense dense;
++
++  singa::LayerConf conf;
++  singa::DenseConf *denseconf = conf.mutable_dense_conf();
++  denseconf->set_num_output(3);
++  denseconf->set_transpose(false);
++  dense.Setup(Shape{2}, conf);
++
++  const size_t batchsize = 3, vdim = 2, hdim = 3;
++  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
++  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++  singa::Tensor in(singa::Shape{batchsize, vdim}, cuda);
++  in.CopyDataFromHostPtr(x, batchsize * vdim);
++
++  // set weight
++  const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
++  singa::Tensor weight(singa::Shape{hdim, vdim}, cuda);
++  weight.CopyDataFromHostPtr(we, hdim * vdim);
++
++  const float bia[hdim] = {1.0f, 1.0f, 1.0f};
++  singa::Tensor bias(singa::Shape{hdim}, cuda);
++  bias.CopyDataFromHostPtr(bia, hdim);
++
++  dense.set_weight(weight);
++  dense.set_bias(bias);
++
++  singa::Tensor out1 = dense.Forward(singa::kTrain, in);
++
++  // grad
++  const float dy[batchsize * hdim] = {1.0f, 1.0f, 1.0f, 2.0f, 2.0f,
++                                      2.0f, 3.0f, 3.0f, 3.0f};
++  singa::Tensor grad(singa::Shape{batchsize, hdim}, cuda);
++  grad.CopyDataFromHostPtr(dy, batchsize * hdim);
++
++  const auto ret = dense.Backward(singa::kTrain, grad);
++  singa::Tensor in_grad = ret.first;
++  singa::Tensor dweight = ret.second.at(0);
++  singa::Tensor dbias = ret.second.at(1);
++  in_grad.ToHost();
++  const float *dx = in_grad.data<float>();
++  EXPECT_EQ(6u, in_grad.Size());
++  for (int i = 0; i < 3; i++)
++    for (int j = 0; j < 2; j++)
++      EXPECT_FLOAT_EQ(
++          (dy[i * 3 + 0] * we[0 * 2 + j] + dy[i * 3 + 1] * we[1 * 2 + j] +
++           dy[i * 3 + 2] * we[2 * 2 + j]),
++          dx[i * 2 + j]);
++  dweight.ToHost();
++  const float *dweightx = dweight.data<float>();
++  EXPECT_EQ(6u, dweight.Size());
++  for (int i = 0; i < 3; i++)
++    for (int j = 0; j < 2; j++)
++      EXPECT_FLOAT_EQ(
++          (dy[0 * 3 + i] * x[0 * 2 + j] + dy[1 * 3 + i] * x[1 * 2 + j] +
++           dy[2 * 3 + i] * x[2 * 2 + j]),
++          dweightx[i * 2 + j]);
++  dbias.ToHost();
++  const float *dbiasx = dbias.data<float>();
++  EXPECT_EQ(3u, dbias.Size());
++  for (int i = 0; i < 3; i++)
++    EXPECT_FLOAT_EQ((dy[0 * 3 + i] + dy[1 * 3 + i] + dy[2 * 3 + i]), dbiasx[i]);
++}
++#endif

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_flatten.cc
----------------------------------------------------------------------
diff --cc test/singa/test_flatten.cc
index 0000000,2a77272..25e00c4
mode 000000,100644..100644
--- a/test/singa/test_flatten.cc
+++ b/test/singa/test_flatten.cc
@@@ -1,0 -1,145 +1,143 @@@
+ /************************************************************
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *************************************************************/
+ 
+ #include "../src/model/layer/flatten.h"
+ #include "gtest/gtest.h"
+ 
+ using singa::Flatten;
+ using singa::Shape;
+ TEST(Flatten, Setup) {
+   Flatten flt;
+   EXPECT_EQ("Flatten", flt.layer_type());
+ 
+   singa::LayerConf conf;
+   singa::FlattenConf *flattenconf = conf.mutable_flatten_conf();
+   flattenconf->set_axis(1);
+ 
+   flt.Setup(Shape{2}, conf);
+   EXPECT_EQ(1, flt.Axis());
+ }
+ 
+ TEST(Flatten, ForwardCPU) {
+   const float x[] = {1.f,  2.f,   3.f, -2.f,  -3.f, -4.f,
+                      1.5f, -1.5f, 0.f, -0.5f, -2.f, -1.f};
+   size_t n = sizeof(x) / sizeof(float);
+   singa::Shape s = {2, 1, 3, 2};
+   singa::Tensor in(s);
+   in.CopyDataFromHostPtr<float>(x, n);
+ 
+   int axis = 3;
+   Flatten flt;
+   singa::LayerConf conf;
+   singa::FlattenConf *flattenconf = conf.mutable_flatten_conf();
+   flattenconf->set_axis(axis);
+   flt.Setup(Shape{1, 3, 2}, conf);
+ 
+   singa::Tensor out = flt.Forward(singa::kTrain, in);
+   EXPECT_EQ(n, out.Size());
+   EXPECT_EQ(6u, out.shape(0));
+   EXPECT_EQ(2u, out.shape(1));
+   const float *yptr = out.data<float>();
+   for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(x[i], yptr[i]);
+ }
+ 
+ TEST(Flatten, BackwardCPU) {
+   // directly use input as the output_grad for backward
+   // note that only the shape of input really matters
+   const float dy[] = {1.f,  2.f,   3.f, -2.f,  -3.f, -4.f,
+                       1.5f, -1.5f, 0.f, -0.5f, -2.f, -1.f};
+   size_t n = sizeof(dy) / sizeof(float);
+   singa::Tensor in(singa::Shape{2, 1, 3, 2});
+   in.CopyDataFromHostPtr<float>(dy, n);
+ 
+   int axis = 2;
+   Flatten flt;
+   singa::LayerConf conf;
+   singa::FlattenConf *flattenconf = conf.mutable_flatten_conf();
+   flattenconf->set_axis(axis);
+   flt.Setup(Shape{1, 3, 2}, conf);
+ 
+   singa::Tensor temp = flt.Forward(singa::kTrain, in);
+   const auto out = flt.Backward(singa::kTrain, temp);
+   const float *xptr = out.first.data<float>();
+   EXPECT_EQ(n, out.first.Size());
+   EXPECT_EQ(2u, out.first.shape(0));
+   EXPECT_EQ(1u, out.first.shape(1));
+   EXPECT_EQ(3u, out.first.shape(2));
+   EXPECT_EQ(2u, out.first.shape(3));
+   for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(dy[i], xptr[i]);
+ }
+ 
+ #ifdef USE_CUDA
+ TEST(Flatten, ForwardGPU) {
+   const float x[] = {1.f,  2.f,   3.f, -2.f,  -3.f, -4.f,
+                      1.5f, -1.5f, 0.f, -0.5f, -2.f, -1.f};
+   size_t n = sizeof(x) / sizeof(float);
 -  singa::CudaGPU cuda(0, 1);
 -  singa::Tensor in(singa::Shape{2, 1, 3, 2}, &cuda);
++  auto cuda = std::make_shared<singa::CudaGPU>();
++  singa::Tensor in(singa::Shape{2, 1, 3, 2}, cuda);
+   in.CopyDataFromHostPtr<float>(x, n);
+ 
+   int axis = 3;
+   Flatten flt;
+   singa::LayerConf conf;
+   singa::FlattenConf *flattenconf = conf.mutable_flatten_conf();
+   flattenconf->set_axis(axis);
+   flt.Setup(Shape{1, 3, 2}, conf);
+ 
+   singa::Tensor out = flt.Forward(singa::kTrain, in);
 -  singa::CppCPU host(0, 1);
 -  out.ToDevice(&host);
++  out.ToHost();
+   EXPECT_EQ(n, out.Size());
+   EXPECT_EQ(6u, out.shape(0));
+   EXPECT_EQ(2u, out.shape(1));
+   const float *yptr = out.data<float>();
+   for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(x[i], yptr[i]);
+ }
+ 
+ TEST(Flatten, BackwardGPU) {
+   // directly use input as the output_grad for backward
+   // note that only the shape of input really matters
+   const float dy[] = {1.f,  2.f,   3.f, -2.f,  -3.f, -4.f,
+                       1.5f, -1.5f, 0.f, -0.5f, -2.f, -1.f};
+   size_t n = sizeof(dy) / sizeof(float);
 -  singa::CudaGPU cuda(0, 1);
 -  singa::Tensor in(singa::Shape{2, 1, 3, 2}, &cuda);
++  auto cuda = std::make_shared<singa::CudaGPU>();
++  singa::Tensor in(singa::Shape{2, 1, 3, 2}, cuda);
+   in.CopyDataFromHostPtr<float>(dy, n);
+ 
+   int axis = 2;
+   Flatten flt;
+   singa::LayerConf conf;
+   singa::FlattenConf *flattenconf = conf.mutable_flatten_conf();
+   flattenconf->set_axis(axis);
+   flt.Setup(Shape{1, 3, 2}, conf);
+ 
+   singa::Tensor out = flt.Forward(singa::kTrain, in);
+   const auto ret = flt.Backward(singa::kTrain, out);
 -  singa::CppCPU host(0, 1);
+   singa::Tensor in_diff = ret.first;
 -  in_diff.ToDevice(&host);
++  in_diff.ToHost();
+   const float *xptr = in_diff.data<float>();
+   EXPECT_EQ(n, in_diff.Size());
+   EXPECT_EQ(2u, in_diff.shape(0));
+   EXPECT_EQ(1u, in_diff.shape(1));
+   EXPECT_EQ(3u, in_diff.shape(2));
+   EXPECT_EQ(2u, in_diff.shape(3));
+   for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(dy[i], xptr[i]);
+ }
+ #endif // USE_CUDA

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_initializer.cc
----------------------------------------------------------------------
diff --cc test/singa/test_initializer.cc
index 0000000,e99cd79..4631af2
mode 000000,100644..100644
--- a/test/singa/test_initializer.cc
+++ b/test/singa/test_initializer.cc
@@@ -1,0 -1,148 +1,148 @@@
+ /**
+  * Licensed to the Apache Software Foundation (ASF) under one
+  * or more contributor license agreements.  See the NOTICE file
+  * distributed with this work for additional information
+  * regarding copyright ownership.  The ASF licenses this file
+  * to you under the Apache License, Version 2.0 (the
+  * "License"); you may not use this file except in compliance
+  * with the License.  You may obtain a copy of the License at
+  *
+  *     http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing, software
+  * distributed under the License is distributed on an "AS IS" BASIS,
+  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  * See the License for the specific language governing permissions and
+  * limitations under the License.
+  */
+ 
+ #include "singa/model/initializer.h"
+ #include "gtest/gtest.h"
+ 
+ TEST(Initializer, Constant) {
+   singa::init::Constant x;
+   size_t n = 10;
+   singa::Tensor t(singa::Shape{n});
+   singa::FillerConf conf;
+   conf.set_value(3.1f);
+   x.Setup(conf);
+   x.Fill(&t);
+   const float* xPtr = t.data<float>();
+   for (size_t i = 0; i < n; i++)
+     EXPECT_FLOAT_EQ(xPtr[i], 3.1f);
+ }
+ 
+ 
+ TEST(Initializer, Gaussian) {
+   singa::init::Gaussian x;
+   size_t n = 1000;
+   singa::Tensor t(singa::Shape{n});
+   singa::FillerConf conf;
+   conf.set_mean(0.11f);
+   conf.set_std(0.01f);
+   x.Setup(conf);
+   x.Fill(&t);
+   const float* xPtr = t.data<float>();
+   float mean = 0.0f, std = 0.0f;
+   for (size_t i = 0; i < n; i++)
+     mean += xPtr[i];
+   mean /= n;
+   EXPECT_NEAR(mean, 0.11f, 1e-3);
+   for (size_t i = 0; i < n; i++)
+     std += (xPtr[i] - mean) * (xPtr[i] - mean);
+   std /= n;
+   std = sqrt(std);
+   EXPECT_NEAR(std, 0.01f, 1e-3);
+ }
+ 
+ #ifdef USE_CUDA
+ TEST(Initializer, ConstantCUDA) {
+   singa::init::Constant x;
 -  singa::CudaGPU dev;
++  auto dev = std::make_shared<singa::CudaGPU>();
+   size_t n = 10;
 -  singa::Tensor t(singa::Shape{n}, &dev);
++  singa::Tensor t(singa::Shape{n}, dev);
+   singa::FillerConf conf;
+   conf.set_value(3.1f);
+   x.Setup(conf);
+   x.Fill(&t);
+   t.ToHost();
+   const float* xPtr = t.data<float>();
+   for (size_t i = 0; i < n; i++)
+     EXPECT_FLOAT_EQ(xPtr[i], 3.1f);
+ 
+ 
+   singa::init::Constant y(-0.1f);
 -  singa::Tensor s(singa::Shape{n}, &dev);
++  singa::Tensor s(singa::Shape{n}, dev);
+   y.Fill(&s);
+   s.ToHost();
+   const float* sPtr = s.data<float>();
+   for (size_t i = 0; i < n; i++)
+     EXPECT_FLOAT_EQ(sPtr[i], -0.1f);
+ }
+ 
+ 
+ TEST(Initializer, GaussianCUDA) {
+   singa::init::Gaussian x;
 -  singa::CudaGPU dev;
++  auto dev = std::make_shared<singa::CudaGPU>();
+   size_t n = 1000;
 -  singa::Tensor t(singa::Shape{n}, &dev);
++  singa::Tensor t(singa::Shape{n}, dev);
+   singa::FillerConf conf;
+   conf.set_mean(0.11f);
+   conf.set_std(0.01f);
+   x.Setup(conf);
+   x.Fill(&t);
+   t.ToHost();
+   const float* tPtr = t.data<float>();
+   float mean = 0.0f, std = 0.0f;
+   for (size_t i = 0; i < n; i++)
+     mean += tPtr[i];
+   mean /= n;
+   EXPECT_NEAR(mean, 0.11f, 1e-2);
+   for (size_t i = 0; i < n; i++)
+     std += (tPtr[i] - mean) * (tPtr[i] - mean);
+   std /= n;
+   std = sqrt(std);
+   EXPECT_NEAR(std, 0.01f, 1e-2);
+ 
+ 
+   singa::init::Gaussian y(1.5f, 0.1f);
 -  singa::Tensor s(singa::Shape{n}, &dev);
++  singa::Tensor s(singa::Shape{n}, dev);
+   y.Fill(&s);
+   s.ToHost();
+   const float* sPtr = s.data<float>();
+   for (size_t i = 0; i < n; i++)
+     mean += sPtr[i];
+   mean /= n;
+   EXPECT_NEAR(mean, 1.5f, 0.1f);
+   for (size_t i = 0; i < n; i++)
+     std += (sPtr[i] - mean) * (sPtr[i] - mean);
+   std /= n;
+   std = sqrt(std);
+   EXPECT_NEAR(std, 0.1f, 0.1f);
+ }
+ 
+ TEST(Initializer, XavierCUDA) {
+   singa::init::Constant x;
 -  singa::CudaGPU dev;
++  auto dev = std::make_shared<singa::CudaGPU>();
+   size_t m = 30, n=40;
 -  singa::Tensor t(singa::Shape{m, n}, &dev);
++  singa::Tensor t(singa::Shape{m, n}, dev);
+   x.Fill(&t);
+   t.ToHost();
+   const float* xPtr = t.data<float>();
+   float mean = 0.0f;
+   float high = -100.0f, low = 100.0f;
+   for (size_t i = 0; i < n; i++) {
+     mean += xPtr[i];
+     if (high < xPtr[i])
+       high = xPtr[i];
+     if (low > xPtr[i])
+       low = xPtr[i];
+   }
+   mean /= m * n;
+   EXPECT_NEAR(mean, 0, 1e-2);
+   float scale = sqrt(6.0f / (m + n));
+   EXPECT_LT(high, scale);
+   EXPECT_GT(low, -scale);
+ }
+ 
+ #endif

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_memory.cc
----------------------------------------------------------------------
diff --cc test/singa/test_memory.cc
index 90fc99a,0000000..b0df226
mode 100644,000000..100644
--- a/test/singa/test_memory.cc
+++ b/test/singa/test_memory.cc
@@@ -1,111 -1,0 +1,104 @@@
 +/************************************************************
 +*
 +* Licensed to the Apache Software Foundation (ASF) under one
 +* or more contributor license agreements.  See the NOTICE file
 +* distributed with this work for additional information
 +* regarding copyright ownership.  The ASF licenses this file
 +* to you under the Apache License, Version 2.0 (the
 +* "License"); you may not use this file except in compliance
 +* with the License.  You may obtain a copy of the License at
- * 
++*
 +*   http://www.apache.org/licenses/LICENSE-2.0
- * 
++*
 +* Unless required by applicable law or agreed to in writing,
 +* software distributed under the License is distributed on an
 +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 +* KIND, either express or implied.  See the License for the
 +* specific language governing permissions and limitations
 +* under the License.
 +*
 +*************************************************************/
 +
 +#include "gtest/gtest.h"
 +#include "singa/utils/logging.h"
 +#include "singa/core/memory.h"
 +#include "singa/singa_config.h"
++#include "singa/utils/timer.h"
 +#include <sys/time.h>
 +
 +#ifdef USE_CUDA
 +TEST(CnmemPool, PoolInit) {
- 	singa::CnMemPool pool;
- 	pool.InitPool();
++  singa::CnMemPool pool;
++  pool.InitPool();
 +}
 +
 +TEST(CnmemPool, PoolInitAll) {
- 	singa::CnMemPool pool;
- 	int nDevices;
- 	cudaGetDeviceCount(&nDevices);
- 	CHECK_GE(nDevices,1);
- 	pool.InitPool(nDevices,1000000U,0);
++  singa::CnMemPool pool;
++  int nDevices;
++  cudaGetDeviceCount(&nDevices);
++  CHECK_GE(nDevices, 1);
++  pool.InitPool(nDevices, 32, 0);
 +}
 +
 +TEST(CnmemPool, UsePool) {
- 	singa::CnMemPool pool;
- 	pool.InitPool();
- 	int numOfTests = 10;
- 	int numOfWriteVsRead = 3;
- 	int allocSize = 1000000U;
- 	for(int i = 0; i < numOfTests; i++) {
- 		int** memPtrs = new int*[numOfWriteVsRead];
- 		for(int j = 0; j < numOfWriteVsRead; j++) {
- 			pool.Malloc((void**)(&memPtrs[j]), allocSize); 
- 		}
- 		pool.Free(memPtrs[0]);
- 		delete[] memPtrs;
- 	}
++  singa::CnMemPool pool;
++  pool.InitPool();
++  int numOfTests = 10;
++  int numOfWriteVsRead = 3;
++  int allocSize = 32;
++  for (int i = 0; i < numOfTests; i++) {
++    int** memPtrs = new int* [numOfWriteVsRead];
++    for (int j = 0; j < numOfWriteVsRead; j++) {
++      pool.Malloc((void**)(&memPtrs[j]), allocSize);
++    }
++    pool.Free(memPtrs[0]);
++    delete[] memPtrs;
++  }
 +}
 +
 +TEST(CudaMemPool, UsePool) {
- 	singa::CudaMemPool pool;
- 	int numOfTests = 10;
- 	int numOfWriteVsRead = 3;
- 	int allocSize = 1000000U;
- 	for(int i = 0; i < numOfTests; i++) {
- 		int** memPtrs = new int*[numOfWriteVsRead];
- 		for(int j = 0; j < numOfWriteVsRead; j++) {
- 			pool.Malloc((void**)(&memPtrs[j]), allocSize); 
- 		}
- 		pool.Free(memPtrs[0]);
- 		delete[] memPtrs;
- 	}
++  singa::CudaMemPool pool;
++  int numOfTests = 10;
++  int numOfWriteVsRead = 3;
++  int allocSize = 32;
++  for (int i = 0; i < numOfTests; i++) {
++    int** memPtrs = new int* [numOfWriteVsRead];
++    for (int j = 0; j < numOfWriteVsRead; j++) {
++      pool.Malloc((void**)(&memPtrs[j]), allocSize);
++    }
++    pool.Free(memPtrs[0]);
++    delete[] memPtrs;
++  }
 +}
 +
 +TEST(MemPool, CompareCudaCnmem) {
- 	singa::CudaMemPool cudaPool;
- 	singa::CnMemPool cnPool;
- 	cnPool.InitPool();
++  singa::CudaMemPool cudaPool;
++  singa::CnMemPool cnPool;
++  cnPool.InitPool();
++
++  int numOfTests = 5000;
++  int allocSize = 32;
 +
- 	int numOfTests = 5000;
- 	int allocSize = 1000000U;
- 	struct timeval start,end;
- 	double t1,t2;
++  singa::DeviceMemPool* pool = NULL;
++  pool = &cnPool;
 +
- 	singa::DeviceMemPool* pool = NULL;
- 	pool = &cnPool;
- 	
- 	gettimeofday(&start,NULL);
- 	for(int i = 0; i < numOfTests; i++) {
- 		int* memPtrs = NULL;
- 		pool->Malloc((void**)&memPtrs, allocSize); 	
- 		pool->Free(memPtrs);
- 	}
- 	gettimeofday(&end,NULL);
- 	
- 	t1 = start.tv_sec * 1000 + start.tv_usec/1000;
- 	t2 = end.tv_sec * 1000 + end.tv_usec/1000;
- 	LOG(INFO) << "cnmem memory time: " << t2-t1 << " ms" << std::endl;
++  singa::Timer tick;
++  for (int i = 0; i < numOfTests; i++) {
++    int* memPtrs = NULL;
++    pool->Malloc((void**)&memPtrs, allocSize);
++    pool->Free(memPtrs);
++  }
++  tick.Tick();
++  int cn_time = tick.Elapsed();
 +
- 	pool = &cudaPool;
- 	gettimeofday(&start,NULL);
- 	for(int i = 0; i < numOfTests; i++) {
- 		int* memPtrs = NULL;
- 		pool->Malloc((void**)&memPtrs, allocSize); 
- 		pool->Free(memPtrs);
- 	}
- 	gettimeofday(&end,NULL);
- 	
- 	t1 = start.tv_sec * 1000 + start.tv_usec/1000;
- 	t2 = end.tv_sec * 1000 + end.tv_usec/1000;
- 	LOG(INFO) << "cuda memory time: " << t2-t1 << " ms" << std::endl;
++  pool = &cudaPool;
++  for (int i = 0; i < numOfTests; i++) {
++    int* memPtrs = NULL;
++    pool->Malloc((void**)&memPtrs, allocSize);
++    pool->Free(memPtrs);
++  }
++  tick.Tick();
++  int cuda_time = tick.Elapsed();
++  EXPECT_GE(cuda_time, cn_time);
 +}
- #endif // USE_CUDA
++#endif  // USE_CUDA

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_mse.cc
----------------------------------------------------------------------
diff --cc test/singa/test_mse.cc
index d2c5125,928be9d..788652f
--- a/test/singa/test_mse.cc
+++ b/test/singa/test_mse.cc
@@@ -22,8 -22,9 +22,8 @@@
  #include "gtest/gtest.h"
  #include "singa/core/tensor.h"
  #include "singa/core/device.h"
- #include "../src/model/loss/mse.h"
- #include "singa/singa_config.h"
+ #include "singa/model/loss.h"
 -#include "singa_config.h"
+ 
  using singa::Tensor;
  class TestMSE : public ::testing::Test {
   protected:
@@@ -68,14 -69,14 +68,14 @@@ TEST_F(TestMSE, CppBackward) 
  #endif
  #ifdef USE_CUDA
  TEST_F(TestMSE, CudaForward) {
 -  singa::MSE mse;
 -  singa::CudaGPU dev;
 -  p.ToDevice(&dev);
 -  t.ToDevice(&dev);
 -  Tensor loss = mse.Forward(p, t);
 +  singa::MSE* mse = new singa::MSE();
 +  auto dev = std::make_shared<singa::CudaGPU>();
 +  p.ToDevice(dev);
 +  t.ToDevice(dev);
 +  Tensor loss = mse->Forward(p, t);
  
    loss.ToHost();
-   auto ldat = loss.data<const float*>();
+   auto ldat = loss.data<float>();
  
    for (size_t i = 0, k = 0; i < loss.Size(); i++) {
      float l = 0.f;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_nesterov.cc
----------------------------------------------------------------------
diff --cc test/singa/test_nesterov.cc
index 0000000,35b2b4d..73f69f4
mode 000000,100644..100644
--- a/test/singa/test_nesterov.cc
+++ b/test/singa/test_nesterov.cc
@@@ -1,0 -1,101 +1,101 @@@
+ /************************************************************
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *************************************************************/
+ 
+ #include "gtest/gtest.h"
+ #include "singa/model/optimizer.h"
 -#include "singa_config.h"
++#include "singa/singa_config.h"
+ 
+ TEST(Nesterov, ApplyCPU) {
+   singa::Nesterov nesterov;
+   float lr = 0.1f;
+   auto func = [](int step) { return step <= 5 ? 0.5f : 0.9f; };
+   nesterov.SetMomentumGenerator(func);
+   const float v[4] = {0.1, 0.2, 0.3, 0.4};
+   const float g[4] = {0.01, 0.02, 0.03, 0.04};
+ 
+   singa::Tensor value(singa::Shape{4}), grad(singa::Shape{4});
+   value.CopyDataFromHostPtr(v, 4);
+   grad.CopyDataFromHostPtr(g, 4);
+ 
+   nesterov.Apply(0, lr, "xx", grad, &value);
+ 
+   singa::Tensor v1 = value.Clone();
+   const float* newv1 = v1.data<float>();
+   float history[4], tmp[4];
+   for (int i = 0; i < 4; ++i) {
+     history[i] = g[i] * lr;
+     tmp[i] = history[i] * (1 + func(0));
+   }
+   for (int i = 0; i < 4; ++i) EXPECT_FLOAT_EQ(newv1[i], v[i] - tmp[i]);
+ 
+   grad.CopyDataFromHostPtr(g, 4);
+   nesterov.Apply(1, lr, "xx", grad, &value);
+   singa::Tensor v2 = value.Clone();
+   const float* newv2 = v2.data<float>();
+   for (int i = 0; i < 4; ++i) {
+     tmp[i] = history[i];
+     history[i] = history[i] * func(1) + g[i] * lr;
+     tmp[i] = history[i] * (1 + func(1)) - tmp[i] * func(1);
+   }
+ 
+   for (int i = 0; i < 4; ++i) EXPECT_FLOAT_EQ(newv2[i], newv1[i] - tmp[i]);
+ }
+ 
+ #ifdef USE_CUDA
+ TEST(Nesterov, ApplyCUDA) {
+   singa::Nesterov nesterov;
+   float lr = 0.1f;
+   auto func = [](int step) { return step <= 5 ? 0.5f : 0.9f; };
+   nesterov.SetMomentumGenerator(func);
+   const float v[4] = {0.1, 0.2, 0.3, 0.4};
+   const float g[4] = {0.01, 0.02, 0.03, 0.04};
+ 
 -  singa::CudaGPU dev;
 -  singa::Tensor value(singa::Shape{4}, &dev), grad(singa::Shape{4}, &dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
++  singa::Tensor value(singa::Shape{4}, dev), grad(singa::Shape{4}, dev);
+   value.CopyDataFromHostPtr(v, 4);
+   grad.CopyDataFromHostPtr(g, 4);
+ 
+   nesterov.Apply(0, lr, "xx", grad, &value);
+ 
+   singa::Tensor v1 = value.Clone();
+   v1.ToHost();
+   const float* newv1 = v1.data<float>();
+   float history[4], tmp[4];
+   for (int i = 0; i < 4; ++i) {
+     history[i] = g[i] * lr;
+     tmp[i] = history[i] * (1 + func(0));
+   }
+   for (int i = 0; i < 4; ++i) EXPECT_FLOAT_EQ(newv1[i], v[i] - tmp[i]);
+ 
+   grad.CopyDataFromHostPtr(g, 4);
+   nesterov.Apply(1, lr, "xx", grad, &value);
+   singa::Tensor v2 = value.Clone();
+   v2.ToHost();
+   const float* newv2 = v2.data<float>();
+   for (int i = 0; i < 4; ++i) {
+     tmp[i] = history[i];
+     history[i] = history[i] * func(1) + g[i] * lr;
+     tmp[i] = history[i] * (1 + func(1)) - tmp[i] * func(1);
+   }
+ 
+   for (int i = 0; i < 4; ++i) EXPECT_FLOAT_EQ(newv2[i], newv1[i] - tmp[i]);
+ }
+ #endif

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_prelu.cc
----------------------------------------------------------------------
diff --cc test/singa/test_prelu.cc
index 0000000,fee7c5b..dbf5ca6
mode 000000,100644..100644
--- a/test/singa/test_prelu.cc
+++ b/test/singa/test_prelu.cc
@@@ -1,0 -1,247 +1,245 @@@
+ /************************************************************
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *************************************************************/
+ 
+ #include "../src/model/layer/prelu.h"
+ #include "gtest/gtest.h"
 -#include "singa_config.h"
++#include "singa/singa_config.h"
+ 
+ using singa::PReLU;
+ using singa::Shape;
+ TEST(PReLU, Setup) {
+   PReLU prelu;
+   EXPECT_EQ("PReLU", prelu.layer_type());
+ 
+   singa::LayerConf conf;
+   singa::PReLUConf *preluconf = conf.mutable_prelu_conf();
+   preluconf->set_channel_shared(true);
+   preluconf->set_format("NHWC");
+ 
+   prelu.Setup(Shape{4}, conf);
+   EXPECT_EQ(true, prelu.Channel_shared());
+   EXPECT_EQ("NHWC", prelu.Format());
+ }
+ 
+ TEST(PReLU, ForwardCPU) {
+   const float x[] = {1.f,  2.f, 3.f,  -2.f, -3.f, -1.f,
+                      -1.f, 2.f, -1.f, -2.f, -2.f, -1.f};
+   size_t n = sizeof(x) / sizeof(float);
+   size_t batchsize = 2, c = 3, h = 2, w = 1;
+   singa::Tensor in(singa::Shape{batchsize, h, w, c});
+   in.CopyDataFromHostPtr<float>(x, n);
+ 
+   PReLU prelu;
+   singa::LayerConf conf;
+   singa::PReLUConf *preluconf = conf.mutable_prelu_conf();
+   preluconf->set_channel_shared(false);
+   preluconf->set_format("NHWC");
+   prelu.Setup(Shape{h, w, c}, conf);
+ 
+   const float neg_slope[] = {0.25f, 0.5f, 0.75f};
+   singa::Tensor a(singa::Shape{c});
+   a.CopyDataFromHostPtr<float>(neg_slope, c);
+   prelu.Set_a(a);
+ 
+   singa::Tensor out = prelu.Forward(singa::kTrain, in);
+   const float *yptr = out.data<float>();
+   EXPECT_EQ(n, out.Size());
+ 
+   float *y = new float[n];
+   size_t div_factor = prelu.Channel_shared() ? c : 1;
+   if (prelu.Format() == "NCHW") {
+     for (size_t i = 0; i < n; i++) {
+       size_t pos = i / (h * w) % c / div_factor;
+       y[i] = std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f);
+     }
+   } else if (prelu.Format() == "NHWC") {
+     for (size_t i = 0; i < n; i++) {
+       size_t pos = i % c / div_factor;
+       y[i] = std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f);
+     }
+   }
+   for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(y[i], yptr[i]);
+ }
+ 
+ TEST(PReLU, BackwardCPU) {
+   const float x[] = {1.f,  2.f, 3.f,  -2.f, -3.f, -1.f,
+                      -1.f, 2.f, -1.f, -2.f, -2.f, -1.f};
+   size_t n = sizeof(x) / sizeof(float);
+   size_t batchsize = 2, c = 3, h = 2, w = 1;
+   singa::Tensor in(singa::Shape{batchsize, c, h, w});
+   in.CopyDataFromHostPtr<float>(x, n);
+ 
+   PReLU prelu;
+   singa::LayerConf conf;
+   singa::PReLUConf *preluconf = conf.mutable_prelu_conf();
+   preluconf->set_channel_shared(false);
+   preluconf->set_format("NCHW");
+   prelu.Setup(Shape{c, h, w}, conf);
+ 
+   const float neg_slope[] = {0.25f, 0.5f, 0.75f};
+   singa::Tensor a(singa::Shape{c});
+   a.CopyDataFromHostPtr<float>(neg_slope, c);
+   prelu.Set_a(a);
+ 
+   singa::Tensor out = prelu.Forward(singa::kTrain, in);
+ 
+   const float grad[] = {1.f, 2.f,  -2.f, -1.f, -1.f, -3.f,
+                         2.f, -2.f, 1.f,  1.f,  -2.f, 0.f};
+   singa::Tensor out_diff(singa::Shape{batchsize, c, h, w});
+   out_diff.CopyDataFromHostPtr<float>(grad, n);
+   const auto ret = prelu.Backward(singa::kTrain, out_diff);
+   const float *xptr = ret.first.data<float>();
+   const float *aptr = ret.second.at(0).data<float>();
+   float *dx = new float[n];
+   size_t div_factor = prelu.Channel_shared() ? c : 1;
+   size_t params = prelu.Channel_shared() ? 1 : c;
+   float da[] = {0.f, 0.f, 0.f};
+   if (prelu.Format() == "NCHW") {
+     for (size_t i = 0; i < n; i++) {
+       size_t pos = i / (h * w) % c / div_factor;
+       dx[i] = grad[i] *
+               (std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f));
+     }
+     for (size_t i = 0; i < n; i++) {
+       size_t pos = i / (h * w) % c / div_factor;
+       da[pos] += grad[i] * std::min(x[i], 0.f);
+     }
+   } else if (prelu.Format() == "NHWC") {
+     for (size_t i = 0; i < n; i++) {
+       size_t pos = i % c / div_factor;
+       dx[i] = grad[i] *
+               (std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f));
+     }
+     for (size_t i = 0; i < n; i++) {
+       size_t pos = i % c / div_factor;
+       da[pos] += grad[i] * std::min(x[i], 0.f);
+     }
+   }
+   for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(dx[i], xptr[i]);
+   for (size_t i = 0; i < params; i++) EXPECT_FLOAT_EQ(da[i], aptr[i]);
+ }
+ 
+ #ifdef USE_CUDA
+ TEST(PReLU, ForwardGPU) {
+   const float x[] = {1.f,  2.f, 3.f,  -2.f, -3.f, -1.f,
+                          -1.f, 2.f, -1.f, -2.f, -2.f, -1.f};
+   size_t n = sizeof(x) / sizeof(float);
+   size_t batchsize = 2, c = 3, h = 2, w = 1;
 -  singa::CudaGPU cuda(0, 1);
 -  singa::Tensor in(singa::Shape{batchsize, h, w, c}, &cuda);
++  auto cuda = std::make_shared<singa::CudaGPU>();
++  singa::Tensor in(singa::Shape{batchsize, h, w, c}, cuda);
+   in.CopyDataFromHostPtr<float>(x, n);
+ 
+   PReLU prelu;
+   singa::LayerConf conf;
+   singa::PReLUConf *preluconf = conf.mutable_prelu_conf();
+   preluconf->set_channel_shared(false);
+   preluconf->set_format("NHWC");
+   prelu.Setup(Shape{h, w, c}, conf);
+ 
+   const float neg_slope[] = {0.25f, 0.5f, 0.75f};
 -  singa::Tensor a(singa::Shape{c}, &cuda);
++  singa::Tensor a(singa::Shape{c}, cuda);
+   a.CopyDataFromHostPtr<float>(neg_slope, c);
+   prelu.Set_a(a);
+ 
+   singa::Tensor out = prelu.Forward(singa::kTrain, in);
 -  singa::CppCPU host(0, 1);
 -  out.ToDevice(&host);
++  out.ToHost();
+   const float *yptr = out.data<float>();
+   EXPECT_EQ(n, out.Size());
+ 
+   float *y = new float[n];
+   size_t div_factor = prelu.Channel_shared() ? c : 1;
+   if (prelu.Format() == "NCHW") {
+     for (size_t i = 0; i < n; i++) {
+       size_t pos = i / (h * w) % c / div_factor;
+       y[i] = std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f);
+     }
+   } else if (prelu.Format() == "NHWC") {
+     for (size_t i = 0; i < n; i++) {
+       size_t pos = i % c / div_factor;
+       y[i] = std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f);
+     }
+   }
+   for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(y[i], yptr[i]);
+ }
+ 
+ TEST(PReLU, BackwardGPU) {
+   const float x[] = {1.f,  2.f, 3.f,  -2.f, -3.f, -1.f,
+                            -1.f, 2.f, -1.f, -2.f, -2.f, -1.f};
+   size_t n = sizeof(x) / sizeof(float);
+   size_t batchsize = 2, c = 3, h = 2, w = 1;
 -  singa::CudaGPU cuda(0, 1);
 -  singa::Tensor in(singa::Shape{batchsize, c, h, w}, &cuda);
++  auto cuda = std::make_shared<singa::CudaGPU>();
++  singa::Tensor in(singa::Shape{batchsize, c, h, w}, cuda);
+   in.CopyDataFromHostPtr<float>(x, n);
+ 
+   PReLU prelu;
+   singa::LayerConf conf;
+   singa::PReLUConf *preluconf = conf.mutable_prelu_conf();
+   preluconf->set_channel_shared(false);
+   preluconf->set_format("NCHW");
+   prelu.Setup(Shape{c, h, w}, conf);
+ 
+   const float neg_slope[] = {0.25f, 0.5f, 0.75f};
 -  singa::Tensor a(singa::Shape{c}, &cuda);
++  singa::Tensor a(singa::Shape{c}, cuda);
+   a.CopyDataFromHostPtr<float>(neg_slope, c);
+   prelu.Set_a(a);
+ 
+   singa::Tensor out = prelu.Forward(singa::kTrain, in);
+   const float grad[] = {1.f, 2.f,  -2.f, -1.f, -1.f, -3.f,
+                           2.f, -2.f, 1.f,  1.f,  -2.f, 0.f};
 -  singa::Tensor out_diff(singa::Shape{batchsize, c, h, w}, &cuda);
++  singa::Tensor out_diff(singa::Shape{batchsize, c, h, w}, cuda);
+   out_diff.CopyDataFromHostPtr<float>(grad, n);
+   const auto ret = prelu.Backward(singa::kTrain, out_diff);
+ 
+   singa::Tensor in_diff = ret.first;
 -  singa::CppCPU host(0, 1);
 -  in_diff.ToDevice(&host);
++  in_diff.ToHost();
+   const float *xptr = in_diff.data<float>();
+   singa::Tensor a_diff = ret.second.at(0);
 -  a_diff.ToDevice(&host);
++  a_diff.ToHost();
+   const float *aptr = a_diff.data<float>();
+   float *dx = new float[n];
+   size_t div_factor = prelu.Channel_shared() ? c : 1;
+   size_t params = prelu.Channel_shared() ? 1 : c;
+   float da[] = {0.f, 0.f, 0.f};
+   if (prelu.Format() == "NCHW") {
+     for (size_t i = 0; i < n; i++) {
+       size_t pos = i / (h * w) % c / div_factor;
+       dx[i] = grad[i] *
+                 (std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f));
+     }
+     for (size_t i = 0; i < n; i++) {
+       size_t pos = i / (h * w) % c / div_factor;
+       da[pos] += grad[i] * std::min(x[i], 0.f);
+     }
+   } else if (prelu.Format() == "NHWC") {
+     for (size_t i = 0; i < n; i++) {
+       size_t pos = i % c / div_factor;
+       dx[i] = grad[i] *
+         (std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f));
+   }
+     for (size_t i = 0; i < n; i++) {
+       size_t pos = i % c / div_factor;
+       da[pos] += grad[i] * std::min(x[i], 0.f);
+     }
+   }
+   for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(dx[i], xptr[i]);
+   for (size_t i = 0; i < params; i++) EXPECT_FLOAT_EQ(da[i], aptr[i]);
+ }
+ #endif

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_rmsprop.cc
----------------------------------------------------------------------
diff --cc test/singa/test_rmsprop.cc
index 0000000,004a9b6..18de9c3
mode 000000,100644..100644
--- a/test/singa/test_rmsprop.cc
+++ b/test/singa/test_rmsprop.cc
@@@ -1,0 -1,106 +1,105 @@@
+ /************************************************************
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *************************************************************/
+ 
+ #include "gtest/gtest.h"
+ #include "singa/model/optimizer.h"
 -#include "singa_config.h"
+ #include <cmath>
+ 
+ TEST(RMSProp, ApplyCPU) {
+   singa::RMSProp rmsprop;
+   float lr = 0.1f;
+   float rho = 0.9;
+   const float v[4] = {0.1, 0.2, 0.3, 0.4};
+   const float g[4] = {0.01, 0.02, 0.03, 0.04};
+ 
+   singa::OptimizerConf conf;
+   conf.set_rho(rho);
+   conf.set_delta(1E-8);
+ 
+   singa::Tensor value(singa::Shape{4}), grad(singa::Shape{4});
+   value.CopyDataFromHostPtr(v, 4);
+   grad.CopyDataFromHostPtr(g, 4);
+ 
+   rmsprop.Setup(conf);
+   rmsprop.Apply(0, lr, "xx", grad, &value);
+ 
+   singa::Tensor v1 = value.Clone();
+   const float* newv1 = v1.data<float>();
+   float history[4];
+   for (int i = 0; i < 4; ++i) history[i] = g[i] * g[i] * (1 - rho);
+   for (int i = 0; i < 4; ++i)
+     EXPECT_NEAR(newv1[i], v[i] - g[i] * lr / sqrt(history[i] + (float)1E-8),
+                 1e-5);
+ 
+   grad.CopyDataFromHostPtr(g, 4);
+   rmsprop.Apply(1, lr, "xx", grad, &value);
+   singa::Tensor v2 = value.Clone();
+   const float* newv2 = v2.data<float>();
+   for (int i = 0; i < 4; ++i)
+     history[i] = history[i] * rho + g[i] * g[i] * (1 - rho);
+ 
+   for (int i = 0; i < 4; ++i)
+     EXPECT_NEAR(newv2[i], newv1[i] - lr * g[i] / sqrt(history[i] + (float)1E-8),
+                 1e-5);
+ }
+ 
+ #ifdef USE_CUDA
+ TEST(RMSProp, ApplyCUDA) {
+   singa::RMSProp rmsprop;
+   float lr = 0.1f;
+   float rho = 0.02;
+   const float v[4] = {0.1, 0.2, 0.3, 0.4};
+   const float g[4] = {0.01, 0.02, 0.03, 0.04};
+ 
+   singa::OptimizerConf conf;
+   conf.set_rho(rho);
+   conf.set_delta(1e-8);
+ 
 -  singa::CudaGPU dev;
 -  singa::Tensor value(singa::Shape{4}, &dev), grad(singa::Shape{4}, &dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
++  singa::Tensor value(singa::Shape{4}, dev), grad(singa::Shape{4}, dev);
+   value.CopyDataFromHostPtr(v, 4);
+   grad.CopyDataFromHostPtr(g, 4);
+ 
+   rmsprop.Setup(conf);
+   rmsprop.Apply(0, lr, "xx", grad, &value);
+ 
+   singa::Tensor v1 = value.Clone();
+   v1.ToHost();
+   const float* newv1 = v1.data<float>();
+   float history[4];
+   for (int i = 0; i < 4; ++i) history[i] = g[i] * g[i] * (1 - rho);
+   for (int i = 0; i < 4; ++i)
+     EXPECT_NEAR(newv1[i], v[i] - lr * g[i] / sqrt(history[i] + conf.delta()),
+                 1e-5);
+ 
+   grad.CopyDataFromHostPtr(g, 4);
+   rmsprop.Apply(1, lr, "xx", grad, &value);
+   singa::Tensor v2 = value.Clone();
+   v2.ToHost();
+   const float* newv2 = v2.data<float>();
+   for (int i = 0; i < 4; ++i)
+     history[i] = history[i] * rho + g[i] * g[i] * (1 - rho);
+ 
+   for (int i = 0; i < 4; ++i)
+     EXPECT_NEAR(newv2[i],
+                 newv1[i] - lr * g[i] / sqrt(history[i] + conf.delta()), 1e-5);
+ }
+ #endif

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_sgd.cc
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_tensor.cc
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --cc test/singa/test_tensor_math.cc
index 0f998c0,a40a848..f8d0351
--- a/test/singa/test_tensor_math.cc
+++ b/test/singa/test_tensor_math.cc
@@@ -253,12 -507,21 +507,21 @@@ TEST_F(TestTensorMath, SumColumnsCpp) 
  }
  #endif
  #ifdef USE_CUDA
+ TEST_F(TestTensorMath, L2Cuda) {
 -  singa::CudaGPU dev;
 -  Tensor t(Shape{3, 2}, &dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
++  Tensor t(Shape{3, 2}, dev);
+   t.CopyDataFromHostPtr(dat1, 6);
+   float l2 = t.L2();
+   float target = 0.0f;
+   for (size_t i = 0; i < t.Size(); i++) target += dat1[i] * dat1[i];
+   EXPECT_FLOAT_EQ(l2, sqrt(target));
+ }
  TEST_F(TestTensorMath, MultCuda) {
    const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
 -  singa::CudaGPU dev;
 -  Tensor t(Shape{2, 2}, &dev);
 +  auto dev = std::make_shared<singa::CudaGPU>();
 +  Tensor t(Shape{2, 2}, dev);
    t.CopyDataFromHostPtr(x, 4);
 -  d.ToDevice(&dev);
 +  d.ToDevice(dev);
    d.CopyDataFromHostPtr(dat1, 6);
    Tensor C = Mult(d, t);
    C.ToHost();
@@@ -302,20 -565,18 +565,20 @@@
        EXPECT_FLOAT_EQ(oPtr[i * 4 + j], x[i]);
      }
    }
- 	d.ToHost();
- 	p.ToHost();
++  d.ToHost();
++  p.ToHost();
  }
  
  TEST_F(TestTensorMath, AddColumnCuda) {
    const float x[3] = {1.0f, 2.0f, 3.0f};
- 	auto dev = std::make_shared<singa::CudaGPU>();
 -  singa::CudaGPU dev;
 -  Tensor t(Shape{3}, &dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
 +  Tensor t(Shape{3}, dev);
    t.CopyDataFromHostPtr(x, 3);
    d.CopyDataFromHostPtr(dat1, 6);
 -  d.ToDevice(&dev);
 +  d.ToDevice(dev);
    AddColumn(t, &d);
    d.ToHost();
-   const float *xptr = d.data<const float *>();
+   const float *xptr = d.data<float>();
    for (int i = 0; i < 3; i++) {
      for (int j = 0; j < 2; j++) {
        EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] + x[i]);
@@@ -323,17 -584,16 +586,16 @@@
    }
  }
  
- 
  TEST_F(TestTensorMath, SubColumnCuda) {
    const float x[3] = {1.0f, 2.0f, 3.0f};
- 	auto dev = std::make_shared<singa::CudaGPU>();
 -  singa::CudaGPU dev;
 -  Tensor t(Shape{3}, &dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
 +  Tensor t(Shape{3}, dev);
    t.CopyDataFromHostPtr(x, 3);
    d.CopyDataFromHostPtr(dat1, 6);
 -  d.ToDevice(&dev);
 +  d.ToDevice(dev);
    SubColumn(t, &d);
    d.ToHost();
-   const float *xptr = d.data<const float *>();
+   const float *xptr = d.data<float>();
    for (int i = 0; i < 3; i++) {
      for (int j = 0; j < 2; j++) {
        EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] - x[i]);
@@@ -357,14 -617,14 +619,14 @@@ TEST_F(TestTensorMath, MultColumnCpp) 
  #ifdef USE_CUDA
  TEST_F(TestTensorMath, MultColumnCuda) {
    const float x[3] = {1.0f, 2.0f, 3.0f};
- 	auto dev = std::make_shared<singa::CudaGPU>();
 -  singa::CudaGPU dev;
 -  Tensor t(Shape{3}, &dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
 +  Tensor t(Shape{3}, dev);
    t.CopyDataFromHostPtr(x, 3);
    d.CopyDataFromHostPtr(dat1, 6);
 -  d.ToDevice(&dev);
 +  d.ToDevice(dev);
    MultColumn(t, &d);
    d.ToHost();
-   const float *xptr = d.data<const float *>();
+   const float *xptr = d.data<float>();
    for (int i = 0; i < 3; i++) {
      for (int j = 0; j < 2; j++) {
        EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] * x[i]);
@@@ -373,14 -633,14 +635,14 @@@
  }
  TEST_F(TestTensorMath, DivColumnCuda) {
    const float x[3] = {1.0f, 2.0f, 3.0f};
- 	auto dev = std::make_shared<singa::CudaGPU>();
 -  singa::CudaGPU dev;
 -  Tensor t(Shape{3}, &dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
 +  Tensor t(Shape{3}, dev);
    t.CopyDataFromHostPtr(x, 3);
    d.CopyDataFromHostPtr(dat1, 6);
 -  d.ToDevice(&dev);
 +  d.ToDevice(dev);
    DivColumn(t, &d);
    d.ToHost();
-   const float *xptr = d.data<const float *>();
+   const float *xptr = d.data<float>();
    for (int i = 0; i < 3; i++) {
      for (int j = 0; j < 2; j++) {
        EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] / x[i]);
@@@ -389,14 -649,14 +651,14 @@@
  }
  TEST_F(TestTensorMath, AddRowCuda) {
    const float x[2] = {1.1f, 2.1f};
- 	auto dev = std::make_shared<singa::CudaGPU>();
 -  singa::CudaGPU dev;
 -  Tensor t(Shape{2}, &dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
 +  Tensor t(Shape{2}, dev);
    t.CopyDataFromHostPtr(x, 2);
    d.CopyDataFromHostPtr(dat1, 6);
 -  d.ToDevice(&dev);
 +  d.ToDevice(dev);
    AddRow(t, &d);
    d.ToHost();
-   const float *xptr = d.data<const float *>();
+   const float *xptr = d.data<float>();
    for (int i = 0; i < 3; i++) {
      for (int j = 0; j < 2; j++) {
        EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] + x[j]);
@@@ -405,14 -665,14 +667,14 @@@
  }
  TEST_F(TestTensorMath, SubRowCuda) {
    const float x[2] = {1.1f, 2.1f};
- 	auto dev = std::make_shared<singa::CudaGPU>();
 -  singa::CudaGPU dev;
 -  Tensor t(Shape{2}, &dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
 +  Tensor t(Shape{2}, dev);
    t.CopyDataFromHostPtr(x, 2);
    d.CopyDataFromHostPtr(dat1, 6);
 -  d.ToDevice(&dev);
 +  d.ToDevice(dev);
    SubRow(t, &d);
    d.ToHost();
-   const float *xptr = d.data<const float *>();
+   const float *xptr = d.data<float>();
    for (int i = 0; i < 3; i++) {
      for (int j = 0; j < 2; j++) {
        EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] - x[j]);
@@@ -421,14 -681,14 +683,14 @@@
  }
  TEST_F(TestTensorMath, MultRowCuda) {
    const float x[2] = {1.1f, 2.1f};
- 	auto dev = std::make_shared<singa::CudaGPU>();
 -  singa::CudaGPU dev;
 -  Tensor t(Shape{2}, &dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
 +  Tensor t(Shape{2}, dev);
    t.CopyDataFromHostPtr(x, 2);
    d.CopyDataFromHostPtr(dat1, 6);
 -  d.ToDevice(&dev);
 +  d.ToDevice(dev);
    MultRow(t, &d);
    d.ToHost();
-   const float *xptr = d.data<const float *>();
+   const float *xptr = d.data<float>();
    for (int i = 0; i < 3; i++) {
      for (int j = 0; j < 2; j++) {
        EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] * x[j]);
@@@ -452,14 -712,14 +714,14 @@@ TEST_F(TestTensorMath, DivRowCpp) 
  #ifdef USE_CUDA
  TEST_F(TestTensorMath, DivRowCuda) {
    const float x[2] = {1.1f, 2.1f};
- 	auto dev = std::make_shared<singa::CudaGPU>();
 -  singa::CudaGPU dev;
 -  Tensor t(Shape{2}, &dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
 +  Tensor t(Shape{2}, dev);
    t.CopyDataFromHostPtr(x, 2);
    d.CopyDataFromHostPtr(dat1, 6);
 -  d.ToDevice(&dev);
 +  d.ToDevice(dev);
    DivRow(t, &d);
    d.ToHost();
-   const float *xptr = d.data<const float *>();
+   const float *xptr = d.data<float>();
    for (int i = 0; i < 3; i++) {
      for (int j = 0; j < 2; j++) {
        EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] / x[j]);
@@@ -467,13 -727,13 +729,13 @@@
    }
  }
  TEST_F(TestTensorMath, SumRowsCuda) {
- 	auto dev = std::make_shared<singa::CudaGPU>();
 -  singa::CudaGPU dev;
 -  Tensor t(Shape{2}, &dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
 +  Tensor t(Shape{2}, dev);
    d.CopyDataFromHostPtr(dat1, 6);
 -  d.ToDevice(&dev);
 +  d.ToDevice(dev);
    SumRows(d, &t);
    t.ToHost();
-   const float *tptr = t.data<const float *>();
+   const float *tptr = t.data<float>();
    for (int i = 0; i < 2; i++) {
      float tmp = 0;
      for (int j = 0; j < 3; j++) {
@@@ -481,16 -741,15 +743,16 @@@
      }
      EXPECT_FLOAT_EQ(tptr[i], tmp);
    }
- 	d.ToHost();
++  d.ToHost();
  }
  TEST_F(TestTensorMath, SumColumnCuda) {
- 	auto dev = std::make_shared<singa::CudaGPU>();
 -  singa::CudaGPU dev;
 -  Tensor t(Shape{3}, &dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
 +  Tensor t(Shape{3}, dev);
    d.CopyDataFromHostPtr(dat1, 6);
 -  d.ToDevice(&dev);
 +  d.ToDevice(dev);
    SumColumns(d, &t);
    t.ToHost();
-   const float *tptr = t.data<const float *>();
+   const float *tptr = t.data<float>();
    for (int i = 0; i < 3; i++) {
      float tmp = 0;
      for (int j = 0; j < 2; j++) {
@@@ -498,6 -757,120 +760,121 @@@
      }
      EXPECT_FLOAT_EQ(tptr[i], tmp);
    }
- 	d.ToHost();
++  d.ToHost();
  }
+ 
+ #endif
+ 
+ TEST_F(TestTensorMath, ConcatenateRowsCpp) {
+   d.CopyDataFromHostPtr<float>(dat1, 6);
+   e.CopyDataFromHostPtr<float>(dat2, 6);
+   const auto ret = singa::ConcatenateRows(vector<Tensor>{d, e});
+   EXPECT_EQ(ret.shape(0), d.shape(0) + e.shape(0));
+   EXPECT_EQ(ret.shape(1), d.shape(1));
+   const float *retPtr = ret.data<float>();
+   for (int i = 0; i < 6; i++) EXPECT_FLOAT_EQ(retPtr[i], dat1[i]);
+   for (int i = 0; i < 6; i++) EXPECT_FLOAT_EQ(retPtr[i + 6], dat2[i]);
+ }
+ 
+ TEST_F(TestTensorMath, ConcatenateColumnsCpp) {
+   d.CopyDataFromHostPtr<float>(dat1, 6);
+   e.CopyDataFromHostPtr<float>(dat2, 6);
+   const auto ret = singa::ConcatenateColumns(vector<Tensor>{d, e});
+   EXPECT_EQ(ret.shape(0), d.shape(0));
+   EXPECT_EQ(ret.shape(1), d.shape(1) + e.shape(1));
+ 
+   const float *retPtr = ret.data<float>();
+   for (int i = 0; i < 3; i++) {
+     for (int j = 0; j < 2; j++)
+       EXPECT_FLOAT_EQ(retPtr[i * 4 + j], dat1[i * 2 + j]);
+     for (int j = 0; j < 2; j++)
+       EXPECT_FLOAT_EQ(retPtr[i * 4 + 2 + j], dat2[i * 2 + j]);
+   }
+ }
+ 
+ TEST_F(TestTensorMath, CopyRowsCpp) {
+   const auto ret = singa::CopyRows(e, 1, 2);
+   EXPECT_EQ(ret.shape(0), 1u);
+   EXPECT_EQ(ret.shape(1), e.shape(1));
+   const float *retPtr = ret.data<float>();
+   for (size_t i = 0; i < ret.Size(); i++)
+     EXPECT_FLOAT_EQ(retPtr[i], dat1[1 * 2 + i]);
+ }
+ 
+ TEST_F(TestTensorMath, CopyColumnsCpp) {
+   a.Reshape(Shape{2, 3});
+   const auto ret = singa::CopyColumns(a, 1, 3);
+   EXPECT_EQ(ret.shape(0), a.shape(0));
+   EXPECT_EQ(ret.shape(1), 2u);
+   const float *retPtr = ret.data<float>();
+   for (size_t i = 0; i < ret.shape(0); i++)
+     for (size_t j = 0; j < ret.shape(1); j++)
+       EXPECT_FLOAT_EQ(retPtr[i * ret.shape(1) + j],
+                       dat1[i * a.shape(1) + j + 1]);
+ }
+ 
+ #ifdef USE_CUDA
+ 
+ TEST_F(TestTensorMath, ConcatenateRowsCuda) {
 -  singa::CudaGPU dev;
 -  d.ToDevice(&dev);
 -  e.ToDevice(&dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
++  d.ToDevice(dev);
++  e.ToDevice(dev);
+   d.CopyDataFromHostPtr<float>(dat1, 6);
+   e.CopyDataFromHostPtr<float>(dat2, 6);
+   auto ret = singa::ConcatenateRows(vector<Tensor>{d, e});
+   EXPECT_EQ(ret.shape(0), d.shape(0) + e.shape(0));
+   EXPECT_EQ(ret.shape(1), d.shape(1));
+   ret.ToHost();
+   const float *retPtr = ret.data<float>();
+   for (int i = 0; i < 6; i++) EXPECT_FLOAT_EQ(retPtr[i], dat1[i]);
+   for (int i = 0; i < 6; i++) EXPECT_FLOAT_EQ(retPtr[i + 6], dat2[i]);
+ }
+ 
+ TEST_F(TestTensorMath, ConcatenateColumnsCuda) {
 -  singa::CudaGPU dev;
 -  d.ToDevice(&dev);
 -  e.ToDevice(&dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
++  d.ToDevice(dev);
++  e.ToDevice(dev);
+   d.CopyDataFromHostPtr<float>(dat1, 6);
+   e.CopyDataFromHostPtr<float>(dat2, 6);
+   auto ret = singa::ConcatenateColumns(vector<Tensor>{d, e});
+   ret.ToHost();
+   EXPECT_EQ(ret.shape(0), d.shape(0));
+   EXPECT_EQ(ret.shape(1), d.shape(1) + e.shape(1));
+ 
+   const float *retPtr = ret.data<float>();
+   for (int i = 0; i < 3; i++) {
+     for (int j = 0; j < 2; j++)
+       EXPECT_FLOAT_EQ(retPtr[i * 4 + j], dat1[i * 2 + j]);
+     for (int j = 0; j < 2; j++)
+       EXPECT_FLOAT_EQ(retPtr[i * 4 + 2 + j], dat2[i * 2 + j]);
+   }
+ }
+ 
+ TEST_F(TestTensorMath, CopyRowsCuda) {
 -  singa::CudaGPU dev;
 -  e.ToDevice(&dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
++  e.ToDevice(dev);
+   auto ret = singa::CopyRows(e, 1, 2);
+   ret.ToHost();
+   EXPECT_EQ(ret.shape(0), 1u);
+   EXPECT_EQ(ret.shape(1), e.shape(1));
+   const float *retPtr = ret.data<float>();
+   for (size_t i = 0; i < ret.Size(); i++)
+     EXPECT_FLOAT_EQ(retPtr[i], dat1[1 * 2 + i]);
+ }
+ 
+ TEST_F(TestTensorMath, CopyColumnsCuda) {
 -  singa::CudaGPU dev;
++  auto dev = std::make_shared<singa::CudaGPU>();
+   a.Reshape(Shape{2, 3});
 -  a.ToDevice(&dev);
++  a.ToDevice(dev);
+   auto ret = singa::CopyColumns(a, 1, 3);
+   EXPECT_EQ(ret.shape(0), a.shape(0));
+   EXPECT_EQ(ret.shape(1), 2u);
+   ret.ToHost();
+   const float *retPtr = ret.data<float>();
+   for (size_t i = 0; i < ret.shape(0); i++)
+     for (size_t j = 0; j < ret.shape(1); j++)
+       EXPECT_FLOAT_EQ(retPtr[i * ret.shape(1) + j],
+                       dat1[i * a.shape(1) + j + 1]);
+ }
+ 
  #endif

[6/6] incubator-singa git commit: Merge PR #165 for CnMeM

Posted by wa...@apache.org.

Merge PR #165  for CnMeM

Fixbugs from device type (Device* -> std::shared_ptr<Device>).


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/dd08f413
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/dd08f413
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/dd08f413

Branch: refs/heads/dev
Commit: dd08f413015878365fed32e579c1b7f4ecc81270
Parents: 5651383 9abd791
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Fri Jun 24 13:41:02 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Fri Jun 24 14:49:56 2016 +0800

----------------------------------------------------------------------
 .gitignore                              |   1 +
 CMakeLists.txt                          |   9 +-
 cmake/Dependencies.cmake                |  12 +
 cmake/Templates/singa_config.h.in       |   2 +
 cmake/Utils.cmake                       |  15 +
 include/singa/core/common.h             |  32 +-
 include/singa/core/device.h             |  18 +-
 include/singa/core/memory.h             |  62 +-
 include/singa/core/tensor.h             | 464 +++++++-------
 include/singa/io/decoder.h              |  56 ++
 include/singa/io/encoder.h              |  61 ++
 include/singa/io/reader.h               |  99 +++
 include/singa/io/writer.h               | 112 ++++
 include/singa/model/initializer.h       | 105 ++++
 include/singa/model/layer.h             |  48 +-
 include/singa/model/loss.h              |  47 ++
 include/singa/model/optimizer.h         |  59 +-
 include/singa/utils/channel.h           |  85 +++
 include/singa/utils/timer.h             |   2 +-
 src/CMakeLists.txt                      |  18 +
 src/core/device/cpp_cpu.cc              |   2 +-
 src/core/device/cuda_gpu.cc             |  88 ++-
 src/core/device/device.cc               |  24 +-
 src/core/memory/memory.cc               |  83 +--
 src/core/tensor/math_kernel.cu          | 682 +++++++++++---------
 src/core/tensor/math_kernel.h           |  98 +--
 src/core/tensor/tensor.cc               | 896 ++++++++++++++++-----------
 src/core/tensor/tensor_math.h           | 418 +++++++------
 src/core/tensor/tensor_math_cpp.h       | 629 ++++++++++++++-----
 src/core/tensor/tensor_math_cuda.h      | 429 ++++++++++---
 src/io/binfile_reader.cc                | 113 ++++
 src/io/binfile_writer.cc                | 136 ++++
 src/io/jpg2proto_encoder.cc             |  83 +++
 src/io/proto2jpg_decoder.cc             |  75 +++
 src/model/layer/activation.cc           |  27 +-
 src/model/layer/activation.h            |   7 +-
 src/model/layer/batchnorm.cc            |  11 +-
 src/model/layer/batchnorm.h             |  12 +-
 src/model/layer/convolution.cc          |  13 +-
 src/model/layer/convolution.h           |   7 +-
 src/model/layer/cudnn_activation.cc     |  33 +-
 src/model/layer/cudnn_activation.h      |  11 +-
 src/model/layer/cudnn_batchnorm.cc      | 132 ++--
 src/model/layer/cudnn_batchnorm.h       |  40 +-
 src/model/layer/cudnn_convolution.cc    | 114 ++--
 src/model/layer/cudnn_convolution.h     |   4 +-
 src/model/layer/cudnn_dropout.cc        |  52 +-
 src/model/layer/cudnn_dropout.h         |   4 +-
 src/model/layer/cudnn_lrn.cc            |  78 +--
 src/model/layer/cudnn_lrn.h             |  32 +-
 src/model/layer/cudnn_pooling.cc        |  48 +-
 src/model/layer/cudnn_pooling.h         |   4 +-
 src/model/layer/cudnn_softmax.cc        |  62 +-
 src/model/layer/cudnn_softmax.h         |  11 +-
 src/model/layer/dense.cc                |   7 +-
 src/model/layer/dense.h                 |   6 +-
 src/model/layer/dropout.cc              |   5 +-
 src/model/layer/dropout.h               |   7 +-
 src/model/layer/flatten.cc              |  53 ++
 src/model/layer/flatten.h               |  56 ++
 src/model/layer/lrn.cc                  |   5 +-
 src/model/layer/lrn.h                   |  13 +-
 src/model/layer/pooling.cc              |  13 +-
 src/model/layer/pooling.h               |   8 +-
 src/model/layer/prelu.cc                | 145 +++++
 src/model/layer/prelu.h                 |  66 ++
 src/model/layer/softmax.cc              |  34 +-
 src/model/layer/softmax.h               |  11 +-
 src/model/loss/mse.cc                   |  41 ++
 src/model/loss/mse.h                    |  66 --
 src/model/loss/softmax_cross_entropy.cc |  53 ++
 src/model/metric/accuracy.h             |   5 +-
 src/model/optimizer/adagrad.cc          |  41 ++
 src/model/optimizer/nesterov.cc         |  49 ++
 src/model/optimizer/optimizer.cc        |   2 +-
 src/model/optimizer/rmsprop.cc          |  45 ++
 src/model/optimizer/sgd.cc              |  10 +-
 src/proto/core.proto                    |   7 +-
 src/proto/io.proto                      |  37 ++
 src/proto/model.proto                   |  26 +-
 src/python/device.py                    |  82 +++
 src/python/example_layer.py             |  25 +
 src/python/layer.py                     |  78 ++-
 src/python/swig/core_device.i           |  60 ++
 src/python/swig/core_tensor.i           | 263 ++++++++
 src/python/swig/model_layer.i           |  83 +++
 src/python/swig/singa.i                 |  27 +
 src/python/tensor.py                    | 370 +++++++++++
 src/utils/channel.cc                    | 104 ++++
 test/CMakeLists.txt                     |   3 +-
 test/python/example_test_device.py      |  36 ++
 test/python/example_test_tensor.py      | 179 ++++++
 test/python/unittest_python.py          | 139 +++++
 test/singa/test_activation.cc           |  13 +-
 test/singa/test_adagrad.cc              |  96 +++
 test/singa/test_binfile_rw.cc           |  95 +++
 test/singa/test_channel.cc              |  39 ++
 test/singa/test_cpp_cpu.cc              |  16 +-
 test/singa/test_cross_entropy.cc        | 116 ++++
 test/singa/test_cudnn_activation.cc     |  36 +-
 test/singa/test_cudnn_batchnorm.cc      |  59 +-
 test/singa/test_cudnn_convolution.cc    | 105 ++--
 test/singa/test_cudnn_dropout.cc        |  35 +-
 test/singa/test_cudnn_lrn.cc            |  28 +-
 test/singa/test_cudnn_pooling.cc        |  36 +-
 test/singa/test_cudnn_softmax.cc        | 130 ++--
 test/singa/test_decoder.cc              |  84 +++
 test/singa/test_dense.cc                | 480 +++++++-------
 test/singa/test_dropout.cc              |  17 +-
 test/singa/test_flatten.cc              | 143 +++++
 test/singa/test_initializer.cc          | 148 +++++
 test/singa/test_memory.cc               | 129 ++--
 test/singa/test_mse.cc                  |  12 +-
 test/singa/test_nesterov.cc             | 101 +++
 test/singa/test_prelu.cc                | 245 ++++++++
 test/singa/test_rmsprop.cc              | 105 ++++
 test/singa/test_sgd.cc                  |  32 +-
 test/singa/test_softmax.cc              |  36 +-
 test/singa/test_tensor.cc               |  14 +-
 test/singa/test_tensor_math.cc          | 505 +++++++++++++--
 120 files changed, 8172 insertions(+), 2708 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/CMakeLists.txt
----------------------------------------------------------------------
diff --cc CMakeLists.txt
index c34b6ce,87b3a5d..7a5caf3
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@@ -10,22 -10,23 +10,23 @@@ LIST(APPEND CMAKE_MODULE_PATH ${PROJECT
  IF(UNIX OR APPLE)
    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wall")
  ENDIF()
- 
+ IF(CMAKE_BUILD_TYPE=Debug)
+   SET(NVCC_FLAG "${NVCC_FLAG} -g -G ")
+ ENDIF()
  #message(STATUS "${CMAKE_CXX_FLAGS}")
 -SET(SINGA_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/include;${PROJECT_BINARY_DIR}")
 -#message(STATUS "include path: ${SINGA_INCLUDE_DIR}")
 +SET(SINGA_INCLUDE_DIR
-     #"${CMAKE_SOURCE_DIR}/include;${CMAKE_SOURCE_DIR}/lib/cnmem/lib;${CMAKE_SOURCE_DIR}/lib/cnmen/include;${PROJECT_BINARY_DIR}")
 +    "${CMAKE_SOURCE_DIR}/include;${CMAKE_SOURCE_DIR}/lib/cnmem/include;${PROJECT_BINARY_DIR}")
- #message(STATUS "include path: ${SINGA_INCLUDE_DIR}")
  INCLUDE_DIRECTORIES(${SINGA_INCLUDE_DIR})
  
- #OPTION(CPU_ONLY "use GPU libs" OFF)
  OPTION(USE_CBLAS "Use CBlas libs" ON)
  OPTION(USE_CUDA "Use Cuda libs" ON)
 -OPTION(USE_CUDNN "Use Cudnn libs" ON)
 +OPTION(USE_CUDNN "Use Cudnn libs" OFF)
  OPTION(USE_OPENCV "Use opencv" OFF)
  OPTION(USE_LMDB "Use LMDB libs" OFF)
+ OPTION(USE_PYTHON "Generate py wrappers" OFF)
  
  INCLUDE("cmake/Dependencies.cmake")
+ INCLUDE("cmake/Utils.cmake")
  ADD_DEFINITIONS(-DUSE_CMAKE)
  
  CONFIGURE_FILE (

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/cmake/Dependencies.cmake
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/include/singa/core/common.h
----------------------------------------------------------------------
diff --cc include/singa/core/common.h
index e19022e,22a2b49..cb1bdca
--- a/include/singa/core/common.h
+++ b/include/singa/core/common.h
@@@ -20,7 -20,9 +20,9 @@@
  #define SINGA_CORE_COMMON_H_
  #include <random>
  #include <chrono>
 +#include "./singa/singa_config.h"
+ #include <atomic>
+ #include <memory>
 -#include "./singa_config.h"
  #include "singa/utils/logging.h"
  
  #ifdef USE_CUDA

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/include/singa/core/device.h
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/include/singa/core/memory.h
----------------------------------------------------------------------
diff --cc include/singa/core/memory.h
index e4e1e63,db09043..c35f5d0
--- a/include/singa/core/memory.h
+++ b/include/singa/core/memory.h
@@@ -19,56 -19,10 +19,58 @@@
  #ifndef SINGA_CORE_MEMORY_H_
  #define SINGA_CORE_MEMORY_H_
  
 +#include "cnmem.h"
++#include "singa/singa_config.h"
 +#include <mutex>
 +
  namespace singa {
  
  /// Manage device memory pool including garbage collection, memory opt.
  class VirtualMemory {};
  
 +class DeviceMemPool {
- 	public:
- 	virtual void InitPool() = 0;
- 	virtual void Malloc(void** ptr, const size_t size) = 0;
- 	virtual void Free(void* ptr) = 0;
- 	virtual ~DeviceMemPool(){};
++ public:
++  virtual void InitPool() = 0;
++  virtual void Malloc(void** ptr, const size_t size) = 0;
++  virtual void Free(void* ptr) = 0;
++  virtual ~DeviceMemPool(){};
 +};
 +
++#ifdef USE_CUDA
 +class CnMemPool : public DeviceMemPool {
- 	public:
- 	int status = 1;
++ public:
++  int status = 1;
 +
- 	void InitPool();
++  void InitPool();
 +
- 	/// numDevices: total number of available GPU cards.
- 	/// initSize: all devices will be allocated with this size 
- 	/// manager_flags: pool manager flag (one for all devices)
- 	/// flag = 0; default flag
- 	/// flag = 1: Prevent the manager from growing its memory consumption
- 	/// flag = 2; Prevent the manager from stealing memory.
- 	void InitPool(int numDevices, size_t initSize, unsigned flag);
++  /// numDevices: total number of available GPU cards.
++  /// initSize: all devices will be allocated with this size
++  /// manager_flags: pool manager flag (one for all devices)
++  /// flag = 0; default flag
++  /// flag = 1: Prevent the manager from growing its memory consumption
++  /// flag = 2; Prevent the manager from stealing memory.
++  void InitPool(int numDevices, size_t initSize, unsigned flag);
 +
- 	void Malloc(void** ptr, const size_t size);
- 	void Free(void* ptr);
++  void Malloc(void** ptr, const size_t size);
++  void Free(void* ptr);
 +
- 	// release all memory and set cnmem manager to unintialized 
- 	~CnMemPool();
++  // release all memory and set cnmem manager to unintialized
++  ~CnMemPool();
 +
- 	private:
- 	// whether the (global) memory pool has been initialized
- 	static bool initialized;
- 	// lock on the initialized variable
- 	static std::mutex mtx;
++ private:
++  // whether the (global) memory pool has been initialized
++  static bool initialized;
++  // lock on the initialized variable
++  static std::mutex mtx;
 +};
 +
 +class CudaMemPool : public DeviceMemPool {
- 	public:
- 	void InitPool(){};
- 	void Malloc(void** ptr, const size_t size);
- 	void Free(void* ptr);
- 	~CudaMemPool(){};
++ public:
++  void InitPool(){};
++  void Malloc(void** ptr, const size_t size);
++  void Free(void* ptr);
++  ~CudaMemPool(){};
 +};
- 
++#endif
  }  // namespace singa
  #endif  // SINGA_CORE_MEMORY_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --cc include/singa/core/tensor.h
index 8f73047,eb72bd3..a4f42db
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@@ -65,27 -54,28 +54,29 @@@ class Tensor 
   public:
    ~Tensor();
    Tensor();
 -  explicit Tensor(Shape &&shape, const DataType dtype = kFloat32);
 -  explicit Tensor(const Shape &shape, const DataType dtype = kFloat32);
 -  Tensor(Shape &&shape, Device *dev, const DataType dtype = kFloat32);
 -  Tensor(const Shape &shape, Device *dev, const DataType dtype = kFloat32);
 +  explicit Tensor(Shape &&shape, DataType dtype = kFloat32);
 +  explicit Tensor(const Shape &shape, DataType dtype = kFloat32);
 +  Tensor(Shape &&shape, std::shared_ptr<Device> dev, DataType dtype = kFloat32);
-   Tensor(const Shape &shape, std::shared_ptr<Device> dev, DataType dtype = kFloat32);
++  Tensor(const Shape &shape, std::shared_ptr<Device> dev,
++         DataType dtype = kFloat32);
  
    /// Copy Tensor to share the internal data.  No deep copy.
    Tensor(const Tensor &from);
    /// Copy Tensor to share the internal data.  No deep copy.
    Tensor(Tensor &&from);
  
-   /// For functions in xx_math.cc to access the blob.
-   /// Users should not operate against Blob directly.
-   /// blob_ is allocated in constructors.
-   Blob *blob() const { return blob_; }
+   /// For functions in xx_math.cc to access the block.
+   /// Users should not operate against Block directly.
+   /// block_ is allocated in constructors.
+   Block *block() const { return block_; }
+   void SetBlock(Block* block);
  
 -  Device *device() const { return device_; }
 +  std::shared_ptr<Device> device() const { return device_; }
  
-   /// Return immutable Tensor values with given type.
-   template <typename DType>
-   DType data() const {
-     return static_cast<DType>(blob()->data());
+   /// return immutable Tensor values with given type.
+   template <typename SType>
+   const SType* data() const {
+     return static_cast<const SType*>(block()->data());
    }
  
    /// data type, including kFloat16, kFloat32, kInt
@@@ -192,13 -179,22 +180,22 @@@
   protected:
    bool transpose_ = false;
    DataType data_type_ = kFloat32;
 -  Device *device_ = nullptr;
 +  std::shared_ptr<Device> device_ = nullptr;
-   /// Note: blob_ is allocated in lazy manner to avoid frequent malloc/free.
-   /// If you want to get an allocated Blob, use blob() instead of blob_.
-   Blob *blob_ = nullptr;
-   Shape shape_;
+   /// Note: block_ is allocated in lazy manner to avoid frequent malloc/free.
+   /// If you want to get an allocated Block, use block() instead of block_.
+   Block *block_ = nullptr;
+   Shape shape_ = {};
  };
  
+ typedef Shape::iterator ShapeIter;
+ inline size_t Product(const Shape &shape, int start = 0, size_t len = 0) {
+   if (len == 0) len = shape.size();
+   CHECK_LE(len, shape.size());
+   size_t v = 1;
+   for (unsigned int i = start; i < len; i++) v *= shape[i];
+   return v;
+ }
+ 
  inline void CheckDataTypeAndLang(const Tensor &in1, const Tensor &in2) {
    CHECK_EQ(in1.data_type(), in2.data_type());
    CHECK_EQ(in1.device()->lang(), in2.device()->lang());

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/include/singa/model/layer.h
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/include/singa/model/loss.h
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/include/singa/utils/timer.h
----------------------------------------------------------------------
diff --cc include/singa/utils/timer.h
index a54829d,a54829d..bdd6c5c
--- a/include/singa/utils/timer.h
+++ b/include/singa/utils/timer.h
@@@ -19,7 -19,7 +19,7 @@@ class Timer 
    /// Return the duration since last call to Tick() or since the creation of
    /// Timer. The template arg must be from Second or Millisecond or Hour.
    /// The returned value is the count of the time metric.
--  template <typename T>
++  template <typename T = Milliseconds>
    int Elapsed() const {
      static_assert(std::is_same<T, Seconds>::value ||
                        std::is_same<T, Milliseconds>::value ||

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/CMakeLists.txt
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/core/device/cpp_cpu.cc
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/core/device/cuda_gpu.cc
----------------------------------------------------------------------
diff --cc src/core/device/cuda_gpu.cc
index 4da292f,5d4e1ed..5879c58
--- a/src/core/device/cuda_gpu.cc
+++ b/src/core/device/cuda_gpu.cc
@@@ -32,8 -32,8 +32,7 @@@ const cudaMemcpyKind copyKind[] = {cuda
                                     cudaMemcpyDeviceToDevice};
  
  CudaGPU::~CudaGPU() {
--  if (ctx_.cublas_handle)
--    CUBLAS_CHECK(cublasDestroy(ctx_.cublas_handle));
++  if (ctx_.cublas_handle) CUBLAS_CHECK(cublasDestroy(ctx_.cublas_handle));
    if (ctx_.curand_generator)
      CURAND_CHECK(curandDestroyGenerator(ctx_.curand_generator));
  #ifdef USE_CUDNN
@@@ -42,14 -42,13 +41,12 @@@
      CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
    }
  #endif
- 	delete pool;
++  delete pool;
  }
  
--CudaGPU::CudaGPU(int id, int num_executors,
--                       string scheduler, string vm)
++CudaGPU::CudaGPU(int id, int num_executors, string scheduler, string vm)
      : Device(id, num_executors, scheduler, vm) {
--  if (id == -1)
--    id = FindDevice(0);
++  if (id == -1) id = FindDevice(0);
    lang_ = kCuda;
    ctx_.stream = NULL;  // use the default sync stream
    // TODO(wangwei) create one handle for each steam?
@@@ -68,62 -67,20 +65,57 @@@
    auto status = cudnnCreate(&ctx_.cudnn_handle);
    CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
  #endif  // USE_CUDNN
- 	
- 	// initialize cnmem memory management as default
- 	pool = new CnMemPool();
- 	((CnMemPool*)pool)->InitPool();
++
++  // initialize cnmem memory management as default
++  pool = new CnMemPool();
++  ((CnMemPool*)pool)->InitPool();
  }
  
- CudaGPU::CudaGPU(const MemPoolConf& mem_conf,int id, int num_executors,
-                        string scheduler)
 -void CudaGPU::SetRandSeed(unsigned seed) {
 -  CHECK(ctx_.curand_generator);
++CudaGPU::CudaGPU(const MemPoolConf& mem_conf, int id, int num_executors,
++                 string scheduler)
 +    : Device(id, num_executors, scheduler, "gc-only") {
-   if (id == -1)
-     id = FindDevice(0);
++  if (id == -1) id = FindDevice(0);
 +  lang_ = kCuda;
 +  ctx_.stream = NULL;  // use the default sync stream
 +  // TODO(wangwei) create one handle for each steam?
 +  CUDA_CHECK(cudaSetDevice(FindDevice(0)));
 +  // use curandCreateGeneratorHost for CudaHost device
    CURAND_CHECK(
 -      curandSetPseudoRandomGeneratorSeed(ctx_.curand_generator, seed));
 +      curandCreateGenerator(&ctx_.curand_generator, CURAND_RNG_PSEUDO_DEFAULT));
 +  auto seed = std::chrono::system_clock::now().time_since_epoch().count();
 +  SetRandSeed(seed);
 +  // TODO(wangwei) if one generator per stream, then need diff offset per gen?
 +  CURAND_CHECK(curandSetGeneratorOffset(ctx_.curand_generator, 0));
 +  CUBLAS_CHECK(cublasCreate(&(ctx_.cublas_handle)));
 +
 +#ifdef USE_CUDNN
 +  // TODO(wangwei) create one handle for each stream?
 +  auto status = cudnnCreate(&ctx_.cudnn_handle);
 +  CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
 +#endif  // USE_CUDNN
 +
- 	// initialize memory management for cuda devices
- 	string memoryPoolType = mem_conf.type();
- 	if(memoryPoolType.compare("cnmem") == 0) {
- 		pool = new CnMemPool();
- 		int num_devices = mem_conf.num_devices();
- 		size_t alloc_size = mem_conf.alloc_size();
- 		unsigned flag = mem_conf.cnmemflag();
- 		((CnMemPool*)pool)->InitPool(num_devices, alloc_size, flag);
- 	}
- 	else {
- 		pool = new CudaMemPool();
- 	}
++  // initialize memory management for cuda devices
++  string memoryPoolType = mem_conf.type();
++  if (memoryPoolType.compare("cnmem") == 0) {
++    pool = new CnMemPool();
++    int num_devices = mem_conf.num_devices();
++    size_t alloc_size = mem_conf.alloc_size();
++    unsigned flag = mem_conf.cnmemflag();
++    ((CnMemPool*)pool)->InitPool(num_devices, alloc_size, flag);
++  } else {
++    pool = new CudaMemPool();
++  }
  }
  
 -void CudaGPU::DoExec(function<void(Context*)>&& fn, int executor) {
 -  fn(&ctx_);
 +void CudaGPU::SetRandSeed(unsigned seed) {
 +  CHECK(ctx_.curand_generator);
-   CURAND_CHECK(
-       curandSetPseudoRandomGeneratorSeed(ctx_.curand_generator, seed));
++  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(ctx_.curand_generator, seed));
  }
  
- void CudaGPU::DoExec(function<void(Context*)>&& fn, int executor) {
-   fn(&ctx_);
- }
++void CudaGPU::DoExec(function<void(Context*)>&& fn, int executor) { fn(&ctx_); }
 +
  void CudaGPU::CopyToFrom(void* dst, const void* src, size_t nBytes,
--                            CopyDirection direction, Context* ctx) {
++                         CopyDirection direction, Context* ctx) {
    cudaMemcpy(dst, src, nBytes, copyKind[direction]);
    // TODO(wangwei) use async copy
    // cudaMemcpyAsync(dst, src, nBytes,cudaMemcpyDefault, ctx_.stream);
@@@ -133,22 -90,19 +125,21 @@@
  void* CudaGPU::Malloc(int size) {
    void* ptr = nullptr;
    if (size > 0) {
- 		//CUDA_CHECK(cudaMalloc((void**)&ptr,size));
- 		pool->Malloc((void**)&ptr,size);
 -    CUDA_CHECK(cudaMalloc(&ptr, size));
++    // CUDA_CHECK(cudaMalloc((void**)&ptr,size));
++    pool->Malloc((void**)&ptr, size);
      CUDA_CHECK(cudaMemset(ptr, 0, size));
    }
    return ptr;
  }
  
--  /// Free cpu memory.
++/// Free cpu memory.
  void CudaGPU::Free(void* ptr) {
 -  if (ptr != nullptr)
 -    CUDA_CHECK(cudaFree(ptr));
 +  if (ptr != nullptr) {
- 		//CUDA_CHECK(cudaFree(ptr));
- 		pool->Free(ptr);
- 	}
++    // CUDA_CHECK(cudaFree(ptr));
++    pool->Free(ptr);
++  }
  }
  
--
  // ==========Following code is from Caffe src/caffe/common.cpp=================
  
  void CudaGPU::DeviceQuery() {
@@@ -169,20 -123,20 +160,18 @@@
    LOG(INFO) << "Warp size:                     " << prop.warpSize;
    LOG(INFO) << "Maximum memory pitch:          " << prop.memPitch;
    LOG(INFO) << "Maximum threads per block:     " << prop.maxThreadsPerBlock;
--  LOG(INFO) << "Maximum dimension of block:    "
--      << prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", "
--      << prop.maxThreadsDim[2];
--  LOG(INFO) << "Maximum dimension of grid:     "
--      << prop.maxGridSize[0] << ", " << prop.maxGridSize[1] << ", "
--      << prop.maxGridSize[2];
++  LOG(INFO) << "Maximum dimension of block:    " << prop.maxThreadsDim[0]
++            << ", " << prop.maxThreadsDim[1] << ", " << prop.maxThreadsDim[2];
++  LOG(INFO) << "Maximum dimension of grid:     " << prop.maxGridSize[0] << ", "
++            << prop.maxGridSize[1] << ", " << prop.maxGridSize[2];
    LOG(INFO) << "Clock rate:                    " << prop.clockRate;
    LOG(INFO) << "Total constant memory:         " << prop.totalConstMem;
    LOG(INFO) << "Texture alignment:             " << prop.textureAlignment;
--  LOG(INFO) << "Concurrent copy and execution: "
--      << (prop.deviceOverlap ? "Yes" : "No");
++  LOG(INFO) << "Concurrent copy and execution: " << (prop.deviceOverlap ? "Yes"
++                                                                        : "No");
    LOG(INFO) << "Number of multiprocessors:     " << prop.multiProcessorCount;
    LOG(INFO) << "Kernel execution timeout:      "
--      << (prop.kernelExecTimeoutEnabled ? "Yes" : "No");
++            << (prop.kernelExecTimeoutEnabled ? "Yes" : "No");
    return;
  }
  
@@@ -203,6 -157,6 +192,5 @@@ int CudaGPU::FindDevice(const int start
    return -1;
  }
  
--
  }  // namespace singa
  #endif  // USE_CUDA

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/core/device/device.cc
----------------------------------------------------------------------
diff --cc src/core/device/device.cc
index 1889339,36381e4..6775e40
--- a/src/core/device/device.cc
+++ b/src/core/device/device.cc
@@@ -22,11 -22,11 +22,11 @@@ namespace singa 
  Device::Device(int id, int num_executors, string scheduler, string vm)
      : id_(id), num_executors_(num_executors) {
        // TODO(wangwei) create scheduler and vm.
 -  host_ = &defaultDevice;
 +  host_ = defaultDevice;
  }
  
- void Device::Exec(function<void(Context*)>&& fn, const vector<Blob*> read_blobs,
-                     const vector<Blob*> write_blobs, bool use_rand_generator) {
+ void Device::Exec(function<void(Context*)>&& fn, const vector<Block*> read_blocks,
+                     const vector<Block*> write_blocks, bool use_rand_generator) {
    // TODO(wangwei) execute operations scheduled by the scheduler.
    DoExec(std::move(fn), 0);
  }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/core/memory/memory.cc
----------------------------------------------------------------------
diff --cc src/core/memory/memory.cc
index 304c101,a1cf5db..7ac6792
--- a/src/core/memory/memory.cc
+++ b/src/core/memory/memory.cc
@@@ -16,71 -16,5 +16,74 @@@
   * limitations under the License.
   */
  
--
  #include "singa/core/memory.h"
 +#include "singa/utils/logging.h"
++#include "singa/proto/core.pb.h"
 +#include <iostream>
 +
++#ifdef USE_CUDA
 +namespace singa {
- 
 +bool singa::CnMemPool::initialized = false;
 +std::mutex singa::CnMemPool::mtx;
- 
 +void CnMemPool::InitPool(int numDevices, size_t initSize, unsigned flag) {
- 	mtx.lock();
- 	if(!initialized) {
- 		CHECK_GE(numDevices, 1);
- 		cnmemDevice_t* settingPtr = new cnmemDevice_t[numDevices];
- 		for(int i = 0; i < numDevices; i++) {
- 			settingPtr[i].device = i;
- 			settingPtr[i].size = initSize;
- 			settingPtr[i].numStreams = 0;
- 			settingPtr[i].streams = NULL;
- 			settingPtr[i].streamSizes = 0;
- 		}
- 		cnmemStatus_t status = cnmemInit(numDevices, settingPtr, flag);
- 		CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS) << " " << cnmemGetErrorString(status);
- 		delete[] settingPtr;
- 		initialized = true;
- 	}
- 	mtx.unlock();
++  mtx.lock();
++  const size_t kNBytesPerMB = (1u << 20);
++  if (!initialized) {
++    CHECK_GE(numDevices, 1);
++    cnmemDevice_t* settingPtr = new cnmemDevice_t[numDevices];
++    for (int i = 0; i < numDevices; i++) {
++      settingPtr[i].device = i;
++      settingPtr[i].size = initSize * kNBytesPerMB;
++      settingPtr[i].numStreams = 0;
++      settingPtr[i].streams = NULL;
++      settingPtr[i].streamSizes = 0;
++    }
++    cnmemStatus_t status = cnmemInit(numDevices, settingPtr, flag);
++    CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS)
++        << " " << cnmemGetErrorString(status);
++    delete[] settingPtr;
++    initialized = true;
++  }
++  mtx.unlock();
 +}
 +
 +void CnMemPool::InitPool() {
- 		int defaultNumDevices = 1;
- 		size_t defaultSize = 1000000U;
- 		InitPool(defaultNumDevices,defaultSize,cnmemManagerFlags_t::CNMEM_FLAGS_DEFAULT);
++  MemPoolConf conf;
++  InitPool(conf.num_devices(), conf.alloc_size(),
++           cnmemManagerFlags_t::CNMEM_FLAGS_DEFAULT);
 +}
 +
 +CnMemPool::~CnMemPool() {
- 	mtx.lock();
- 	if(initialized) {
- 		cnmemStatus_t status = cnmemFinalize();
- 		CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS) << " " << cnmemGetErrorString(status);
- 		initialized = false;
- 	}
- 	mtx.unlock();
++  mtx.lock();
++  if (initialized) {
++    cnmemStatus_t status = cnmemFinalize();
++    CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS)
++        << " " << cnmemGetErrorString(status);
++    initialized = false;
++  }
++  mtx.unlock();
 +}
 +
- 
 +void CnMemPool::Malloc(void** ptr, const size_t size) {
- 	cnmemStatus_t status = cnmemMalloc(ptr,size,NULL);
- 	CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS) << " " << cnmemGetErrorString(status);
++  cnmemStatus_t status = cnmemMalloc(ptr, size, NULL);
++  CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS)
++      << " " << cnmemGetErrorString(status);
 +}
 +
 +void CnMemPool::Free(void* ptr) {
- 	cnmemStatus_t status = cnmemFree(ptr,NULL);
- 	CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS) << " " << cnmemGetErrorString(status);
++  cnmemStatus_t status = cnmemFree(ptr, NULL);
++  CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS)
++      << " " << cnmemGetErrorString(status);
 +}
 +
 +void CudaMemPool::Malloc(void** ptr, const size_t size) {
- 	cudaError_t status = cudaMalloc(ptr,size);
- 	CHECK_EQ(status, cudaError_t::cudaSuccess);
++  cudaError_t status = cudaMalloc(ptr, size);
++  CHECK_EQ(status, cudaError_t::cudaSuccess);
 +}
 +
 +void CudaMemPool::Free(void* ptr) {
- 	cudaError_t status = cudaFree(ptr);
- 	CHECK_EQ(status, cudaError_t::cudaSuccess);
++  cudaError_t status = cudaFree(ptr);
++  CHECK_EQ(status, cudaError_t::cudaSuccess);
 +}
- 
 +}
++#endif

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/core/tensor/math_kernel.cu
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/core/tensor/math_kernel.h
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --cc src/core/tensor/tensor.cc
index a5b43d8,9b3eeff..b852a54
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@@ -25,54 -25,66 +25,65 @@@
  namespace singa {
  
  Tensor::~Tensor() {
-   if (blob_ != nullptr && blob_->DecRefCount() == 0)
-     device_->FreeBlob(blob_);
-   blob_ = nullptr;
 -  // LOG(ERROR) << "~";
+   if (block_ != nullptr && block_->DecRefCount() == 0)
+     device_->FreeBlock(block_);
+   block_ = nullptr;
  }
  
 -Tensor::Tensor() { device_ = &defaultDevice; }
 +Tensor::Tensor() { device_ = defaultDevice; }
  
 -Tensor::Tensor(const Shape &shape, const DataType dtype)
 -    : data_type_(dtype), device_(&defaultDevice), shape_(shape) {
 -  device_ = &defaultDevice;
 +Tensor::Tensor(const Shape &shape, DataType dtype)
 +    : data_type_(dtype), device_(defaultDevice), shape_(shape) {
 +  device_ = defaultDevice;
-   blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
+   block_ = device_->NewBlock(Product(shape_) * SizeOf(data_type_));
  }
 -Tensor::Tensor(Shape &&shape, const DataType dtype)
 -    : data_type_(dtype), device_(&defaultDevice), shape_(shape) {
 -  device_ = &defaultDevice;
 +Tensor::Tensor(Shape &&shape, DataType dtype)
 +    : data_type_(dtype), device_(defaultDevice), shape_(shape) {
 +  device_ = defaultDevice;
-   blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
+   block_ = device_->NewBlock(Product(shape_) * SizeOf(data_type_));
  }
 -Tensor::Tensor(const Shape &shape, Device *device, const DataType dtype)
 +Tensor::Tensor(const Shape &shape, std::shared_ptr<Device> device, DataType dtype)
      : data_type_(dtype), device_(device), shape_(shape) {
-   blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
+   block_ = device_->NewBlock(Product(shape_) * SizeOf(data_type_));
  }
 -Tensor::Tensor(Shape &&shape, Device *device, const DataType dtype)
 +Tensor::Tensor(Shape &&shape, std::shared_ptr<Device> device, DataType dtype)
      : data_type_(dtype), device_(device), shape_(shape) {
-   blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
- }
- Tensor::Tensor(const Tensor &t)
-     : transpose_(t.transpose_), data_type_(t.data_type_), device_(t.device_),
-       blob_(t.blob()), shape_(t.shape_) {
-   blob_->IncRefCount();
-   // LOG(ERROR) << "const&";
- }
- 
- Tensor::Tensor(Tensor &&t)
-     : transpose_(t.transpose_), data_type_(t.data_type_), device_(t.device_),
-       shape_(std::move(t.shape_)) {
-   blob_ = t.blob_;
-   t.blob_ = nullptr;
-   // LOG(ERROR) << "&&";
- }
- 
- void Tensor::ResetLike(const Tensor &t) {
-   if (blob_ == nullptr || device_ != t.device_ || MemSize() != t.MemSize()) {
-     if (blob_ != nullptr && blob_->DecRefCount() == 0)
-       device_->FreeBlob(blob_);
-     shape_ = t.shape_;
-     device_ = t.device_;
-     data_type_ = t.data_type_;
-     blob_ = device_->NewBlob(t.MemSize());
+   block_ = device_->NewBlock(Product(shape_) * SizeOf(data_type_));
+ }
+ Tensor::Tensor(const Tensor &in)
+     : transpose_(in.transpose_),
+       data_type_(in.data_type_),
+       device_(in.device_),
+       block_(in.block()),
+       shape_(in.shape_) {
+   block_->IncRefCount();
+ }
+ 
+ Tensor::Tensor(Tensor &&in)
+     : transpose_(in.transpose_),
+       data_type_(in.data_type_),
+       device_(in.device_),
+       shape_(std::move(in.shape_)) {
+   block_ = in.block_;
+   in.block_ = nullptr;
+ }
+ 
+ void Tensor::SetBlock(Block* block) {
+   LOG(WARNING) << "Pls avoid using this function, which may have side-effect.";
+   if (block_ != nullptr)
+     if (block_->DecRefCount())
+       device_->FreeBlock(block_);
+   block_ = block;
+ }
+ 
+ void Tensor::ResetLike(const Tensor &in) {
+   if (block_ == nullptr || device_ != in.device_ || MemSize() != in.MemSize()) {
+     if (block_ != nullptr && block_->DecRefCount() == 0)
+       device_->FreeBlock(block_);
+     shape_ = in.shape_;
+     device_ = in.device_;
+     data_type_ = in.data_type_;
+     block_ = device_->NewBlock(in.MemSize());
    }
  }
  
@@@ -228,13 -245,13 +244,13 @@@ void CopyDataToFrom(Tensor *dst, const 
    auto width = SizeOf(src.data_type());
    CHECK_EQ(width, SizeOf(dst->data_type()));
    size_t nBytes = num * width;
-   dst_offset *= width;
-   src_offset *= width;
-   CHECK_GE(src.MemSize(), src_offset + nBytes);
-   CHECK_GE(dst->MemSize(), dst_offset + nBytes);
+   auto d_offset = dst_offset * width;
+   auto s_offset = src_offset * width;
+   CHECK_GE(src.MemSize(), s_offset + nBytes);
+   CHECK_GE(dst->MemSize(), d_offset + nBytes);
  
 -  Device *src_dev = src.device(), *dst_dev = dst->device();
 +  std::shared_ptr<Device> src_dev = src.device(), dst_dev = dst->device();
-   Blob *from = src.blob(), *to = dst->blob();
+   Block *from = src.block(), *to = dst->block();
    if (dst_dev->lang() != src_dev->lang()) {
      // let the none cpp device conduct copy op
      if (dst_dev->lang() == kCpp) {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/batchnorm.cc
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/batchnorm.h
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_activation.cc
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_activation.h
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_batchnorm.cc
----------------------------------------------------------------------
diff --cc src/model/layer/cudnn_batchnorm.cc
index 8288a41,0e597fe..a1e9e50
--- a/src/model/layer/cudnn_batchnorm.cc
+++ b/src/model/layer/cudnn_batchnorm.cc
@@@ -30,7 -30,7 +30,7 @@@ CudnnBatchNorm::~CudnnBatchNorm() 
    }
  }
  
--void CudnnBatchNorm::ToDevice(Device* device) {
++void CudnnBatchNorm::ToDevice(std::shared_ptr<Device> device) {
    BatchNorm::ToDevice(device);
    resultSaveMean_.ToDevice(device);
    resultSaveVariance_.ToDevice(device);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_batchnorm.h
----------------------------------------------------------------------
diff --cc src/model/layer/cudnn_batchnorm.h
index 8598b65,36dbbce..4f46452
--- a/src/model/layer/cudnn_batchnorm.h
+++ b/src/model/layer/cudnn_batchnorm.h
@@@ -29,31 -29,29 +29,29 @@@
  namespace singa {
  class CudnnBatchNorm : public BatchNorm {
   public:
-    ~CudnnBatchNorm();
-    /// \copy doc Layer::layer_type()
-    const std::string layer_type() const override {
-      return "CudnnBatchNorm";
-    }
+   ~CudnnBatchNorm();
+   /// \copy doc Layer::layer_type()
+   const std::string layer_type() const override { return "CudnnBatchNorm"; }
  
-    void Setup(const LayerConf& conf) override;
+   void Setup(const Shape& in_sample, const LayerConf& conf) override;
  
-    const Tensor Forward(int flag, const Tensor& input)
-      override;
-    const std::pair<Tensor, vector<Tensor>> Backward(
-        int flag, const Tensor& grad) override;
+   const Tensor Forward(int flag, const Tensor& input) override;
+   const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+                                                    const Tensor& grad) override;
 -  void ToDevice(Device* device) override;
++  void ToDevice(std::shared_ptr<Device> device) override;
  
-    /// Init cudnn related data structures.
-    void InitCudnn(const Shape& shape, DataType dtype);
-    void ToDevice(Device* device) override;
+  private:
+   /// Init cudnn related data structures.
+   void InitCudnn(const Shape& shape, DataType dtype);
  
   private:
-    bool has_init_cudnn_ = false;
-    cudnnBatchNormMode_t mode_;
-    cudnnLRNDescriptor_t lrn_desc_;
-    cudnnTensorDescriptor_t shape_desc_, param_desc_;
-    Tensor resultSaveMean_, resultSaveVariance_;
-    
- }; // class CudnnBatchNorm
+   bool has_init_cudnn_ = false;
+   cudnnBatchNormMode_t mode_;
+   cudnnLRNDescriptor_t lrn_desc_ = nullptr;
+   cudnnTensorDescriptor_t shape_desc_ = nullptr, param_desc_ = nullptr;
+   Tensor resultSaveMean_, resultSaveVariance_;
+ 
+ };  // class CudnnBatchNorm
  }  // namespace
  
  #endif  // USE_CUDNN

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_convolution.cc
----------------------------------------------------------------------
diff --cc src/model/layer/cudnn_convolution.cc
index b80c3bd,8cdfc07..d5ac2a3
--- a/src/model/layer/cudnn_convolution.cc
+++ b/src/model/layer/cudnn_convolution.cc
@@@ -46,7 -46,7 +46,7 @@@ void CudnnConvolution::Setup(const Shap
           "limited_workspace, no_workspace and autotune";
  }
  
--void CudnnConvolution::ToDevice(Device *device) {
++void CudnnConvolution::ToDevice(std::shared_ptr<Device> device) {
    weight_.ToDevice(device);
    bias_.ToDevice(device);
    workspace_.ToDevice(device);
@@@ -55,7 -55,7 +55,7 @@@
  void CudnnConvolution::InitCudnn(const Tensor &input) {
    CHECK(!has_init_cudnn_);
    DataType dtype = input.data_type();
--  Device *dev = input.device();
++  auto dev = input.device();
    Context *ctx = dev->context(0);
    size_t batchsize = input.shape(0);
    CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
@@@ -161,7 -161,7 +161,7 @@@ const Tensor CudnnConvolution::Forward(
    if (flag & kTrain) buf_.push(input);  // buffer the input for backward
    size_t batchsize = input.shape()[0];
    DataType dtype = input.data_type();
--  Device *dev = input.device();
++  auto dev = input.device();
  
    if (!has_init_cudnn_) InitCudnn(input);
  

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_convolution.h
----------------------------------------------------------------------
diff --cc src/model/layer/cudnn_convolution.h
index 152d797,6c15839..cd0471f
--- a/src/model/layer/cudnn_convolution.h
+++ b/src/model/layer/cudnn_convolution.h
@@@ -41,9 -41,9 +41,9 @@@ class CudnnConvolution : public Convolu
                                                     const Tensor &grad) override;
  
    /// \copydoc Layer::Setup(const LayerConf&);
-   void Setup(const LayerConf &conf) override;
+   void Setup(const Shape& in_sample, const LayerConf &conf) override;
  
--  void ToDevice(Device *device) override;
++  void ToDevice(std::shared_ptr<Device> device) override;
  
    size_t workspace_byte_limit() { return workspace_byte_limit_; }
    string prefer() { return prefer_; }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_dropout.cc
----------------------------------------------------------------------
diff --cc src/model/layer/cudnn_dropout.cc
index 64a581b,877dd12..2e2e12b
--- a/src/model/layer/cudnn_dropout.cc
+++ b/src/model/layer/cudnn_dropout.cc
@@@ -34,8 -34,8 +34,8 @@@ CudnnDropout::~CudnnDropout() 
    if (y_desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(y_desc_));
  }
  
--void CudnnDropout::InitCudnn(int size, DataType dtype, Device* dev,
--                             Context* ctx) {
++void CudnnDropout::InitCudnn(int size, DataType dtype,
++                             std::shared_ptr<Device> dev, Context* ctx) {
    CHECK(!has_init_cudnn_);
    CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
    CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
@@@ -65,13 -65,11 +65,11 @@@ const Tensor CudnnDropout::Forward(int 
    if (flag & kTrain) {
      auto size = input.Size();
      DataType dtype = input.data_type();
--    Device* dev = input.device();
++    auto dev = input.device();
      if (!has_init_cudnn_) {
-       input.device()->Exec(
-           [size, dtype, this, dev](Context* ctx) {
-             this->InitCudnn(size, dtype, dev, ctx);
-           },
-           {}, {this->state_.blob()});
+       input.device()->Exec([size, dtype, this, dev](Context* ctx) {
+         this->InitCudnn(size, dtype, dev, ctx);
+       }, {}, {this->state_.block()});
      }
      Tensor output;
      output.ResetLike(input);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_dropout.h
----------------------------------------------------------------------
diff --cc src/model/layer/cudnn_dropout.h
index da3d1d2,83572cf..6809653
--- a/src/model/layer/cudnn_dropout.h
+++ b/src/model/layer/cudnn_dropout.h
@@@ -42,8 -42,9 +42,10 @@@ class CudnnDropout : public Dropout 
    const std::pair<Tensor, vector<Tensor>> Backward(int flag,
                                                     const Tensor& grad) override;
  
+  private:
    /// Init cudnn related data structures.
--  void InitCudnn(int size, DataType dtype, Device* dev, Context* ctx);
++  void InitCudnn(int size, DataType dtype, std::shared_ptr<Device> dev,
++                 Context* ctx);
  
   private:
    bool has_init_cudnn_ = false;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_lrn.h
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_pooling.cc
----------------------------------------------------------------------
diff --cc src/model/layer/cudnn_pooling.cc
index 842685d,9d288c0..6d7a5b1
--- a/src/model/layer/cudnn_pooling.cc
+++ b/src/model/layer/cudnn_pooling.cc
@@@ -82,7 -82,7 +82,7 @@@ const Tensor CudnnPooling::Forward(int 
    CHECK_EQ(input.nDim(), 4u);
    size_t batchsize = input.shape(0);
    DataType dtype = input.data_type();
--  Device *dev = input.device();
++  auto dev = input.device();
    if (!has_init_cudnn_) InitCudnn(input);
  
    Shape shape{batchsize, channels_, pooled_height_, pooled_width_};

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_pooling.h
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/cudnn_softmax.cc
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/dense.cc
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/dense.h
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/dropout.cc
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/dropout.h
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/prelu.cc
----------------------------------------------------------------------
diff --cc src/model/layer/prelu.cc
index 0000000,83a56fa..6eb09d9
mode 000000,100644..100644
--- a/src/model/layer/prelu.cc
+++ b/src/model/layer/prelu.cc
@@@ -1,0 -1,145 +1,145 @@@
+ /**
+  * Licensed to the Apache Software Foundation (ASF) under one
+  * or more contributor license agreements.  See the NOTICE file
+  * distributed with this work for additional information
+  * regarding copyright ownership.  The ASF licenses this file
+  * to you under the Apache License, Version 2.0 (the
+  * "License"); you may not use this file except in compliance
+  * with the License.  You may obtain a copy of the License at
+  *
+  *     http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing, software
+  * distributed under the License is distributed on an "AS IS" BASIS,
+  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  * See the License for the specific language governing permissions and
+  * limitations under the License.
+  */
+ 
+ #include "singa/model/layer.h"
+ #include "./prelu.h"
+ namespace singa {
+ 
+ void PReLU::Setup(const Shape& in_sample, const LayerConf &conf) {
+   Layer::Setup(in_sample, conf);
+   out_sample_shape_ = in_sample;
+   channel_shared_ = conf.prelu_conf().channel_shared();
+   format_ = conf.prelu_conf().format();
+   // Push back params into param_values_
+   for (const auto &spec : conf.param()) param_specs_.push_back(spec);
+   param_values_.push_back(&a_);
+ }
+ 
+ const Tensor PReLU::Forward(int flag, const Tensor &input) {
+   Tensor output;
+   if (!channel_shared_) {
+     size_t n, c, h, w;
+     Tensor temp = (input <= 0.f);
+     if (temp.nDim() == 4) {
+       if (format_ == "NCHW") {
+         n = temp.shape(0);
+         c = temp.shape(1);
+         h = temp.shape(2);
+         w = temp.shape(3);
+         temp.Reshape(Shape{n * c, h * w});
+         Tensor temp_a(Shape{n, c}, input.device(), input.data_type());
+         Uniform(1.f, 1.f, &temp_a);
+         MultRow(a_, &temp_a);
+         temp_a.Reshape(Shape{n * c});
+         MultColumn(temp_a, &temp);
+       } else if (format_ == "NHWC") {
+         n = temp.shape(0);
+         h = temp.shape(1);
+         w = temp.shape(2);
+         c = temp.shape(3);
+         temp.Reshape(Shape{n * h * w, c});
+         MultRow(a_, &temp);
+       } else {
+         LOG(FATAL) << "Incorrect input format for prelu layer.";
+       }
+     } else {
+       LOG(FATAL) << "Incorrect input format for prelu layer.";
+     }
+     output = input * ((input > 0.f) + temp);
+   } else {
+     // share the first param of Tensor A along all channels
+     LOG(FATAL) << "Not implemented";
+   // TODO(wangwei) cannot access the data in this way. The data could be on GPU.
+     auto a = a_.data<float>()[0];
+     output = input * ((input > 0.f) + (input <= 0.f) * a);
+   }
+   if (flag & kTrain) buf_.push(input);
+   return output;
+ }
+ 
+ const std::pair<Tensor, vector<Tensor> > PReLU::Backward(int flag,
+                                                          const Tensor &grad) {
+   vector<Tensor> param_grad;
+   CHECK(!buf_.empty());
+   Tensor input_grad, input = buf_.top();
+   buf_.pop();
+   Tensor da;
+   da.ResetLike(a_);
+   if (!channel_shared_) {
+     size_t n, c, h, w;
+     Tensor temp1 = (input <= 0.f);
+     if (temp1.nDim() == 4) {
+       if (format_ == "NCHW") {
+         n = temp1.shape(0);
+         c = temp1.shape(1);
+         h = temp1.shape(2);
+         w = temp1.shape(3);
+         temp1.Reshape(Shape{n * c, h * w});
+         Tensor temp_a(Shape{n, c}, grad.device(), grad.data_type());
+         Uniform(1.f, 1.f, &temp_a);
+         MultRow(a_, &temp_a);
+         temp_a.Reshape(Shape{n * c});
+         MultColumn(temp_a, &temp1);
+         temp1.Reshape(Shape{n, c, h, w});
+       } else if (format_ == "NHWC") {
+         n = temp1.shape(0);
+         h = temp1.shape(1);
+         w = temp1.shape(2);
+         c = temp1.shape(3);
+         temp1.Reshape(Shape{n * h * w, c});
+         MultRow(a_, &temp1);
+         temp1.Reshape(Shape{n, h, w, c});
+       } else {
+         LOG(FATAL) << "Incorrect input format for prelu layer.";
+       }
+     } else {
+       LOG(FATAL) << "Incorrect input format for prelu layer.";
+     }
+     input_grad = grad * input * ((input > 0.f) + temp1);
+     Tensor temp2 = grad * input * (input <= 0.f);
+     if (format_ == "NCHW") {
+       Tensor temp3(Shape{n * c}, grad.device(), grad.data_type());
+       temp2.Reshape(Shape{n * c, h * w});
+       SumColumns(temp2, &temp3);
+       temp3.Reshape(Shape{n, c});
+       SumRows(temp3, &da);
+     } else if (format_ == "NHWC") {
+       temp2.Reshape(Shape{n * h * w, c});
+       SumRows(temp2, &da);
+     }
+   } else {
+     // share the first param of Tensor A along all channels
+     LOG(FATAL) << "Not Implemented";
+     // TODO(wangwei) cannot access the data in this way. The data could be on GPU.
+     auto a = a_.data<float>()[0];
+     input_grad = grad * input * ((input > 0.f) + (input <= 0.f) * a);
+     Tensor temp = grad * input * (input <= 0.f);
+     float sum = Sum<float>(temp);
+     Uniform(1.f, 1.f, &da);
+     da *= sum;
+   }
+   param_grad.push_back(da);
+   return std::make_pair(input_grad, param_grad);
+ }
+ 
 -void PReLU::ToDevice(Device *device) {
++void PReLU::ToDevice(std::shared_ptr<Device> device) {
+   Layer::ToDevice(device);
+   a_.ToDevice(device);
+ }
+ 
+ } // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/model/layer/prelu.h
----------------------------------------------------------------------
diff --cc src/model/layer/prelu.h
index 0000000,ee571e1..70a9dcf
mode 000000,100644..100644
--- a/src/model/layer/prelu.h
+++ b/src/model/layer/prelu.h
@@@ -1,0 -1,66 +1,66 @@@
+ /**
+  * Licensed to the Apache Software Foundation (ASF) under one
+  * or more contributor license agreements.  See the NOTICE file
+  * distributed with this work for additional information
+  * regarding copyright ownership.  The ASF licenses this file
+  * to you under the Apache License, Version 2.0 (the
+  * "License"); you may not use this file except in compliance
+  * with the License.  You may obtain a copy of the License at
+  *
+  *     http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing, software
+  * distributed under the License is distributed on an "AS IS" BASIS,
+  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  * See the License for the specific language governing permissions and
+  * limitations under the License.
+  */
+ #ifndef SINGA_MODEL_LAYER_PRELU_H_
+ #define SINGA_MODEL_LAYER_PRELU_H_
+ #include <utility>
+ #include <string>
+ #include <vector>
+ #include "singa/model/layer.h"
 -#include "singa_config.h"
++#include "singa/singa_config.h"
+ 
+ namespace singa {
+ class PReLU : public Layer {
+  public:
+   /// \copydoc Layer::layer_type()
+   const std::string layer_type() const override { return "PReLU"; }
+ 
+ 
+   /// \copydoc Layer::Setup(const LayerConf&);
+   void Setup(const Shape& in_sample, const LayerConf& conf) override;
+   const Shape GetOutputSampleShape() const override {
+     CHECK(out_sample_shape_.size()) << "You may haven't call Setup()";
+     return out_sample_shape_;
+   }
+ 
+   /// \copydoc Layer::Forward(int flag, const Tensor&)
+   const Tensor Forward(int flag, const Tensor &input) override;
+ 
+   /// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
+   const std::pair<Tensor, vector<Tensor> > Backward(
+       int flag, const Tensor &grad) override;
+ 
 -  void ToDevice(Device *device);
++  void ToDevice(std::shared_ptr<Device> device);
+ 
+   const bool Channel_shared() const { return channel_shared_; }
+   const Tensor A() const { return a_; }
+   const std::string Format() const { return format_; }
+ 
+   void Set_a(Tensor a) {
+     a_.ResetLike(a);
+     a_.CopyData(a);
+   }
+ 
+  protected:
+   bool channel_shared_;
+   std::string format_;  // format_ has two valid value, i.e. NCHW, NHWC
+   Tensor a_;            // shape of a_ is 2D, i.e. (channels, 1)
+   std::stack<Tensor> buf_;
+   Shape out_sample_shape_;
+ };
+ }  // namespace singa
+ #endif  // SINGA_MODEL_LAYER_PRELU_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/src/proto/core.proto
----------------------------------------------------------------------
diff --cc src/proto/core.proto
index cf6e193,3031359..b853b30
--- a/src/proto/core.proto
+++ b/src/proto/core.proto
@@@ -44,16 -45,3 +45,16 @@@ enum CopyDirection 
    kDeviceToDevice = 3;
    kNumDirection = 4;
  }
 +
 +// configuration for device memory pool
 +message MemPoolConf {
 +	optional string type = 1 [default = "cnmem"];
 +	optional uint32 num_devices = 2 [default = 1];
- 	// allocation size for each device
- 	optional uint32 alloc_size = 3 [default = 10000000];
++	// allocation size for each device, default is 256 MB
++	optional uint32 alloc_size = 3 [default = 256];
 +	// memory manager flag for cnmem
 +	// cnmemflag = 0: default flag
 +	// cnmemflag = 1: prevent the manager from growing its memory consumption
 +	// cnmemflag = 2: prevent the manager from stealing memory
 +	optional uint32 cnmemflag = 4 [default = 0];
 +}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_adagrad.cc
----------------------------------------------------------------------
diff --cc test/singa/test_adagrad.cc
index 0000000,642e929..c45dcef
mode 000000,100644..100644
--- a/test/singa/test_adagrad.cc
+++ b/test/singa/test_adagrad.cc
@@@ -1,0 -1,96 +1,96 @@@
+ /************************************************************
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *************************************************************/
+ 
+ #include "gtest/gtest.h"
+ #include "singa/model/optimizer.h"
 -#include "singa_config.h"
++#include "singa/singa_config.h"
+ #include <cmath>
+ 
+ TEST(Adagrad, ApplyCPU) {
+   singa::Adagrad adagrad;
+   float lr = 0.1f;
+   const float v[4] = {0.1, 0.2, 0.3, 0.4};
+   const float g[4] = {0.01, 0.02, 0.03, 0.04};
+ 
+   singa::Tensor value(singa::Shape{4}), grad(singa::Shape{4});
+   value.CopyDataFromHostPtr(v, 4);
+   grad.CopyDataFromHostPtr(g, 4);
+ 
+   singa::OptimizerConf conf;
+   adagrad.Setup(conf);
+   adagrad.Apply(0, lr, "xx", grad, &value);
+ 
+   singa::Tensor v1 = value.Clone();
+   const float* newv1 = v1.data<float>();
+   float history[4];
+   for (int i = 0; i < 4; ++i) history[i] = g[i] * g[i];
+   for (int i = 0; i < 4; ++i)
+     EXPECT_NEAR(newv1[i], v[i] - lr * g[i] / sqrt(history[i] + conf.delta()),
+                 1e-5);
+ 
+   grad.CopyDataFromHostPtr(g, 4);
+   adagrad.Apply(1, lr, "xx", grad, &value);
+   singa::Tensor v2 = value.Clone();
+   const float* newv2 = v2.data<float>();
+   for (int i = 0; i < 4; ++i) history[i] += g[i] * g[i];
+ 
+   for (int i = 0; i < 4; ++i)
+     EXPECT_NEAR(newv2[i],
+                 newv1[i] - lr * g[i] / sqrt(history[i] + conf.delta()), 1e-5);
+ }
+ 
+ #ifdef USE_CUDA
+ TEST(Adagrad, ApplyCUDA) {
+   singa::Adagrad adagrad;
+   float lr = 0.1f;
+   const float v[4] = {0.1, 0.2, 0.3, 0.4};
+   const float g[4] = {0.01, 0.02, 0.03, 0.04};
+ 
 -  singa::CudaGPU dev;
 -  singa::Tensor value(singa::Shape{4}, &dev), grad(singa::Shape{4}, &dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
++  singa::Tensor value(singa::Shape{4}, dev), grad(singa::Shape{4}, dev);
+   value.CopyDataFromHostPtr(v, 4);
+   grad.CopyDataFromHostPtr(g, 4);
+ 
+   singa::OptimizerConf conf;
+   adagrad.Setup(conf);
+   adagrad.Apply(0, lr, "xx", grad, &value);
+ 
+   singa::Tensor v1 = value.Clone();
+   v1.ToHost();
+   const float* newv1 = v1.data<float>();
+   float history[4];
+   for (int i = 0; i < 4; ++i) history[i] = g[i] * g[i];
+   for (int i = 0; i < 4; ++i)
+     EXPECT_NEAR(newv1[i], v[i] - lr * g[i] / sqrt(history[i] + conf.delta()),
+                 1e-5);
+ 
+   grad.CopyDataFromHostPtr(g, 4);
+   adagrad.Apply(1, lr, "xx", grad, &value);
+   singa::Tensor v2 = value.Clone();
+   v2.ToHost();
+   const float* newv2 = v2.data<float>();
+   for (int i = 0; i < 4; ++i) history[i] += g[i] * g[i];
+ 
+   for (int i = 0; i < 4; ++i)
+     EXPECT_FLOAT_EQ(newv2[i],
+                     newv1[i] - lr * g[i] / sqrt(history[i] + conf.delta()));
+ }
+ #endif

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_cross_entropy.cc
----------------------------------------------------------------------
diff --cc test/singa/test_cross_entropy.cc
index 0000000,ce60f7c..d73591f
mode 000000,100644..100644
--- a/test/singa/test_cross_entropy.cc
+++ b/test/singa/test_cross_entropy.cc
@@@ -1,0 -1,116 +1,116 @@@
+ /************************************************************
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *************************************************************/
+ 
+ #include "gtest/gtest.h"
+ #include "singa/core/tensor.h"
+ #include "singa/core/device.h"
+ #include "singa/model/loss.h"
 -#include "singa_config.h"
++#include "singa/singa_config.h"
+ 
+ using singa::Tensor;
+ class TestSoftmaxCrossEntropy : public ::testing::Test {
+  protected:
+   virtual void SetUp() {
+     p.Reshape(singa::Shape{2, 4});
+     t.Reshape(singa::Shape{2, 1});
+   }
+   const float pdat[8] = {0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1};
+   const int tdat[2] = {0, 2};
+ 
+   singa::Tensor p, t;
+ };
+ 
+ TEST_F(TestSoftmaxCrossEntropy, CppForward) {
+   p.CopyDataFromHostPtr(pdat, 8);
+   t.AsType(singa::kInt);
+   t.CopyDataFromHostPtr(tdat, 2);
+ 
+   singa::SoftmaxCrossEntropy cross_entropy;
+   const Tensor& loss = cross_entropy.Forward(p, t);
+   auto ldat = loss.data<float>();
+ 
+   const float result_test = -log(0.25);
+   EXPECT_FLOAT_EQ(ldat[0], result_test);
+   EXPECT_FLOAT_EQ(ldat[1], result_test);
+ }
+ 
+ TEST_F(TestSoftmaxCrossEntropy, CppBackward) {
+   p.CopyDataFromHostPtr(pdat, 8);
+   t.AsType(singa::kInt);
+   t.CopyDataFromHostPtr(tdat, 2);
+ 
+   singa::SoftmaxCrossEntropy cross_entropy;
+   cross_entropy.Forward(p, t);
+   const Tensor& grad = cross_entropy.Backward();
+ 
+   auto gdat = grad.data<float>();
+   EXPECT_FLOAT_EQ(gdat[0], -0.75);
+   EXPECT_FLOAT_EQ(gdat[1], 0.25);
+   EXPECT_FLOAT_EQ(gdat[2], 0.25);
+   EXPECT_FLOAT_EQ(gdat[3], 0.25);
+   EXPECT_FLOAT_EQ(gdat[4], 0.25);
+   EXPECT_FLOAT_EQ(gdat[5], 0.25);
+   EXPECT_FLOAT_EQ(gdat[6], -0.75);
+   EXPECT_FLOAT_EQ(gdat[7], 0.25);
+ }
+ 
+ #ifdef USE_CUDA
+ 
+ TEST_F(TestSoftmaxCrossEntropy, CudaForward) {
+   singa::SoftmaxCrossEntropy cross_entropy;
 -  singa::CudaGPU dev;
 -  p.ToDevice(&dev);
 -  t.ToDevice(&dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
++  p.ToDevice(dev);
++  t.ToDevice(dev);
+   p.CopyDataFromHostPtr(pdat, 8);
+   t.CopyDataFromHostPtr(tdat, 2);
+ 
+   Tensor loss = cross_entropy.Forward(p, t);
+   loss.ToHost();
+   auto ldat = loss.data<float>();
+ 
+   const float result_test = -log(0.25);
+   EXPECT_FLOAT_EQ(ldat[0], result_test);
+   EXPECT_FLOAT_EQ(ldat[1], result_test);
+ }
+ 
+ TEST_F(TestSoftmaxCrossEntropy, CudaBackward) {
+   singa::SoftmaxCrossEntropy cross_entropy;
 -  singa::CudaGPU dev;
 -  p.ToDevice(&dev);
 -  t.ToDevice(&dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
++  p.ToDevice(dev);
++  t.ToDevice(dev);
+   p.CopyDataFromHostPtr(pdat, 8);
+   t.CopyDataFromHostPtr(tdat, 2);
+ 
+   cross_entropy.Forward(p, t);
+   Tensor grad = cross_entropy.Backward();
+ 
+   grad.ToHost();
+   auto gdat = grad.data<float>();
+   EXPECT_FLOAT_EQ(gdat[0], -0.75);
+   EXPECT_FLOAT_EQ(gdat[1], 0.25);
+   EXPECT_FLOAT_EQ(gdat[2], 0.25);
+   EXPECT_FLOAT_EQ(gdat[3], 0.25);
+   EXPECT_FLOAT_EQ(gdat[4], 0.25);
+   EXPECT_FLOAT_EQ(gdat[5], 0.25);
+   EXPECT_FLOAT_EQ(gdat[6], -0.75);
+   EXPECT_FLOAT_EQ(gdat[7], 0.25);
+ }
+ #endif  // USE_CUDA

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_cudnn_activation.cc
----------------------------------------------------------------------
diff --cc test/singa/test_cudnn_activation.cc
index bed7715,940c6b9..1a619e7
--- a/test/singa/test_cudnn_activation.cc
+++ b/test/singa/test_cudnn_activation.cc
@@@ -46,8 -46,8 +46,8 @@@ TEST(TCudnnActivation, Setup) 
  TEST(TCudnnActivation, Forward) {
    const float x[] = {1.0f, 2.0f, 3.0f, -2.0f, -3.0f, -4.0};
    size_t n = sizeof(x) / sizeof(float);
--  singa::CudaGPU cuda(0, 1);
--  singa::Tensor in(singa::Shape{n}, &cuda);
++  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++  singa::Tensor in(singa::Shape{n}, cuda);
    in.CopyDataFromHostPtr<float>(x, n);
  
    float neg_slope = 0.5f;
@@@ -66,9 -65,9 +65,8 @@@
  
      singa::Tensor out = acti.Forward(singa::kTrain, in);
      EXPECT_EQ(n, out.Size());
--    singa::CppCPU host(0, 1);
--    out.ToDevice(&host);
-     const float* yptr = out.data<const float*>();
++    out.ToHost();
+     const float* yptr = out.data<float>();
      float* y = new float[n];
      if (acti.Mode() == "SIGMOID") {
        for (size_t i = 0; i < n; i++) y[i] = 1.f / (1.f + exp(-x[i]));
@@@ -87,8 -86,8 +85,8 @@@
  TEST(TCudnnActivation, Backward) {
    const float x[] = {2.0f, 3.0f, 3.0f, 7.f, 0.0f, 5.0, 1.5, 2.5, -2.5, 1.5};
    size_t n = sizeof(x) / sizeof(float);
--  singa::CudaGPU cuda(0, 1);
--  singa::Tensor in(singa::Shape{n}, &cuda);
++  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++  singa::Tensor in(singa::Shape{n}, cuda);
    in.CopyDataFromHostPtr<float>(x, n);
    float neg_slope = 0.5f;
    std::string types[] = {"SIGMOID", "TANH", "RELU"};
@@@ -101,22 -100,21 +99,20 @@@
        singa::ReLUConf* reluconf = conf.mutable_relu_conf();
        reluconf->set_negative_slope(neg_slope);
      }
-     acti.Setup(conf);
-     acti.InitCudnn(n, singa::kFloat32);
+     acti.Setup(Shape{n}, conf);
      singa::Tensor out = acti.Forward(singa::kTrain, in);
      EXPECT_EQ(n, out.Size());
--    singa::CppCPU host(0, 1);
--    out.ToDevice(&host);
-     const float* yptr = out.data<const float*>();
++    out.ToHost();
+     const float* yptr = out.data<float>();
  
      const float grad[] = {2.0f, 1.0f, 2.0f, 0.0f, -2.0f,
                            -1.0, 1.5,  2.5,  -1.5, -2.5};
--    singa::Tensor out_diff(singa::Shape{n}, &cuda);
++    singa::Tensor out_diff(singa::Shape{n}, cuda);
      out_diff.CopyDataFromHostPtr<float>(grad, n);
      const auto ret = acti.Backward(singa::kTrain, out_diff);
      singa::Tensor in_diff = ret.first;
--    in_diff.ToDevice(&host);
-     const float* xptr = in_diff.data<const float*>();
++    in_diff.ToHost();
+     const float* xptr = in_diff.data<float>();
      float* dx = new float[n];
      if (acti.Mode() == "SIGMOID") {
        for (size_t i = 0; i < n; i++) dx[i] = grad[i] * yptr[i] * (1. - yptr[i]);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_cudnn_batchnorm.cc
----------------------------------------------------------------------
diff --cc test/singa/test_cudnn_batchnorm.cc
index d38fdaa,b3b6477..7067b16
--- a/test/singa/test_cudnn_batchnorm.cc
+++ b/test/singa/test_cudnn_batchnorm.cc
@@@ -56,34 -53,31 +53,30 @@@ TEST(CudnnBatchNorm, Forward) 
      0.150676, 0.153442, -0.0929899, -0.148675,
      -0.112459, -0.106284, -0.103074, -0.0668811
    };
--  singa::CudaGPU cuda(0, 1);
--  singa::Tensor in(singa::Shape{1,2,4,4}, &cuda);
++  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++  singa::Tensor in(singa::Shape{1,2,4,4}, cuda);
    in.CopyDataFromHostPtr(x, 1*2*4*4);
    const float alpha_[] = {1, 1};
--  singa::Tensor alpha(singa::Shape{1,2,1,1}, &cuda);
++  singa::Tensor alpha(singa::Shape{1,2,1,1}, cuda);
    alpha.CopyDataFromHostPtr(alpha_, 1*2*1*1);
  
    const float beta_[] = {0, 0};
--  singa::Tensor beta(singa::Shape{1,2,1,1}, &cuda);
++  singa::Tensor beta(singa::Shape{1,2,1,1}, cuda);
    beta.CopyDataFromHostPtr(beta_, 1*2*1*1);
  
    singa::LayerConf conf;
    singa::BatchNormConf *batchnorm_conf = conf.mutable_batchnorm_conf();
    batchnorm_conf->set_factor(0.9);
-   batchnorm_conf->set_channels(2);
-   batchnorm_conf->set_height(4);
-   batchnorm_conf->set_width(4);
-   batchnorm.Setup(conf);
+   batchnorm.Setup(Shape{2, 4, 4}, conf);
  
--  batchnorm.ToDevice(&cuda);
++  batchnorm.ToDevice(cuda);
    batchnorm.set_bnScale(alpha);
    batchnorm.set_bnBias(beta);
    batchnorm.set_runningMean(beta);
    batchnorm.set_runningVariance(beta);
    singa::Tensor out = batchnorm.Forward(singa::kTrain, in);
--  singa::CppCPU host(0, 1);
    out.ToHost();
-   const float *outptr = out.data<const float *>();
+   const float *outptr = out.data<float>();
    const auto & shape = out.shape();
    EXPECT_EQ(4u, shape.size());
    EXPECT_EQ(1u, shape[0]);
@@@ -136,8 -130,8 +129,8 @@@ TEST(CudnnBatchNorm, Backward) 
      0.150676, 0.153442, -0.0929899, -0.148675,
      -0.112459, -0.106284, -0.103074, -0.0668811
    };
--  singa::CudaGPU cuda(0, 1);
--  singa::Tensor x_tensor(singa::Shape{1,2,4,4}, &cuda);
++  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++  singa::Tensor x_tensor(singa::Shape{1,2,4,4}, cuda);
    x_tensor.CopyDataFromHostPtr(x, 1*2*4*4);
  
    singa::LayerConf conf;
@@@ -159,35 -150,35 +149,34 @@@
      0.00468428, 0.00735506, -0.00682525, 0.00342023
    };
  
--  singa::Tensor dy_tensor(singa::Shape{1,2,4,4}, &cuda);
++  singa::Tensor dy_tensor(singa::Shape{1,2,4,4}, cuda);
    dy_tensor.CopyDataFromHostPtr(dy, 1*2*4*4);
    const float alpha_[] = {1, 1};
--  singa::Tensor alpha(singa::Shape{1,2,1,1}, &cuda);
++  singa::Tensor alpha(singa::Shape{1,2,1,1}, cuda);
    alpha.CopyDataFromHostPtr(alpha_, 1*2*1*1);
  
    const float beta_[] = {0, 0};
--  singa::Tensor beta(singa::Shape{1,2,1,1}, &cuda);
++  singa::Tensor beta(singa::Shape{1,2,1,1}, cuda);
    beta.CopyDataFromHostPtr(beta_, 1*2*1*1);
  
    const float mean_[] = {0.0123405, -0.0622333};
--  singa::Tensor mean(singa::Shape{1,2,1,1}, &cuda);
++  singa::Tensor mean(singa::Shape{1,2,1,1}, cuda);
    mean.CopyDataFromHostPtr(mean_, 1*2*1*1);
  
    const float var_[] = {15.9948, 8.68198};
--  singa::Tensor var(singa::Shape{1,2,1,1}, &cuda);
++  singa::Tensor var(singa::Shape{1,2,1,1}, cuda);
    var.CopyDataFromHostPtr(var_, 1*2*1*1);
  
--  batchnorm.ToDevice(&cuda);
++  batchnorm.ToDevice(cuda);
    batchnorm.set_bnScale(alpha);
    batchnorm.set_bnBias(beta);
    batchnorm.set_runningMean(beta);
    batchnorm.set_runningVariance(beta);
    batchnorm.Forward(singa::kTrain, x_tensor);
    const auto ret = batchnorm.Backward(singa::kTrain, dy_tensor);
--  singa::CppCPU host(0, 1);
    singa::Tensor dx = ret.first;
--  dx.ToDevice(&host);
-   const float *dxptr = dx.data<const float *>();
++  dx.ToHost();
+   const float *dxptr = dx.data<float>();
    const auto & shape = dx.shape();
    EXPECT_EQ(4u, shape.size());
    EXPECT_EQ(1u, shape[0]);
@@@ -228,8 -219,8 +217,8 @@@
    EXPECT_NEAR(0.0217477, dxptr[31], 1e-4f);
  
    singa::Tensor dbnScale = ret.second.at(0);
--  dbnScale.ToDevice(&host);
-   const float *dbnScaleptr = dbnScale.data<const float *>();
++  dbnScale.ToHost();
+   const float *dbnScaleptr = dbnScale.data<float>();
    const auto & dbnScaleShape = dbnScale.shape();
    EXPECT_EQ(4u, dbnScaleShape.size());
    EXPECT_EQ(1u, dbnScaleShape[0]);
@@@ -241,8 -232,8 +230,8 @@@
    EXPECT_NEAR(-0.00219431f, dbnScaleptr[1], 1e-4f);
  
    singa::Tensor dbnBias = ret.second.at(1);
--  dbnBias.ToDevice(&host);
-   const float *dbnBiasptr = dbnBias.data<const float *>();
++  dbnBias.ToHost();
+   const float *dbnBiasptr = dbnBias.data<float>();
    const auto & dbnBiasShape = dbnBias.shape();
    EXPECT_EQ(4u, dbnBiasShape.size());
    EXPECT_EQ(1u, dbnBiasShape[0]);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_cudnn_convolution.cc
----------------------------------------------------------------------
diff --cc test/singa/test_cudnn_convolution.cc
index 2a17da2,44077b7..3b84645
--- a/test/singa/test_cudnn_convolution.cc
+++ b/test/singa/test_cudnn_convolution.cc
@@@ -65,18 -63,18 +63,18 @@@ TEST(CudnnConvolution, Forward) 
    const size_t batchsize = 1, c = 1, h = 3, w = 3;
    const float x[batchsize * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
                                            6.0f, 7.0f, 8.0f, 9.0f};
--  singa::CudaGPU cuda(0, 1);
--  singa::Tensor in(singa::Shape{batchsize, c, h, w}, &cuda);
++  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++  singa::Tensor in(singa::Shape{batchsize, c, h, w}, cuda);
    in.CopyDataFromHostPtr(x, batchsize * c * h * w);
  
    // Set weight and bias manually
    const size_t num_filters = 1;
    const float we[num_filters * batchsize * h * w] = {
        1.0f, 1.0f, 0.0f, 0.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f};
--  singa::Tensor weight(singa::Shape{num_filters, batchsize * h * w}, &cuda);
++  singa::Tensor weight(singa::Shape{num_filters, batchsize * h * w}, cuda);
    weight.CopyDataFromHostPtr(we, batchsize * h * w);
    const float b[num_filters] = {1.0f};
--  singa::Tensor bias(singa::Shape{num_filters}, &cuda);
++  singa::Tensor bias(singa::Shape{num_filters}, cuda);
    bias.CopyDataFromHostPtr(b, num_filters);
    CudnnConvolution conv;
    conv.set_weight(weight);
@@@ -102,9 -97,9 +97,8 @@@
  
    // Parameter "flag" does not influence convolution
    singa::Tensor out1 = conv.Forward(singa::kTrain, in);
--  singa::CppCPU host(0, 1);
--  out1.ToDevice(&host);
-   const float *outptr1 = out1.data<const float *>();
++  out1.ToHost();
+   const float *outptr1 = out1.data<float>();
    // Input: 3*3; kernel: 3*3; stride: 2*2; padding: 1*1.
    EXPECT_EQ(4u, out1.Size());
  
@@@ -119,8 -114,8 +113,8 @@@ TEST(CudnnConvolution, Backward) 
    const size_t batchsize = 1, c = 1, src_h = 3, src_w = 3;
    const float x[batchsize * c * src_h * src_w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
                                                    6.0f, 7.0f, 8.0f, 9.0f};
--  singa::CudaGPU cuda(0, 1);
--  singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, &cuda);
++  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++  singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, cuda);
    in.CopyDataFromHostPtr(x, batchsize * c * src_h * src_w);
  
    // Set weight_ and bias_ manually
@@@ -128,10 -123,10 +122,10 @@@
    const float we[num_filters * batchsize * src_h * src_w] = {
        1.0f, 1.0f, 0.0f, 0.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f};
    singa::Tensor weight(singa::Shape{num_filters, batchsize * src_h * src_w},
--                       &cuda);
++                       cuda);
    weight.CopyDataFromHostPtr(we, batchsize * src_h * src_w);
    const float b[num_filters] = {1.0f};
--  singa::Tensor bias(singa::Shape{num_filters}, &cuda);
++  singa::Tensor bias(singa::Shape{num_filters}, cuda);
    bias.CopyDataFromHostPtr(b, num_filters);
    CudnnConvolution conv;
    conv.set_weight(weight);
@@@ -162,14 -154,14 +153,13 @@@
    const float dy[batchsize * num_filters * grad_h * grad_w] = {0.1f, 0.2f, 0.3f,
                                                                 0.4f};
    singa::Tensor grad(singa::Shape{batchsize, num_filters, grad_h, grad_w},
--                     &cuda);
++                     cuda);
    grad.CopyDataFromHostPtr(dy, batchsize * num_filters * grad_h * grad_w);
  
    const auto ret = conv.Backward(singa::kTrain, grad);
--  singa::CppCPU host(0, 1);
    singa::Tensor in_grad = ret.first;
--  in_grad.ToDevice(&host);
-   const float *dx = in_grad.data<const float *>();
++  in_grad.ToHost();
+   const float *dx = in_grad.data<float>();
    const float *wptr = we;
    EXPECT_EQ(9u, in_grad.Size());
    EXPECT_EQ(dy[0] * wptr[4], dx[0]);
@@@ -186,12 -178,12 +176,12 @@@
  
    singa::Tensor dw = ret.second[0];
    singa::Tensor db = ret.second[1];
--  dw.ToDevice(&host);
--  db.ToDevice(&host);
-   const float *dbptr = db.data<const float *>();
++  dw.ToHost();
++  db.ToHost();
+   const float *dbptr = db.data<float>();
    EXPECT_EQ(dy[0] + dy[1] + dy[2] + dy[3], dbptr[0]);
  
-   const float *dwptr = dw.data<const float *>();
+   const float *dwptr = dw.data<float>();
    EXPECT_EQ(9u, dw.Size());
    EXPECT_EQ(dy[3] * x[4], dwptr[0]);
    EXPECT_EQ(dy[3] * x[5] + dy[2] * x[3], dwptr[1]);
@@@ -246,18 -235,18 +233,19 @@@ TEST(CudnnConvolution_AT, Forward) 
    const size_t batchsize = 1, c = 1, h = 3, w = 3;
    const float x[batchsize * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
                                            6.0f, 7.0f, 8.0f, 9.0f};
--  singa::CudaGPU cuda(0, 1);
--  singa::Tensor in(singa::Shape{batchsize, c, h, w}, &cuda);
++
++  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++  singa::Tensor in(singa::Shape{batchsize, c, h, w}, cuda);
    in.CopyDataFromHostPtr(x, batchsize * c * h * w);
  
    // Set weight and bias manually
    const size_t num_filters = 1;
    const float we[num_filters * batchsize * h * w] = {
        1.0f, 1.0f, 0.0f, 0.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f};
--  singa::Tensor weight(singa::Shape{num_filters, batchsize * h * w}, &cuda);
++  singa::Tensor weight(singa::Shape{num_filters, batchsize * h * w}, cuda);
    weight.CopyDataFromHostPtr(we, batchsize * h * w);
    const float b[num_filters] = {1.0f};
--  singa::Tensor bias(singa::Shape{num_filters}, &cuda);
++  singa::Tensor bias(singa::Shape{num_filters}, cuda);
    bias.CopyDataFromHostPtr(b, num_filters);
    CudnnConvolution conv;
    conv.set_weight(weight);
@@@ -283,9 -269,9 +268,8 @@@
  
    // Parameter "flag" does not influence convolution
    singa::Tensor out1 = conv.Forward(singa::kTrain, in);
--  singa::CppCPU host(0, 1);
--  out1.ToDevice(&host);
-   const float *outptr1 = out1.data<const float *>();
++  out1.ToHost();
+   const float *outptr1 = out1.data<float>();
    // Input: 3*3; kernel: 3*3; stride: 2*2; padding: 1*1.
    EXPECT_EQ(4u, out1.Size());
  
@@@ -300,8 -286,8 +284,9 @@@ TEST(CudnnConvolution_AT, Backward) 
    const size_t batchsize = 1, c = 1, src_h = 3, src_w = 3;
    const float x[batchsize * c * src_h * src_w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
                                                    6.0f, 7.0f, 8.0f, 9.0f};
--  singa::CudaGPU cuda(0, 1);
--  singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, &cuda);
++
++  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++  singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, cuda);
    in.CopyDataFromHostPtr(x, batchsize * c * src_h * src_w);
  
    // Set weight_ and bias_ manually
@@@ -309,10 -295,10 +294,10 @@@
    const float we[num_filters * batchsize * src_h * src_w] = {
        1.0f, 1.0f, 0.0f, 0.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f};
    singa::Tensor weight(singa::Shape{num_filters, batchsize * src_h * src_w},
--                       &cuda);
++                       cuda);
    weight.CopyDataFromHostPtr(we, batchsize * src_h * src_w);
    const float b[num_filters] = {1.0f};
--  singa::Tensor bias(singa::Shape{num_filters}, &cuda);
++  singa::Tensor bias(singa::Shape{num_filters}, cuda);
    bias.CopyDataFromHostPtr(b, num_filters);
    CudnnConvolution conv;
    conv.set_weight(weight);
@@@ -343,14 -326,14 +325,13 @@@
    const float dy[batchsize * num_filters * grad_h * grad_w] = {0.1f, 0.2f, 0.3f,
                                                                 0.4f};
    singa::Tensor grad(singa::Shape{batchsize, num_filters, grad_h, grad_w},
--                     &cuda);
++                     cuda);
    grad.CopyDataFromHostPtr(dy, batchsize * num_filters * grad_h * grad_w);
  
    const auto ret = conv.Backward(singa::kTrain, grad);
--  singa::CppCPU host(0, 1);
    singa::Tensor in_grad = ret.first;
--  in_grad.ToDevice(&host);
-   const float *dx = in_grad.data<const float *>();
++  in_grad.ToHost();
+   const float *dx = in_grad.data<float>();
    const float *wptr = we;
    EXPECT_EQ(9u, in_grad.Size());
    EXPECT_EQ(dy[0] * wptr[4], dx[0]);
@@@ -367,12 -350,12 +348,12 @@@
  
    singa::Tensor dw = ret.second[0];
    singa::Tensor db = ret.second[1];
--  dw.ToDevice(&host);
--  db.ToDevice(&host);
-   const float *dbptr = db.data<const float *>();
++  dw.ToHost();
++  db.ToHost();
+   const float *dbptr = db.data<float>();
    EXPECT_EQ(dy[0] + dy[1] + dy[2] + dy[3], dbptr[0]);
  
-   const float *dwptr = dw.data<const float *>();
+   const float *dwptr = dw.data<float>();
    EXPECT_EQ(9u, dw.Size());
    EXPECT_EQ(dy[3] * x[4], dwptr[0]);
    EXPECT_EQ(dy[3] * x[5] + dy[2] * x[3], dwptr[1]);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_cudnn_dropout.cc
----------------------------------------------------------------------
diff --cc test/singa/test_cudnn_dropout.cc
index 32572d0,419dd0c..d06a254
--- a/test/singa/test_cudnn_dropout.cc
+++ b/test/singa/test_cudnn_dropout.cc
@@@ -48,8 -49,8 +49,8 @@@ TEST(CudnnDropout, Setup) 
  TEST(CudnnDropout, Forward) {
    const float x[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
    size_t n = sizeof(x) / sizeof(float);
--  singa::CudaGPU cuda(0, 1);
--  singa::Tensor in(singa::Shape{n}, &cuda);
++  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++  singa::Tensor in(singa::Shape{n}, cuda);
    in.CopyDataFromHostPtr(x, n);
  
    float pdrop = 0.5;
@@@ -67,9 -68,9 +68,8 @@@
    for (size_t i = 0; i < n; i++)
      EXPECT_FLOAT_EQ(0, GetBitValue(mptr, i) * (GetBitValue(mptr, i) - 1));
  
--  singa::CppCPU host(0, 1);
--  out1.ToDevice(&host);
-   const float* outptr1 = out1.data<const float*>();
++  out1.ToHost();
+   const float* outptr1 = out1.data<float>();
    EXPECT_EQ(n, out1.Size());
    float scale = 1.0f / (1.0f - pdrop);
    // the output value should be 0 or the same as the input
@@@ -78,9 -79,9 +78,9 @@@
    EXPECT_EQ(0.f, outptr1[7] * (outptr1[7] - scale * x[7]));
  
    singa::Tensor out2 = drop.Forward(singa::kEval, in);
--  out2.ToDevice(&host);
++  out2.ToHost();
    EXPECT_EQ(n, out2.Size());
-   const float* outptr2 = out2.data<const float*>();
+   const float* outptr2 = out2.data<float>();
    // the output value should be the same as the input
    EXPECT_EQ(x[0], outptr2[0]);
    EXPECT_EQ(x[1], outptr2[1]);
@@@ -90,8 -91,8 +90,8 @@@
  TEST(CudnnDropout, Backward) {
    const float x[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
    size_t n = sizeof(x) / sizeof(float);
--  singa::CudaGPU cuda(0, 1);
--  singa::Tensor in(singa::Shape{n}, &cuda);
++  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++  singa::Tensor in(singa::Shape{n}, cuda);
    in.CopyDataFromHostPtr(x, n);
  
    float pdrop = 0.5;
@@@ -105,14 -106,14 +105,13 @@@
    singa::Tensor out1 = drop.Forward(singa::kTrain, in);
  
    const float dy[] = {4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 1.0f, 2.0f, 3.0f};
--  singa::Tensor grad(singa::Shape{n}, &cuda);
++  singa::Tensor grad(singa::Shape{n}, cuda);
    grad.CopyDataFromHostPtr(dy, n);
  
    const auto ret = drop.Backward(singa::kTrain, grad);
--  singa::CppCPU host(0, 1);
    singa::Tensor in_grad = ret.first;
--  in_grad.ToDevice(&host);
-   const float* dx = in_grad.data<const float*>();
++  in_grad.ToHost();
+   const float* dx = in_grad.data<float>();
  
    singa::Tensor mask(drop.mask().shape(), drop.mask().data_type());
    mask.CopyData(drop.mask());

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_cudnn_lrn.cc
----------------------------------------------------------------------
diff --cc test/singa/test_cudnn_lrn.cc
index 390c588,f7ec046..4ee0c54
--- a/test/singa/test_cudnn_lrn.cc
+++ b/test/singa/test_cudnn_lrn.cc
@@@ -58,8 -58,8 +58,8 @@@ TEST(CudnnLRN, Forward) 
      0.0597329, -0.0530868, 0.0124246, 0.108429,
      0.0451175, 0.0247055, 0.0304345, 0.0179575
    };
--  singa::CudaGPU cuda(0, 1);
--  singa::Tensor in(singa::Shape{1,2,4,4}, &cuda);
++  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++  singa::Tensor in(singa::Shape{1,2,4,4}, cuda);
    in.CopyDataFromHostPtr(x, 1*2*4*4);
  
    singa::LayerConf conf;
@@@ -68,12 -68,12 +68,11 @@@
    lrn_conf->set_local_size(3);
    lrn_conf->set_alpha(0.1);
    lrn_conf->set_beta(0.75);
-   lrn.Setup(conf);
+   lrn.Setup(Shape{2, 4, 4}, conf);
  
    singa::Tensor out = lrn.Forward(singa::kTrain, in);
--  singa::CppCPU host(0, 1);
--  out.ToDevice(&host);
-   const float *outptr = out.data<const float *>();
++  out.ToHost();
+   const float *outptr = out.data<float>();
    const auto & shape = out.shape();
    EXPECT_EQ(4u, shape.size());
    EXPECT_EQ(1u, shape[0]);
@@@ -128,8 -128,8 +127,8 @@@ TEST(CudnnLRN, Backward) 
      0.0597329, -0.0530868, 0.0124246, 0.108429,
      0.0451175, 0.0247055, 0.0304345, 0.0179575
    };
--  singa::CudaGPU cuda(0, 1);
--  singa::Tensor x_tensor(singa::Shape{1,2,4,4}, &cuda);
++  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++  singa::Tensor x_tensor(singa::Shape{1,2,4,4}, cuda);
    x_tensor.CopyDataFromHostPtr(x, 1*2*4*4);
  
    const float dy[] = {
@@@ -143,7 -143,7 +142,7 @@@
      0.177807, 0.000892812, -0.00113197, 0.00327798
    };
  
--  singa::Tensor dy_tensor(singa::Shape{1,2,4,4}, &cuda);
++  singa::Tensor dy_tensor(singa::Shape{1,2,4,4}, cuda);
    dy_tensor.CopyDataFromHostPtr(dy, 1*2*4*4);
  
    singa::LayerConf conf;
@@@ -156,10 -156,10 +155,9 @@@
  
    lrn.Forward(singa::kTrain, x_tensor);
    const auto ret = lrn.Backward(singa::kTrain, dy_tensor);
--  singa::CppCPU host(0, 1);
    singa::Tensor dx = ret.first;
--  dx.ToDevice(&host);
-   const float *dxptr = dx.data<const float *>();
++  dx.ToHost();
+   const float *dxptr = dx.data<float>();
    const auto & shape = dx.shape();
    EXPECT_EQ(4u, shape.size());
    EXPECT_EQ(1u, shape[0]);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_cudnn_pooling.cc
----------------------------------------------------------------------
diff --cc test/singa/test_cudnn_pooling.cc
index e66f212,2a98ab4..79051a3
--- a/test/singa/test_cudnn_pooling.cc
+++ b/test/singa/test_cudnn_pooling.cc
@@@ -58,8 -56,8 +56,8 @@@ TEST(CudnnPooling, Forward) 
    const size_t batchsize = 1, c = 1, h = 3, w = 3;
    const float x[batchsize * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
                                            6.0f, 7.0f, 8.0f, 9.0f};
--  singa::CudaGPU cuda(0, 1);
--  singa::Tensor in(singa::Shape{batchsize, c, h, w}, &cuda);
++  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++  singa::Tensor in(singa::Shape{batchsize, c, h, w}, cuda);
    in.CopyDataFromHostPtr(x, batchsize * c * h * w);
  
    CudnnPooling pool;
@@@ -79,9 -74,9 +74,8 @@@
  
    // Parameter "flag" does not influence pooling
    singa::Tensor out1 = pool.Forward(singa::kTrain, in);
--  singa::CppCPU host(0, 1);
--  out1.ToDevice(&host);
-   const float *outptr1 = out1.data<const float *>();
++  out1.ToHost();
+   const float *outptr1 = out1.data<float>();
    // Input: 3*3; kernel: 2*2; stride: 1*1; no padding.
    EXPECT_EQ(4u, out1.Size());
    EXPECT_EQ(5.0f, outptr1[0]);
@@@ -95,8 -90,8 +89,8 @@@ TEST(CudnnPooling, Backward) 
    const size_t batchsize = 1, c = 1, src_h = 3, src_w = 3;
    const float x[batchsize * src_h * src_w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
                                                6.0f, 7.0f, 8.0f, 9.0f};
--  singa::CudaGPU cuda(0, 1);
--  singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, &cuda);
++  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++  singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, cuda);
    in.CopyDataFromHostPtr(x, batchsize * c * src_h * src_w);
  
    CudnnPooling pool;
@@@ -119,14 -111,14 +110,13 @@@
    // grad
    const size_t grad_h = 2, grad_w = 2;
    const float dy[batchsize * c * grad_h * grad_w] = {0.1f, 0.2f, 0.3f, 0.4f};
--  singa::Tensor grad(singa::Shape{batchsize, c, grad_h, grad_w}, &cuda);
++  singa::Tensor grad(singa::Shape{batchsize, c, grad_h, grad_w}, cuda);
    grad.CopyDataFromHostPtr(dy, batchsize * c * grad_h * grad_w);
  
    const auto ret = pool.Backward(singa::kTrain, grad);
--  singa::CppCPU host(0, 1);
    singa::Tensor in_grad = ret.first;
--  in_grad.ToDevice(&host);
-   const float *dx = in_grad.data<const float *>();
++  in_grad.ToHost();
+   const float *dx = in_grad.data<float>();
    EXPECT_EQ(9u, in_grad.Size());
    EXPECT_EQ(0.0f, dx[0]);
    EXPECT_EQ(0.0f, dx[1]);