Posted to commits@singa.apache.org by wa...@apache.org on 2016/06/24 06:51:36 UTC

[3/6] incubator-singa git commit: SINGA-175 Add memory management APIs and implement a subclass using CNMeM

SINGA-175 Add memory management APIs and implement a subclass using CNMeM

Add base memory pool class.
Implement two subclasses, CnMemPool and CudaMemPool.
Add test for the memory pools.

TODO: replace Device* with std::shared_ptr<Device> to avoid memory errors, because
the order of destructing devices and tensors is dynamic (a device may be freed
before its tensors).
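
A minimal usage sketch of the new pool API (not part of this commit; the sizes
below are illustrative and the calls follow include/singa/core/memory.h as
added here):

  #include "singa/core/memory.h"

  int main() {
    singa::CnMemPool pool;
    // reserve ~1 MB on one device with the default cnmem flag (example values)
    pool.InitPool(1 /*numDevices*/, 1 << 20 /*initSize*/, 0 /*flag*/);
    void* ptr = nullptr;
    pool.Malloc(&ptr, 1024);  // draw 1 KB from the cnmem-managed region
    pool.Free(ptr);           // return the block to the pool
    return 0;                 // ~CnMemPool() finalizes cnmem
  }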


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/077d13e8
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/077d13e8
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/077d13e8

Branch: refs/heads/dev
Commit: 077d13e8052aa92679909b619966481a383a651f
Parents: ce3e6dc
Author: liyuchenmike@gmail.com <li...@gmail.com>
Authored: Wed Jun 22 20:26:41 2016 +0800
Committer: liyuchenmike@gmail.com <li...@gmail.com>
Committed: Wed Jun 22 20:26:41 2016 +0800

----------------------------------------------------------------------
 CMakeLists.txt                 |   6 +-
 include/singa/core/device.h    |   5 ++
 include/singa/core/memory.h    |  46 +++++++++++++++
 include/singa/model/loss.h     |   2 +-
 src/core/device/cuda_gpu.cc    |  59 +++++++++++++++++--
 src/core/memory/memory.cc      |  69 ++++++++++++++++++++++
 src/proto/core.proto           |  13 +++++
 test/singa/test_memory.cc      | 111 ++++++++++++++++++++++++++++++++++++
 test/singa/test_mse.cc         |  13 ++++-
 test/singa/test_tensor_math.cc |   4 ++
 10 files changed, 319 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/077d13e8/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f6240d2..c34b6ce 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,14 +13,15 @@ ENDIF()
 
 #message(STATUS "${CMAKE_CXX_FLAGS}")
 SET(SINGA_INCLUDE_DIR
-    "${CMAKE_SOURCE_DIR}/include;${CMAKE_SOURCE_DIR}/lib/cnmem/lib;${PROJECT_BINARY_DIR}")
+    #"${CMAKE_SOURCE_DIR}/include;${CMAKE_SOURCE_DIR}/lib/cnmem/lib;${CMAKE_SOURCE_DIR}/lib/cnmen/include;${PROJECT_BINARY_DIR}")
+    "${CMAKE_SOURCE_DIR}/include;${CMAKE_SOURCE_DIR}/lib/cnmem/include;${PROJECT_BINARY_DIR}")
 #message(STATUS "include path: ${SINGA_INCLUDE_DIR}")
 INCLUDE_DIRECTORIES(${SINGA_INCLUDE_DIR})
 
 #OPTION(CPU_ONLY "use GPU libs" OFF)
 OPTION(USE_CBLAS "Use CBlas libs" ON)
 OPTION(USE_CUDA "Use Cuda libs" ON)
-OPTION(USE_CUDNN "Use Cudnn libs" ON)
+OPTION(USE_CUDNN "Use Cudnn libs" OFF)
 OPTION(USE_OPENCV "Use opencv" OFF)
 OPTION(USE_LMDB "Use LMDB libs" OFF)
 
@@ -38,5 +39,6 @@ SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
 SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
 
 ADD_SUBDIRECTORY(lib/cnmem)
+LIST(APPEND SINGA_LINKER_LIBS cnmem)
 ADD_SUBDIRECTORY(src)
 ADD_SUBDIRECTORY(test)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/077d13e8/include/singa/core/device.h
----------------------------------------------------------------------
diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index 8c95dc7..fc98a23 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -147,6 +147,8 @@ class CudaGPU : public Device {
   ~CudaGPU();
   CudaGPU(int id = 0, int num_executors = 1, string scheduler = "sync",
          string vm = "gc-only");
+	CudaGPU(const MemPoolConf& mem_conf, 
+					int id = 0, int num_executors = 1, string scheduler = "sync");
 
   void SetRandSeed(unsigned seed) override;
   static void DeviceQuery();
@@ -180,6 +182,9 @@ class CudaGPU : public Device {
 
   /// Free cpu memory.
   void Free(void* ptr) override;
+	
+	private:
+	DeviceMemPool* pool;
 };
 
 /// CudaCPU which uses cudaMallocHost to allocate pinned memory for host.

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/077d13e8/include/singa/core/memory.h
----------------------------------------------------------------------
diff --git a/include/singa/core/memory.h b/include/singa/core/memory.h
index db09043..e4e1e63 100644
--- a/include/singa/core/memory.h
+++ b/include/singa/core/memory.h
@@ -19,10 +19,56 @@
 #ifndef SINGA_CORE_MEMORY_H_
 #define SINGA_CORE_MEMORY_H_
 
+#include "cnmem.h"
+#include <mutex>
+
 namespace singa {
 
 /// Manage device memory pool including garbage collection, memory opt.
 class VirtualMemory {};
 
+class DeviceMemPool {
+	public:
+	virtual void InitPool() = 0;
+	virtual void Malloc(void** ptr, const size_t size) = 0;
+	virtual void Free(void* ptr) = 0;
+	virtual ~DeviceMemPool(){};
+};
+
+class CnMemPool : public DeviceMemPool {
+	public:
+	int status = 1;
+
+	void InitPool();
+
+	/// numDevices: total number of available GPU cards.
+	/// initSize: all devices will be allocated with this size 
+	/// manager_flags: pool manager flag (one for all devices)
+	/// flag = 0: default flag
+	/// flag = 1: prevent the manager from growing its memory consumption
+	/// flag = 2: prevent the manager from stealing memory
+	void InitPool(int numDevices, size_t initSize, unsigned flag);
+
+	void Malloc(void** ptr, const size_t size);
+	void Free(void* ptr);
+
+	// release all memory and set the cnmem manager to uninitialized
+	~CnMemPool();
+
+	private:
+	// whether the (global) memory pool has been initialized
+	static bool initialized;
+	// lock on the initialized variable
+	static std::mutex mtx;
+};
+
+class CudaMemPool : public DeviceMemPool {
+	public:
+	void InitPool(){};
+	void Malloc(void** ptr, const size_t size);
+	void Free(void* ptr);
+	~CudaMemPool(){};
+};
+
 }  // namespace singa
 #endif  // SINGA_CORE_MEMORY_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/077d13e8/include/singa/model/loss.h
----------------------------------------------------------------------
diff --git a/include/singa/model/loss.h b/include/singa/model/loss.h
index 6a23067..dcf0da4 100644
--- a/include/singa/model/loss.h
+++ b/include/singa/model/loss.h
@@ -35,7 +35,7 @@ class Loss {
     loss.ParseFromString(conf);
     Setup(loss);
   }
-
+	virtual ~Loss(){};
   /// Set meta fields from user configurations.
   virtual void Setup(const LossConf& conf) {}
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/077d13e8/src/core/device/cuda_gpu.cc
----------------------------------------------------------------------
diff --git a/src/core/device/cuda_gpu.cc b/src/core/device/cuda_gpu.cc
index a47f6fe..d9a0985 100644
--- a/src/core/device/cuda_gpu.cc
+++ b/src/core/device/cuda_gpu.cc
@@ -22,7 +22,7 @@
 #include <cuda_runtime.h>
 #include <curand.h>
 #include <chrono>
-
+#include <iostream>
 #include "singa/core/device.h"
 #include "singa/utils/cuda_utils.h"
 namespace singa {
@@ -42,6 +42,8 @@ CudaGPU::~CudaGPU() {
     CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
   }
 #endif
+	delete pool;
+	LOG(INFO) << "device has been deleted";
 }
 
 CudaGPU::CudaGPU(int id, int num_executors,
@@ -67,6 +69,48 @@ CudaGPU::CudaGPU(int id, int num_executors,
   auto status = cudnnCreate(&ctx_.cudnn_handle);
   CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
 #endif  // USE_CUDNN
+	
+	// initialize cnmem memory management as default
+	pool = new CnMemPool();
+	((CnMemPool*)pool)->InitPool();
+}
+
+CudaGPU::CudaGPU(const MemPoolConf& mem_conf,int id, int num_executors,
+                       string scheduler)
+    : Device(id, num_executors, scheduler, "gc-only") {
+  if (id == -1)
+    id = FindDevice(0);
+  lang_ = kCuda;
+  ctx_.stream = NULL;  // use the default sync stream
+  // TODO(wangwei) create one handle for each steam?
+  CUDA_CHECK(cudaSetDevice(FindDevice(0)));
+  // use curandCreateGeneratorHost for CudaHost device
+  CURAND_CHECK(
+      curandCreateGenerator(&ctx_.curand_generator, CURAND_RNG_PSEUDO_DEFAULT));
+  auto seed = std::chrono::system_clock::now().time_since_epoch().count();
+  SetRandSeed(seed);
+  // TODO(wangwei) if one generator per stream, then need diff offset per gen?
+  CURAND_CHECK(curandSetGeneratorOffset(ctx_.curand_generator, 0));
+  CUBLAS_CHECK(cublasCreate(&(ctx_.cublas_handle)));
+
+#ifdef USE_CUDNN
+  // TODO(wangwei) create one handle for each stream?
+  auto status = cudnnCreate(&ctx_.cudnn_handle);
+  CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
+#endif  // USE_CUDNN
+
+	// initialize memory management for cuda devices
+	string memoryPoolType = mem_conf.type();
+	if(memoryPoolType.compare("cnmem") == 0) {
+		pool = new CnMemPool();
+		int num_devices = mem_conf.num_devices();
+		size_t alloc_size = mem_conf.alloc_size();
+		unsigned flag = mem_conf.cnmemflag();
+		((CnMemPool*)pool)->InitPool(num_devices, alloc_size, flag);
+	}
+	else {
+		pool = new CudaMemPool();
+	}
 }
 
 void CudaGPU::SetRandSeed(unsigned seed) {
@@ -90,7 +134,8 @@ void CudaGPU::CopyToFrom(void* dst, const void* src, size_t nBytes,
 void* CudaGPU::Malloc(int size) {
   void* ptr = nullptr;
   if (size > 0) {
-    CUDA_CHECK(cudaMalloc(&ptr, size));
+		//CUDA_CHECK(cudaMalloc((void**)&ptr,size));
+		pool->Malloc((void**)&ptr,size);
     CUDA_CHECK(cudaMemset(ptr, 0, size));
   }
   return ptr;
@@ -98,8 +143,14 @@ void* CudaGPU::Malloc(int size) {
 
   /// Free cpu memory.
 void CudaGPU::Free(void* ptr) {
-  if (ptr != nullptr)
-    CUDA_CHECK(cudaFree(ptr));
+	LOG(INFO) << "Cuda free is called";
+	LOG(INFO) << "pool pointer" << pool << "\n";
+	LOG(INFO) << "pool status:" << ((CnMemPool*)pool)->status;
+  if (ptr != nullptr) {
+		//CUDA_CHECK(cudaFree(ptr));
+		pool->Free(ptr);
+	}
+	LOG(INFO) << "free memory succeeded";
 }
 
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/077d13e8/src/core/memory/memory.cc
----------------------------------------------------------------------
diff --git a/src/core/memory/memory.cc b/src/core/memory/memory.cc
index a1cf5db..c5878a6 100644
--- a/src/core/memory/memory.cc
+++ b/src/core/memory/memory.cc
@@ -18,3 +18,72 @@
 
 
 #include "singa/core/memory.h"
+#include "singa/utils/logging.h"
+#include <iostream>
+
+namespace singa {
+
+bool singa::CnMemPool::initialized = false;
+std::mutex singa::CnMemPool::mtx;
+
+void CnMemPool::InitPool(int numDevices, size_t initSize, unsigned flag) {
+	mtx.lock();
+	if(!initialized) {
+		CHECK_GE(numDevices, 1);
+		cnmemDevice_t* settingPtr = new cnmemDevice_t[numDevices];
+		for(int i = 0; i < numDevices; i++) {
+			settingPtr[i].device = i;
+			settingPtr[i].size = initSize;
+			settingPtr[i].numStreams = 0;
+			settingPtr[i].streams = NULL;
+			settingPtr[i].streamSizes = 0;
+		}
+		cnmemStatus_t status = cnmemInit(numDevices, settingPtr, flag);
+		CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS) << " " << cnmemGetErrorString(status);
+		delete[] settingPtr;
+		initialized = true;
+	}
+	mtx.unlock();
+}
+
+void CnMemPool::InitPool() {
+		int defaultNumDevices = 1;
+		size_t defaultSize = 1000000U;
+		InitPool(defaultNumDevices,defaultSize,cnmemManagerFlags_t::CNMEM_FLAGS_DEFAULT);
+}
+
+CnMemPool::~CnMemPool() {
+	mtx.lock();
+	if(initialized) {
+		cnmemStatus_t status = cnmemFinalize();
+		CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS) << " " << cnmemGetErrorString(status);
+		initialized = false;
+	}
+	mtx.unlock();
+	LOG(INFO) << "cnmem has been freed";
+}
+
+
+void CnMemPool::Malloc(void** ptr, const size_t size) {
+	cnmemStatus_t status = cnmemMalloc(ptr,size,NULL);
+	CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS) << " " << cnmemGetErrorString(status);
+}
+
+void CnMemPool::Free(void* ptr) {
+	LOG(INFO) << "cnmem free is called !!!!!!!!!!!";
+	cnmemStatus_t status = cnmemFree(ptr,NULL);
+	CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS) << " " << cnmemGetErrorString(status);
+	LOG(INFO) << "cnmem free is terminated";
+}
+
+void CudaMemPool::Malloc(void** ptr, const size_t size) {
+	cudaError_t status = cudaMalloc(ptr,size);
+	CHECK_EQ(status, cudaError_t::cudaSuccess);
+}
+
+void CudaMemPool::Free(void* ptr) {
+	cudaError_t status = cudaFree(ptr);
+	CHECK_EQ(status, cudaError_t::cudaSuccess);
+}
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/077d13e8/src/proto/core.proto
----------------------------------------------------------------------
diff --git a/src/proto/core.proto b/src/proto/core.proto
index 88d7f12..cf6e193 100644
--- a/src/proto/core.proto
+++ b/src/proto/core.proto
@@ -44,3 +44,16 @@ enum CopyDirection {
   kDeviceToDevice = 3;
   kNumDirection = 4;
 }
+
+// configuration for device memory pool
+message MemPoolConf {
+	optional string type = 1 [default = "cnmem"];
+	optional uint32 num_devices = 2 [default = 1];
+	// allocation size for each device
+	optional uint32 alloc_size = 3 [default = 10000000];
+	// memory manager flag for cnmem
+	// cnmemflag = 0: default flag
+	// cnmemflag = 1: prevent the manager from growing its memory consumption
+	// cnmemflag = 2: prevent the manager from stealing memory
+	optional uint32 cnmemflag = 4 [default = 0];
+}
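
A rough sketch of driving the new CudaGPU constructor through MemPoolConf (not
part of this commit; field values are illustrative and mirror the defaults
above):

  #include "singa/core/device.h"  // assumed to pull in the generated MemPoolConf

  int main() {
    singa::MemPoolConf conf;
    conf.set_type("cnmem");         // any other value falls back to CudaMemPool
    conf.set_num_devices(1);
    conf.set_alloc_size(10000000);  // bytes reserved per device
    conf.set_cnmemflag(0);          // default cnmem manager flag
    singa::CudaGPU dev(conf);       // id, num_executors, scheduler keep their defaults
    return 0;
  }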

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/077d13e8/test/singa/test_memory.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_memory.cc b/test/singa/test_memory.cc
new file mode 100644
index 0000000..f5e464d
--- /dev/null
+++ b/test/singa/test_memory.cc
@@ -0,0 +1,111 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+* 
+*   http://www.apache.org/licenses/LICENSE-2.0
+* 
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include "singa/utils/logging.h"
+#include "singa/core/memory.h"
+#include "singa/singa_config.h"
+#include <sys/time.h>
+
+#ifdef USE_CUDA
+TEST(CnmemPool, PoolInit) {
+	singa::CnMemPool pool;
+	pool.InitPool();
+}
+
+TEST(CnmemPool, PoolInitAll) {
+	singa::CnMemPool pool;
+	int nDevices;
+	cudaGetDeviceCount(&nDevices);
+	CHECK_GE(nDevices,1);
+	pool.InitPool(nDevices,1000000U,0);
+}
+
+TEST(CnmemPool, UsePool) {
+	singa::CnMemPool pool;
+	pool.InitPool();
+	int numOfTests = 10;
+	int numOfWriteVsRead = 3;
+	int allocSize = 1000000U;
+	for(int i = 0; i < numOfTests; i++) {
+		int** memPtrs = new int*[numOfWriteVsRead];
+		for(int j = 0; j < numOfWriteVsRead; j++) {
+			pool.Malloc((void**)(&memPtrs[j]), allocSize); 
+		}
+		pool.Free(memPtrs[0]);
+		delete[] memPtrs;
+	}
+}
+
+TEST(CudaMemPool, UsePool) {
+	singa::CudaMemPool pool;
+	int numOfTests = 10;
+	int numOfWriteVsRead = 3;
+	int allocSize = 1000000U;
+	for(int i = 0; i < numOfTests; i++) {
+		int** memPtrs = new int*[numOfWriteVsRead];
+		for(int j = 0; j < numOfWriteVsRead; j++) {
+			pool.Malloc((void**)(&memPtrs[j]), allocSize); 
+		}
+		pool.Free(memPtrs[0]);
+		delete[] memPtrs;
+	}
+}
+
+TEST(MemPool, CompareCudaCnmem) {
+	singa::CudaMemPool cudaPool;
+	singa::CnMemPool cnPool;
+	cnPool.InitPool();
+
+	int numOfTests = 10000;
+	int allocSize = 1000000U;
+	struct timeval start,end;
+	double t1,t2;
+
+	singa::DeviceMemPool* pool = NULL;
+	pool = &cnPool;
+	
+	gettimeofday(&start,NULL);
+	for(int i = 0; i < numOfTests; i++) {
+		int* memPtrs = NULL;
+		pool->Malloc((void**)&memPtrs, allocSize); 	
+		pool->Free(memPtrs);
+	}
+	gettimeofday(&end,NULL);
+	
+	t1 = start.tv_sec * 1000 + start.tv_usec/1000;
+	t2 = end.tv_sec * 1000 + end.tv_usec/1000;
+	LOG(INFO) << "cnmem time: " << t2-t1 << " ms" << std::endl;
+
+	pool = &cudaPool;
+	gettimeofday(&start,NULL);
+	for(int i = 0; i < numOfTests; i++) {
+		int* memPtrs = NULL;
+		pool->Malloc((void**)&memPtrs, allocSize); 
+		pool->Free(memPtrs);
+	}
+	gettimeofday(&end,NULL);
+	
+	t1 = start.tv_sec * 1000 + start.tv_usec/1000;
+	t2 = end.tv_sec * 1000 + end.tv_usec/1000;
+	LOG(INFO) << "cuda time: " << t2-t1 << " ms" << std::endl;
+}
+#endif // USE_CUDA

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/077d13e8/test/singa/test_mse.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_mse.cc b/test/singa/test_mse.cc
index 2c02273..7c6066e 100644
--- a/test/singa/test_mse.cc
+++ b/test/singa/test_mse.cc
@@ -68,11 +68,11 @@ TEST_F(TestMSE, CppBackward) {
 #endif
 #ifdef USE_CUDA
 TEST_F(TestMSE, CudaForward) {
-  singa::MSE mse;
+  singa::MSE* mse = new singa::MSE();
   singa::CudaGPU dev;
   p.ToDevice(&dev);
   t.ToDevice(&dev);
-  Tensor loss = mse.Forward(p, t);
+  Tensor loss = mse->Forward(p, t);
 
   loss.ToHost();
   auto ldat = loss.data<const float*>();
@@ -85,6 +85,12 @@ TEST_F(TestMSE, CudaForward) {
     }
     EXPECT_FLOAT_EQ(ldat[i], 0.5 * l);
   }
+	LOG(INFO) << "Before delete pxxxxxxxxxxxxxxxxxxxxxxxx";
+	p.ToHost();
+	LOG(INFO) << "Before delete tyyyyyyyyyyyyyyyyyyyyyyy";
+	t.ToHost();
+	LOG(INFO) << "terminate-xxxxxxxxxxxxxxxxxx-";
+	delete mse;
 }
 TEST_F(TestMSE, CudaBackward) {
   singa::MSE mse;
@@ -98,5 +104,8 @@ TEST_F(TestMSE, CudaBackward) {
 
   for (size_t i = 0; i < grad.Size(); i++)
     EXPECT_FLOAT_EQ(gdat[i], (1.0f / p.shape().at(0)) * (pdat[i] - tdat[i]));
+	p.ToHost();
+	t.ToHost();
+
 }
 #endif

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/077d13e8/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor_math.cc b/test/singa/test_tensor_math.cc
index 170b96c..b18e465 100644
--- a/test/singa/test_tensor_math.cc
+++ b/test/singa/test_tensor_math.cc
@@ -302,6 +302,8 @@ TEST_F(TestTensorMath, MultCuda) {
       EXPECT_FLOAT_EQ(oPtr[i * 4 + j], x[i]);
     }
   }
+	d.ToHost();
+	p.ToHost();
 }
 
 TEST_F(TestTensorMath, AddColumnCuda) {
@@ -479,6 +481,7 @@ TEST_F(TestTensorMath, SumRowsCuda) {
     }
     EXPECT_FLOAT_EQ(tptr[i], tmp);
   }
+	d.ToHost();
 }
 TEST_F(TestTensorMath, SumColumnCuda) {
   singa::CudaGPU dev;
@@ -495,5 +498,6 @@ TEST_F(TestTensorMath, SumColumnCuda) {
     }
     EXPECT_FLOAT_EQ(tptr[i], tmp);
   }
+	d.ToHost();
 }
 #endif