You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by zh...@apache.org on 2019/05/25 01:39:17 UTC
[incubator-mxnet] branch master updated: Allow clearing gpu cache
(#14252)
This is an automated email from the ASF dual-hosted git repository.
zhasheng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new db2295b Allow clearing gpu cache (#14252)
db2295b is described below
commit db2295b0f39852c3af2878932f48a672982f42d4
Author: vlado <vl...@gmail.com>
AuthorDate: Fri May 24 19:38:40 2019 -0600
Allow clearing gpu cache (#14252)
* Allow releasing all gpu memory
* fix white space
* stuck ci checks
* Fix whitespace
* Rename release_all -> empty_cache and provide documentation
* fix indentation
* Rename c_api's MXStorageReleaseAll -> MXStorageEmptyCache and clarify documentation
* nudge ci
* Update context.py
---
include/mxnet/c_api.h | 6 ++++++
include/mxnet/storage.h | 8 ++++++++
python/mxnet/context.py | 18 ++++++++++++++++++
src/c_api/c_api.cc | 7 +++++++
src/storage/pooled_storage_manager.h | 6 ++++--
src/storage/storage.cc | 11 +++++++++++
src/storage/storage_manager.h | 8 ++++++++
7 files changed, 62 insertions(+), 2 deletions(-)
diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index 335154c..1c2300a 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -2746,6 +2746,12 @@ MXNET_DLL int MXNDArrayGetSharedMemHandle(NDArrayHandle handle, int* shared_pid,
MXNET_DLL int MXNDArrayCreateFromSharedMem(int shared_pid, int shared_id, const mx_uint *shape,
mx_uint ndim, int dtype, NDArrayHandle *out);
+/*!
+ * \brief Release all unreferenced memory from the device's storage manager's memory pool
+ * \param dev_type device type, specifying the kind of device whose memory pool is emptied
+ * \param dev_id the device id of the specific device
+ */
+MXNET_DLL int MXStorageEmptyCache(int dev_type, int dev_id);
/*!
* \brief Reconstruct NDArray from shared memory handle
diff --git a/include/mxnet/storage.h b/include/mxnet/storage.h
index a8481c1..4d1fc3d 100644
--- a/include/mxnet/storage.h
+++ b/include/mxnet/storage.h
@@ -96,6 +96,14 @@ class Storage {
*/
virtual void DirectFree(Handle handle) = 0;
/*!
+ * \brief Release all memory from device if using a pooled storage manager
+ *
+ * This releases all memory from pool storage managers such as
+ * GPUPooledStorageManager and GPUPooledRoundedStorageManager.
+ * For non-pool memory managers this has no effect.
+ */
+ virtual void ReleaseAll(Context ctx) = 0;
+ /*!
* \brief Destructor.
*/
virtual ~Storage() {}
diff --git a/python/mxnet/context.py b/python/mxnet/context.py
index 15ea990..f284e00 100644
--- a/python/mxnet/context.py
+++ b/python/mxnet/context.py
@@ -145,6 +145,24 @@ class Context(with_metaclass(_MXClassPropertyMetaClass, object)):
cls._default_ctx.value = val
#pylint: enable=no-self-argument
+ def empty_cache(self):
+ """Empties the memory cache for the current context's device.
+
+ MXNet utilizes a memory pool to avoid excessive allocations.
+ Calling empty_cache will empty the memory pool of the context's
+ device. This will only free the memory of the unreferenced data.
+
+ Examples
+ --------
+ >>> ctx = mx.gpu(0)
+ >>> arr = mx.nd.ones((200,200), ctx=ctx)
+ >>> del arr
+ >>> ctx.empty_cache() # forces release of memory allocated for arr
+ """
+ dev_type = ctypes.c_int(self.device_typeid)
+ dev_id = ctypes.c_int(self.device_id)
+ check_call(_LIB.MXStorageEmptyCache(dev_type, dev_id))
+
# initialize the default context in Context
Context._default_ctx.value = Context('cpu', 0)
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 536c535..7f8d5f5 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -1528,3 +1528,10 @@ int MXEnginePushSync(EngineSyncFunc sync_func, void* func_param,
API_END();
}
+
+int MXStorageEmptyCache(int dev_type, int dev_id) {
+ API_BEGIN();
+ Context ctx = Context::Create(static_cast<Context::DeviceType>(dev_type), dev_id);
+ Storage::Get()->ReleaseAll(ctx);
+ API_END();
+}
diff --git a/src/storage/pooled_storage_manager.h b/src/storage/pooled_storage_manager.h
index 7726bc6..91eb536 100644
--- a/src/storage/pooled_storage_manager.h
+++ b/src/storage/pooled_storage_manager.h
@@ -85,6 +85,8 @@ class GPUPooledStorageManager final : public StorageManager {
DirectFreeNoLock(handle);
}
+ void ReleaseAll() override;
+
private:
void DirectFreeNoLock(Storage::Handle handle) {
mxnet::common::cuda::DeviceStore device_store(handle.ctx.real_dev_id(), true);
@@ -115,7 +117,6 @@ class GPUPooledStorageManager final : public StorageManager {
}
private:
- void ReleaseAll();
// used memory
size_t used_memory_ = 0;
// page size
@@ -250,6 +251,8 @@ class GPUPooledRoundedStorageManager final : public StorageManager {
DirectFreeNoLock(handle);
}
+ void ReleaseAll() override;
+
private:
inline int div_pow2_round_up(size_t s, int divisor_log2) {
// (1025, 10) -> 2
@@ -284,7 +287,6 @@ class GPUPooledRoundedStorageManager final : public StorageManager {
}
private:
- void ReleaseAll();
// number of devices
const int NDEV = 32;
// log2 of maximum page size. 16GB
diff --git a/src/storage/storage.cc b/src/storage/storage.cc
index 0ca5ef7..7a59a77 100644
--- a/src/storage/storage.cc
+++ b/src/storage/storage.cc
@@ -39,6 +39,7 @@ class StorageImpl : public Storage {
void Alloc(Handle* handle) override;
void Free(Handle handle) override;
void DirectFree(Handle handle) override;
+ void ReleaseAll(Context ctx) override;
void SharedIncrementRefCount(Handle handle) override;
StorageImpl() {}
virtual ~StorageImpl() = default;
@@ -162,6 +163,16 @@ void StorageImpl::DirectFree(Storage::Handle handle) {
profiler_.OnFree(handle);
}
+void StorageImpl::ReleaseAll(Context ctx) {
+ auto&& device = storage_managers_.at(ctx.dev_type);
+ std::shared_ptr<storage::StorageManager> manager = device.Get(
+ ctx.real_dev_id(), []() {
+ LOG(FATAL) << "Cannot Free space to a device you have not allocated";
+ return nullptr;
+ });
+ manager->ReleaseAll();
+}
+
void StorageImpl::SharedIncrementRefCount(Storage::Handle handle) {
CHECK_EQ(handle.ctx.dev_type, Context::kCPUShared);
auto&& device = storage_managers_.at(Context::kCPUShared);
diff --git a/src/storage/storage_manager.h b/src/storage/storage_manager.h
index d17dc91..13be16e 100644
--- a/src/storage/storage_manager.h
+++ b/src/storage/storage_manager.h
@@ -53,6 +53,14 @@ class StorageManager {
*/
virtual void DirectFree(Storage::Handle handle) = 0;
/*!
+ * \brief Release all memory if using a pool storage manager
+ *
+ * This releases all memory from pool storage managers such as
+ * GPUPooledStorageManager and GPUPooledRoundedStorageManager.
+ * For non-pool memory managers this has no effect.
+ */
+ virtual void ReleaseAll() {}
+ /*!
* \brief Destructor.
*/
virtual ~StorageManager() = default;