You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by zh...@apache.org on 2019/05/25 01:39:17 UTC

[incubator-mxnet] branch master updated: Allow clearing gpu cache (#14252)

This is an automated email from the ASF dual-hosted git repository.

zhasheng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git


The following commit(s) were added to refs/heads/master by this push:
     new db2295b  Allow clearing gpu cache (#14252)
db2295b is described below

commit db2295b0f39852c3af2878932f48a672982f42d4
Author: vlado <vl...@gmail.com>
AuthorDate: Fri May 24 19:38:40 2019 -0600

    Allow clearing gpu cache (#14252)
    
    * Allow releasing all gpu memory
    
    * fix white space
    
    * stuck ci checks
    
    * Fix whitespace
    
    * Rename release_all -> empty_cache and provide documentation
    
    * fix indentation
    
    * Rename c_api's MXStorageReleaseAll -> MXStorageEmptyCache and clarify documentation
    
    * nudge ci
    
    * Update context.py
---
 include/mxnet/c_api.h                |  6 ++++++
 include/mxnet/storage.h              |  8 ++++++++
 python/mxnet/context.py              | 18 ++++++++++++++++++
 src/c_api/c_api.cc                   |  7 +++++++
 src/storage/pooled_storage_manager.h |  6 ++++--
 src/storage/storage.cc               | 11 +++++++++++
 src/storage/storage_manager.h        |  8 ++++++++
 7 files changed, 62 insertions(+), 2 deletions(-)

diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index 335154c..1c2300a 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -2746,6 +2746,12 @@ MXNET_DLL int MXNDArrayGetSharedMemHandle(NDArrayHandle handle, int* shared_pid,
 MXNET_DLL int MXNDArrayCreateFromSharedMem(int shared_pid, int shared_id, const mx_uint *shape,
                                            mx_uint ndim, int dtype, NDArrayHandle *out);
 
+/*!
+ * \brief Release all unreferenced memory from the device's storage manager's memory pool
+ * \param dev_type device type of the device whose cache should be emptied
+ * \param dev_id the device id of the specific device
+ */
+MXNET_DLL int MXStorageEmptyCache(int dev_type, int dev_id);
 
 /*!
  * \brief Reconstruct NDArray from shared memory handle
diff --git a/include/mxnet/storage.h b/include/mxnet/storage.h
index a8481c1..4d1fc3d 100644
--- a/include/mxnet/storage.h
+++ b/include/mxnet/storage.h
@@ -96,6 +96,14 @@ class Storage {
    */
   virtual void DirectFree(Handle handle) = 0;
   /*!
+  * \brief Release all memory from device if using a pooled storage manager
+  *
+  * This releases all memory from pooled storage managers such as
+  * GPUPooledStorageManager and GPUPooledRoundedStorageManager.
+  * For non-pool memory managers this has no effect.
+  */
+  virtual void ReleaseAll(Context ctx) = 0;
+  /*!
    * \brief Destructor.
    */
   virtual ~Storage() {}
diff --git a/python/mxnet/context.py b/python/mxnet/context.py
index 15ea990..f284e00 100644
--- a/python/mxnet/context.py
+++ b/python/mxnet/context.py
@@ -145,6 +145,24 @@ class Context(with_metaclass(_MXClassPropertyMetaClass, object)):
         cls._default_ctx.value = val
     #pylint: enable=no-self-argument
 
+    def empty_cache(self):
+        """Empties the memory cache for the current context's device.
+
+        MXNet utilizes a memory pool to avoid excessive allocations.
+        Calling empty_cache will empty the memory pool of the context's
+        device. This will only free the memory of the unreferenced data.
+
+        Examples
+        --------
+        >>> ctx = mx.gpu(0)
+        >>> arr = mx.nd.ones((200,200), ctx=ctx)
+        >>> del arr
+        >>> ctx.empty_cache() # forces release of memory allocated for arr
+        """
+        dev_type = ctypes.c_int(self.device_typeid)
+        dev_id = ctypes.c_int(self.device_id)
+        check_call(_LIB.MXStorageEmptyCache(dev_type, dev_id))
+
 # initialize the default context in Context
 Context._default_ctx.value = Context('cpu', 0)
 
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 536c535..7f8d5f5 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -1528,3 +1528,10 @@ int MXEnginePushSync(EngineSyncFunc sync_func, void* func_param,
 
   API_END();
 }
+
+int MXStorageEmptyCache(int dev_type, int dev_id) {
+  API_BEGIN();
+  Context ctx = Context::Create(static_cast<Context::DeviceType>(dev_type), dev_id);
+  Storage::Get()->ReleaseAll(ctx);
+  API_END();
+}
diff --git a/src/storage/pooled_storage_manager.h b/src/storage/pooled_storage_manager.h
index 7726bc6..91eb536 100644
--- a/src/storage/pooled_storage_manager.h
+++ b/src/storage/pooled_storage_manager.h
@@ -85,6 +85,8 @@ class GPUPooledStorageManager final : public StorageManager {
     DirectFreeNoLock(handle);
   }
 
+  void ReleaseAll() override;
+
  private:
   void DirectFreeNoLock(Storage::Handle handle) {
     mxnet::common::cuda::DeviceStore device_store(handle.ctx.real_dev_id(), true);
@@ -115,7 +117,6 @@ class GPUPooledStorageManager final : public StorageManager {
   }
 
  private:
-  void ReleaseAll();
   // used memory
   size_t used_memory_ = 0;
   // page size
@@ -250,6 +251,8 @@ class GPUPooledRoundedStorageManager final : public StorageManager {
     DirectFreeNoLock(handle);
   }
 
+  void ReleaseAll() override;
+
  private:
   inline int div_pow2_round_up(size_t s, int divisor_log2) {
     // (1025, 10) -> 2
@@ -284,7 +287,6 @@ class GPUPooledRoundedStorageManager final : public StorageManager {
   }
 
  private:
-  void ReleaseAll();
   // number of devices
   const int NDEV = 32;
   // log2 of maximum page size. 16GB
diff --git a/src/storage/storage.cc b/src/storage/storage.cc
index 0ca5ef7..7a59a77 100644
--- a/src/storage/storage.cc
+++ b/src/storage/storage.cc
@@ -39,6 +39,7 @@ class StorageImpl : public Storage {
   void Alloc(Handle* handle) override;
   void Free(Handle handle) override;
   void DirectFree(Handle handle) override;
+  void ReleaseAll(Context ctx) override;
   void SharedIncrementRefCount(Handle handle) override;
   StorageImpl() {}
   virtual ~StorageImpl() = default;
@@ -162,6 +163,16 @@ void StorageImpl::DirectFree(Storage::Handle handle) {
   profiler_.OnFree(handle);
 }
 
+void StorageImpl::ReleaseAll(Context ctx) {
+  auto&& device = storage_managers_.at(ctx.dev_type);
+  std::shared_ptr<storage::StorageManager> manager = device.Get(
+    ctx.real_dev_id(), []() {
+    LOG(FATAL) << "Cannot Free space to a device you have not allocated";
+    return nullptr;
+  });
+  manager->ReleaseAll();
+}
+
 void StorageImpl::SharedIncrementRefCount(Storage::Handle handle) {
   CHECK_EQ(handle.ctx.dev_type, Context::kCPUShared);
   auto&& device = storage_managers_.at(Context::kCPUShared);
diff --git a/src/storage/storage_manager.h b/src/storage/storage_manager.h
index d17dc91..13be16e 100644
--- a/src/storage/storage_manager.h
+++ b/src/storage/storage_manager.h
@@ -53,6 +53,14 @@ class StorageManager {
    */
   virtual void DirectFree(Storage::Handle handle) = 0;
   /*!
+  * \brief Release all memory if using a pool storage manager
+  *
+  * This releases all memory from pooled storage managers such as
+  * GPUPooledStorageManager and GPUPooledRoundedStorageManager.
+  * For non-pool memory managers this has no effect.
+  */
+  virtual void ReleaseAll() {}
+  /*!
    * \brief Destructor.
    */
   virtual ~StorageManager() = default;