You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by zh...@apache.org on 2020/09/07 02:40:28 UTC

[incubator-mxnet] branch master updated: update runtime setting default values (#18987)

This is an automated email from the ASF dual-hosted git repository.

zhasheng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git


The following commit(s) were added to refs/heads/master by this push:
     new 04e394a  update runtime setting default values (#18987)
04e394a is described below

commit 04e394aa8f9f95439f8009e0ce13419b767ccc8a
Author: Sheng Zha <sz...@users.noreply.github.com>
AuthorDate: Sun Sep 6 19:39:20 2020 -0700

    update runtime setting default values (#18987)
---
 src/resource.cc                      |  4 ++--
 src/storage/pooled_storage_manager.h | 15 ++++++++++-----
 src/storage/storage.cc               |  3 ++-
 3 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/src/resource.cc b/src/resource.cc
index f4f9da2..ff5a9fb 100644
--- a/src/resource.cc
+++ b/src/resource.cc
@@ -96,9 +96,9 @@ class ResourceManagerImpl : public ResourceManager {
     cpu_temp_space_copy_ = dmlc::GetEnv("MXNET_CPU_TEMP_COPY", 4);
     gpu_temp_space_copy_ = dmlc::GetEnv("MXNET_GPU_TEMP_COPY", 1);
     cpu_native_rand_copy_ = dmlc::GetEnv("MXNET_CPU_PARALLEL_RAND_COPY", 1);
-    gpu_native_rand_copy_ = dmlc::GetEnv("MXNET_GPU_PARALLEL_RAND_COPY", 4);
+    gpu_native_rand_copy_ = dmlc::GetEnv("MXNET_GPU_PARALLEL_RAND_COPY", 1);
 #if MXNET_USE_CUDNN == 1
-    gpu_cudnn_dropout_state_copy_ = dmlc::GetEnv("MXNET_GPU_CUDNN_DROPOUT_STATE_COPY", 4);
+    gpu_cudnn_dropout_state_copy_ = dmlc::GetEnv("MXNET_GPU_CUDNN_DROPOUT_STATE_COPY", 1);
 #endif  // MXNET_USE_CUDNN == 1
     engine_ref_ = Engine::_GetSharedRef();
     storage_ref_ = Storage::_GetSharedRef();
diff --git a/src/storage/pooled_storage_manager.h b/src/storage/pooled_storage_manager.h
index 897725a..194a1a2 100644
--- a/src/storage/pooled_storage_manager.h
+++ b/src/storage/pooled_storage_manager.h
@@ -178,14 +178,19 @@ void PooledStorageManager<BucketingStrategy, StoringMethod>::Alloc(Storage::Hand
     void *ret = nullptr;
     auto e = contextHelper_->Malloc(&ret, roundSize);
     if (e) {
-      const std::string err(
+      // retry in case of fragmentation
+      ReleaseAllNoLock(false);
+      e = contextHelper_->Malloc(&ret, roundSize);
+      if (e) {
+        const std::string err(
 #if MXNET_USE_CUDA
-      dev_type_ == Context::kGPU?
-         cudaGetErrorString(static_cast<cudaError_t>(e)) :
+        dev_type_ == Context::kGPU?
+           cudaGetErrorString(static_cast<cudaError_t>(e)) :
 #endif
-         std::strerror(errno));
+           std::strerror(errno));
 
-      LOG(FATAL) << "Memory allocation failed " << err;
+        LOG(FATAL) << "Memory allocation failed " << err;
+      }
     }
 
     UNSET_DEVICE(device_store);
diff --git a/src/storage/storage.cc b/src/storage/storage.cc
index f359b30..d83860c 100644
--- a/src/storage/storage.cc
+++ b/src/storage/storage.cc
@@ -67,8 +67,9 @@ StorageManager *CreateStorageManager(const Context &ctx, const char *context,
                                      int num_gpu_device, std::string *pStrategy) {
   const auto env_var = env_var_name(context, pool_type);
   const char *type = getenv(env_var.c_str());
-  if (type == nullptr)
+  if (type == nullptr) {
     type = "Naive";   // default pool
+  }
 
   *pStrategy = type;
   StorageManager *ptr = nullptr;