Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2019/01/05 01:55:25 UTC

[GitHub] eric-haibin-lin closed pull request #13764: Less cudaGet/SetDevice calls in Gluon execution

URL: https://github.com/apache/incubator-mxnet/pull/13764

This is a PR merged from a forked repository. Since GitHub hides the
original diff once a foreign pull request is merged, it is reproduced
below for the sake of provenance:

diff --git a/src/common/cuda_utils.h b/src/common/cuda_utils.h
index 047edde88a5..0dd9d2db372 100644
--- a/src/common/cuda_utils.h
+++ b/src/common/cuda_utils.h
@@ -286,22 +286,35 @@ inline DType __device__ CudaMin(DType a, DType b) {
 class DeviceStore {
  public:
   /*! \brief default constructor- only optionally restores previous device */
-  explicit DeviceStore(bool restore = true) : restore_(restore) {
+  explicit DeviceStore(int requested_device = -1, bool restore = true) :
+    restore_device_(-1),
+    current_device_(requested_device),
+    restore_(restore) {
     if (restore_)
       CUDA_CALL(cudaGetDevice(&restore_device_));
+    if (requested_device != restore_device_) {
+      SetDevice(requested_device);
+    }
   }
 
   ~DeviceStore() {
-    if (restore_)
+    if (restore_ &&
+        current_device_ != restore_device_ &&
+        current_device_ != -1 &&
+        restore_device_ != -1)
       CUDA_CALL(cudaSetDevice(restore_device_));
   }
 
   void SetDevice(int device) {
-    CUDA_CALL(cudaSetDevice(device));
+    if (device != -1) {
+      CUDA_CALL(cudaSetDevice(device));
+      current_device_ = device;
+    }
   }
 
  private:
   int restore_device_;
+  int current_device_;
   bool restore_;
 };
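
To summarize the new contract (an editorial sketch, not part of the PR):
DeviceStore is now a scope guard that takes the target device at
construction, switches to it only if it differs from the current device,
and restores the previous device on destruction, again only if something
actually changed. Assuming the class above is in scope:

    #include <cuda_runtime.h>
    // assumes DeviceStore from src/common/cuda_utils.h as patched above

    void RunOnDevice(int dev_id) {
      // Constructor: one cudaGetDevice, plus one cudaSetDevice only if
      // dev_id differs from the device that is already current.
      mxnet::common::cuda::DeviceStore device_store(dev_id);
      // ... CUDA work on dev_id ...
    }   // Destructor: one cudaSetDevice, only if the device was changed.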
 
diff --git a/src/engine/stream_manager.h b/src/engine/stream_manager.h
index d4ac042ff40..516e04bf5e8 100644
--- a/src/engine/stream_manager.h
+++ b/src/engine/stream_manager.h
@@ -65,9 +65,6 @@ template <std::size_t kNumGpus, std::size_t kStreams>
 RunContext StreamManager<kNumGpus, kStreams>::GetRunContext(
     Context const& ctx) {
   RunContext ret;
-#if MXNET_USE_CUDA
-  mxnet::common::cuda::DeviceStore device_store;
-#endif
   switch (ctx.dev_mask()) {
     case cpu::kDevMask:
       ret = RunContext{ctx, nullptr};
@@ -75,11 +72,11 @@ RunContext StreamManager<kNumGpus, kStreams>::GetRunContext(
     case gpu::kDevMask: {
 #if MXNET_USE_CUDA
       std::size_t use_counter;
-      device_store.SetDevice(ctx.dev_id);
       {
         std::lock_guard<std::mutex> lock{mutex_};
         auto&& counter = gpu_cnt_.at(ctx.dev_id);
         if (counter == -1) {
+          mxnet::common::cuda::DeviceStore device_store(ctx.dev_id);
           for (auto&& i : gpu_streams_.at(ctx.dev_id)) {
             i = mshadow::NewStream<gpu>(true, MXNET_USE_CUDNN != 0, ctx.dev_id);
           }
@@ -104,19 +101,16 @@ template <std::size_t kNumGpus, std::size_t kStreams>
 RunContext StreamManager<kNumGpus, kStreams>::GetIORunContext(
     Context const& ctx) {
   RunContext ret;
-#if MXNET_USE_CUDA
-  mxnet::common::cuda::DeviceStore device_store;
-#endif
   switch (ctx.dev_mask()) {
     case cpu::kDevMask:
       ret = RunContext{ctx, nullptr};
       break;
     case gpu::kDevMask: {
 #if MXNET_USE_CUDA
-      device_store.SetDevice(ctx.dev_id);
       {
         std::lock_guard<std::mutex> lock{mutex_};
         if (gpu_io_streams_.at(ctx.dev_id) == nullptr) {
+          mxnet::common::cuda::DeviceStore device_store(ctx.dev_id);
           gpu_io_streams_.at(ctx.dev_id) = mshadow::NewStream<gpu>(false, false, ctx.dev_id);
         }
       }
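
In both GetRunContext and GetIORunContext the guard now lives inside the
one-time initialization branch, so once the streams exist the hot path
takes the mutex and returns the cached stream with no cudaGetDevice or
cudaSetDevice traffic at all. The shape of that pattern, as a hedged
sketch with invented names:

    #include <mutex>
    #include <cuda_runtime.h>

    std::mutex mutex_;
    cudaStream_t cached_stream_ = nullptr;   // hypothetical stream cache

    cudaStream_t GetStream(int dev_id) {
      std::lock_guard<std::mutex> lock(mutex_);
      if (cached_stream_ == nullptr) {
        // Cold path only: switch to dev_id, create the stream, restore.
        mxnet::common::cuda::DeviceStore device_store(dev_id);
        cudaStreamCreate(&cached_stream_);
      }
      return cached_stream_;   // hot path: no device calls
    }
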
diff --git a/src/engine/threaded_engine_pooled.cc b/src/engine/threaded_engine_pooled.cc
index 1abb82fd6a6..c6eb99508e0 100644
--- a/src/engine/threaded_engine_pooled.cc
+++ b/src/engine/threaded_engine_pooled.cc
@@ -31,6 +31,9 @@
 #include "./threaded_engine.h"
 #include "./thread_pool.h"
 #include "./stream_manager.h"
+#if MXNET_USE_CUDA
+#include "../common/cuda_utils.h"
+#endif
 
 namespace mxnet {
 namespace engine {
@@ -130,10 +133,13 @@ class ThreadedEnginePooled : public ThreadedEngine {
    * \param opr_block The operator block.
    */
   void DoExecute(OprBlock* opr_block) {
+#if MXNET_USE_CUDA
+    mxnet::common::cuda::DeviceStore device_store(-1, false);
+#endif
     assert(opr_block->wait.load() == 0);
     if (opr_block->ctx.dev_mask() == gpu::kDevMask) {
       #if MXNET_USE_CUDA
-      CUDA_CALL(cudaSetDevice(opr_block->ctx.dev_id));
+      device_store.SetDevice(opr_block->ctx.dev_id);
       #else   // MXNET_USE_CUDA
       LOG(FATAL) << "Please compile with CUDA enabled";
       #endif  // MXNET_USE_CUDA
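
Constructed as DeviceStore(-1, false), the guard performs no CUDA calls
at all at construction or destruction; the device is switched only if
the operator block turns out to be a GPU block, and the worker thread
deliberately keeps that device active afterwards. A sketch of this
deferred-switch usage (function and parameter names invented):

    // assumes DeviceStore from src/common/cuda_utils.h
    void DoExecuteSketch(bool is_gpu_block, int dev_id) {
      // -1: no device requested yet; false: do not restore on destruction.
      mxnet::common::cuda::DeviceStore device_store(-1, false);
      if (is_gpu_block) {
        device_store.SetDevice(dev_id);   // the only cudaSetDevice call
      }
      // ... execute the operator block on the now-current device ...
    }
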
diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h
index 7090aaf46d8..08f6155cb5b 100644
--- a/src/kvstore/comm.h
+++ b/src/kvstore/comm.h
@@ -724,10 +724,9 @@ class CommDevice : public Comm {
     int enabled = 0;
     std::vector<int> p2p(n*n);
 
-    // Restores active device to what it was before EnableP2P
-    mxnet::common::cuda::DeviceStore device_store;
     for (int i = 0; i < n; ++i) {
-     device_store.SetDevice(gpus[i]);
+      // Restores active device to what it was before EnableP2P
+      mxnet::common::cuda::DeviceStore device_store(gpus[i]);
       for (int j = 0; j < n; j++) {
         int access;
         cudaDeviceCanAccessPeer(&access, gpus[i], gpus[j]);
diff --git a/src/kvstore/comm_tree.h b/src/kvstore/comm_tree.h
index e3b2ad7f57d..b62228cd288 100644
--- a/src/kvstore/comm_tree.h
+++ b/src/kvstore/comm_tree.h
@@ -339,9 +339,8 @@ class CommDeviceTree : public CommDevice {
     int n = static_cast<int>(gpus.size());
     int enabled = 0;
     std::vector<int> p2p(n*n);
-    mxnet::common::cuda::DeviceStore device_store;
     for (int i = 0; i < n; ++i) {
-      device_store.SetDevice(gpus[i]);
+      mxnet::common::cuda::DeviceStore device_store(gpus[i]);
       for (int j = 0; j < n; j++) {
         int access;
         cudaDeviceCanAccessPeer(&access, gpus[i], gpus[j]);
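
comm.h and comm_tree.h get the same treatment: instead of one guard
driving repeated SetDevice calls, each pass of the P2P loop gets its own
guard that switches to gpus[i] and restores on the way out. Roughly, as
a sketch with error handling and the access bookkeeping elided:

    #include <vector>
    #include <cuda_runtime.h>

    void ProbePeerAccess(const std::vector<int>& gpus) {
      const int n = static_cast<int>(gpus.size());
      for (int i = 0; i < n; ++i) {
        // Sets gpus[i] for this iteration; restores the caller's device.
        mxnet::common::cuda::DeviceStore device_store(gpus[i]);
        for (int j = 0; j < n; ++j) {
          int access = 0;
          cudaDeviceCanAccessPeer(&access, gpus[i], gpus[j]);
          // ... record/enable peer access as in EnableP2P ...
        }
      }
    }
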
diff --git a/src/storage/cpu_device_storage.h b/src/storage/cpu_device_storage.h
index 43e98fe04a1..25ad61efb23 100644
--- a/src/storage/cpu_device_storage.h
+++ b/src/storage/cpu_device_storage.h
@@ -43,12 +43,12 @@ class CPUDeviceStorage {
    * \param size Size to allocate.
    * \return Pointer to the storage.
    */
-  inline static void* Alloc(size_t size);
+  inline static void* Alloc(Storage::Handle* handle);
   /*!
    * \brief Deallocation.
    * \param ptr Pointer to deallocate.
    */
-  inline static void Free(void* ptr);
+  inline static void Free(Storage::Handle handle);
 
  private:
   /*!
@@ -63,7 +63,8 @@ class CPUDeviceStorage {
 #endif
 };  // class CPUDeviceStorage
 
-inline void* CPUDeviceStorage::Alloc(size_t size) {
+inline void* CPUDeviceStorage::Alloc(Storage::Handle* handle) {
+  const size_t size = handle->size;
   void* ptr;
 #if _MSC_VER
   ptr = _aligned_malloc(size, alignment_);
@@ -75,7 +76,8 @@ inline void* CPUDeviceStorage::Alloc(size_t size) {
   return ptr;
 }
 
-inline void CPUDeviceStorage::Free(void* ptr) {
+inline void CPUDeviceStorage::Free(Storage::Handle handle) {
+  void * ptr = handle.dptr;
 #if _MSC_VER
   _aligned_free(ptr);
 #else
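
From here on the storage backends all move from Alloc(size_t)/Free(void*)
to Alloc(Storage::Handle*)/Free(Storage::Handle). The handle carries the
size, the pointer, and crucially the Context, which is what lets each
backend select its own device instead of trusting the caller to have set
it. A simplified stand-in for the relevant fields (the real
Storage::Handle has more):

    #include <cstddef>

    struct Context {
      int dev_id = 0;
      int real_dev_id() const { return dev_id; }  // simplified
    };

    struct Handle {             // simplified stand-in for Storage::Handle
      void*   dptr = nullptr;   // filled in by Alloc, consumed by Free
      size_t  size = 0;         // requested size in bytes
      Context ctx;              // where the memory should live
    };
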
diff --git a/src/storage/gpu_device_storage.h b/src/storage/gpu_device_storage.h
index 435c7e81d2a..562badb8752 100644
--- a/src/storage/gpu_device_storage.h
+++ b/src/storage/gpu_device_storage.h
@@ -46,17 +46,19 @@ class GPUDeviceStorage {
    * \param size Size to allocate.
    * \return Pointer to the storage.
    */
-  inline static void* Alloc(size_t size);
+  inline static void* Alloc(Storage::Handle* handle);
   /*!
    * \brief Deallocation.
    * \param ptr Pointer to deallocate.
    */
-  inline static void Free(void* ptr);
+  inline static void Free(Storage::Handle handle);
 };  // class GPUDeviceStorage
 
-inline void* GPUDeviceStorage::Alloc(size_t size) {
+inline void* GPUDeviceStorage::Alloc(Storage::Handle* handle) {
+  const size_t size = handle->size;
   void* ret = nullptr;
 #if MXNET_USE_CUDA
+  mxnet::common::cuda::DeviceStore device_store(handle->ctx.real_dev_id(), true);
 #if MXNET_USE_NCCL
   std::lock_guard<std::mutex> l(Storage::Get()->GetMutex(Context::kGPU));
 #endif  // MXNET_USE_NCCL
@@ -69,8 +71,10 @@ inline void* GPUDeviceStorage::Alloc(size_t size) {
   return ret;
 }
 
-inline void GPUDeviceStorage::Free(void* ptr) {
+inline void GPUDeviceStorage::Free(Storage::Handle handle) {
 #if MXNET_USE_CUDA
+  void * ptr = handle.dptr;
+  mxnet::common::cuda::DeviceStore device_store(handle.ctx.real_dev_id(), true);
 #if MXNET_USE_NCCL
   std::lock_guard<std::mutex> l(Storage::Get()->GetMutex(Context::kGPU));
 #endif  // MXNET_USE_NCCL
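
With the context in hand, GPUDeviceStorage becomes self-sufficient: it
pins the handle's device for the duration of the cudaMalloc/cudaFree and
restores afterwards, so no caller needs a preceding cudaSetDevice. The
Alloc shape, sketched against the simplified Handle above with error
handling reduced to a bare check:

    #include <cuda_runtime.h>

    void* GpuAllocSketch(Handle* handle) {
      // Switch to the handle's device for the allocation; restore on exit.
      mxnet::common::cuda::DeviceStore device_store(handle->ctx.real_dev_id(), true);
      void* ret = nullptr;
      if (cudaMalloc(&ret, handle->size) != cudaSuccess)
        return nullptr;   // the real code reports through CUDA_CALL
      return ret;
    }
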
diff --git a/src/storage/naive_storage_manager.h b/src/storage/naive_storage_manager.h
index b05b242a799..55112b5a82e 100644
--- a/src/storage/naive_storage_manager.h
+++ b/src/storage/naive_storage_manager.h
@@ -49,7 +49,7 @@ class NaiveStorageManager final : public StorageManager {
   void Free(Storage::Handle handle) override;
 
   void DirectFree(Storage::Handle handle) override {
-    DeviceStorage::Free(handle.dptr);
+    DeviceStorage::Free(handle);
   }
 
  private:
@@ -58,12 +58,12 @@ class NaiveStorageManager final : public StorageManager {
 
 template <class DeviceStorage>
 void NaiveStorageManager<DeviceStorage>::Alloc(Storage::Handle* handle) {
-  handle->dptr = DeviceStorage::Alloc(handle->size);
+  handle->dptr = DeviceStorage::Alloc(handle);
 }
 
 template <class DeviceStorage>
 void NaiveStorageManager<DeviceStorage>::Free(Storage::Handle handle) {
-  DeviceStorage::Free(handle.dptr);
+  DeviceStorage::Free(handle);
 }
 
 }  // namespace storage
diff --git a/src/storage/pinned_memory_storage.h b/src/storage/pinned_memory_storage.h
index e3fec2f4a06..c4ababbdc03 100644
--- a/src/storage/pinned_memory_storage.h
+++ b/src/storage/pinned_memory_storage.h
@@ -41,29 +41,33 @@ class PinnedMemoryStorage {
    * \param size Size to allocate.
    * \return Pointer to the storage.
    */
-  inline static void* Alloc(size_t size);
+  inline static void* Alloc(Storage::Handle* handle);
 
   /*!
    * \brief Deallocation.
    * \param ptr Pointer to deallocate.
    */
-  inline static void Free(void* ptr);
+  inline static void Free(Storage::Handle handle);
 };
 
-inline void* PinnedMemoryStorage::Alloc(size_t size) {
+inline void* PinnedMemoryStorage::Alloc(Storage::Handle* handle) {
   void* ret = nullptr;
+  const size_t size = handle->size;
 #if MXNET_USE_NCCL
   std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU));
 #endif
+  mxnet::common::cuda::DeviceStore device_store(handle->ctx.real_dev_id(), true);
   // make the memory available across all devices
   CUDA_CALL(cudaHostAlloc(&ret, size, cudaHostAllocPortable));
   return ret;
 }
 
-inline void PinnedMemoryStorage::Free(void* ptr) {
+inline void PinnedMemoryStorage::Free(Storage::Handle handle) {
+  void * ptr = handle.dptr;
 #if MXNET_USE_NCCL
   std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU));
 #endif
+  mxnet::common::cuda::DeviceStore device_store(handle.ctx.real_dev_id(), true);
   cudaError_t err = cudaFreeHost(ptr);
   // ignore unloading error, as memory has already been recycled
   if (err != cudaSuccess && err != cudaErrorCudartUnloading) {
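
A pinned-memory subtlety, noted editorially: cudaHostAllocPortable makes
the buffer accessible from every CUDA context, but cudaHostAlloc and
cudaFreeHost still execute against whatever device is current, so the
guard pins handle->ctx.real_dev_id() here as well, presumably to keep
these calls off whatever unrelated device a worker thread last used.
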
diff --git a/src/storage/pooled_storage_manager.h b/src/storage/pooled_storage_manager.h
index cade8d9495f..c407a9f00cb 100644
--- a/src/storage/pooled_storage_manager.h
+++ b/src/storage/pooled_storage_manager.h
@@ -84,6 +84,7 @@ class GPUPooledStorageManager final : public StorageManager {
 
  private:
   void DirectFreeNoLock(Storage::Handle handle) {
+    mxnet::common::cuda::DeviceStore device_store(handle.ctx.real_dev_id(), true);
     cudaError_t err = cudaFree(handle.dptr);
     size_t size = RoundAllocSize(handle.size);
     // ignore unloading error, as memory has already been recycled
@@ -132,6 +133,7 @@ void GPUPooledStorageManager::Alloc(Storage::Handle* handle) {
   size_t size = RoundAllocSize(handle->size);
   auto&& reuse_it = memory_pool_.find(size);
   if (reuse_it == memory_pool_.end() || reuse_it->second.size() == 0) {
+    mxnet::common::cuda::DeviceStore device_store(handle->ctx.real_dev_id(), true);
     size_t free, total;
     cudaMemGetInfo(&free, &total);
     if (free <= total * reserve_ / 100 || size > free - total * reserve_ / 100)
@@ -252,6 +254,7 @@ class GPUPooledRoundedStorageManager final : public StorageManager {
   }
 
   void DirectFreeNoLock(Storage::Handle handle) {
+    mxnet::common::cuda::DeviceStore device_store(handle.ctx.real_dev_id(), true);
     cudaError_t err = cudaFree(handle.dptr);
     size_t size = get_size(get_bucket(handle.size));
     // ignore unloading error, as memory has already been recycled
@@ -288,6 +291,7 @@ void GPUPooledRoundedStorageManager::Alloc(Storage::Handle* handle) {
   size_t size = get_size(bucket);
   auto&& reuse_pool = memory_pool_[bucket];
   if (reuse_pool.size() == 0) {
+    mxnet::common::cuda::DeviceStore device_store(handle->ctx.real_dev_id(), true);
     size_t free, total;
     cudaMemGetInfo(&free, &total);
     if (free <= total * reserve_ / 100 || size > free - total * reserve_ / 100)
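
In both pooled managers the guard is added only on the slow paths: the
real cudaMalloc on a pool miss and the real cudaFree in DirectFreeNoLock.
Serving a request out of memory_pool_ therefore involves no device calls
at all, which is where the steady-state savings in Gluon execution come
from. The fast/slow split, with pool details reduced to a hypothetical
lookup:

    #include <cstddef>

    void* PoolLookup(size_t size);   // hypothetical stand-in for memory_pool_

    void* PooledAllocSketch(Handle* handle) {
      if (void* hit = PoolLookup(handle->size))
        return hit;                  // fast path: no device calls
      // Pool miss: only now pay for the device switch and real allocation.
      mxnet::common::cuda::DeviceStore device_store(handle->ctx.real_dev_id(), true);
      void* ret = nullptr;
      cudaMalloc(&ret, handle->size);
      return ret;
    }
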
diff --git a/src/storage/storage.cc b/src/storage/storage.cc
index c7100a456d8..911d30cc3f0 100644
--- a/src/storage/storage.cc
+++ b/src/storage/storage.cc
@@ -48,35 +48,6 @@ class StorageImpl : public Storage {
   static int num_gpu_device;
 #endif  // MXNET_USE_CUDA
 
-  static void ActivateDevice(Context ctx) {
-    switch (ctx.dev_type) {
-      case Context::kCPU:
-        break;
-      case Context::kCPUPinned:
-#if MXNET_USE_CUDA
-        if (num_gpu_device > 0) {
-          CUDA_CALL(cudaSetDevice(ctx.real_dev_id()));
-        }
-#endif  // MXNET_USE_CUDA
-        break;
-      case Context::kCPUShared: {
-#if defined(ANDROID) || defined(__ANDROID__)
-        LOG(FATAL) << "Unimplemented device";
-#endif  // defined(ANDROID) || defined(__ANDROID__)
-      }
-        break;
-      case Context::kGPU: {
-#if MXNET_USE_CUDA
-          if (num_gpu_device > 0) {
-            CUDA_CALL(cudaSetDevice(ctx.real_dev_id()));
-          }
-#endif  // MXNET_USE_CUDA
-          break;
-        }
-      default:
-        LOG(FATAL) << "Unimplemented device";
-    }
-  }
   // internal storage managers
   std::array<common::LazyAllocArray<storage::StorageManager>,
              kMaxNumberOfDevices> storage_managers_;
@@ -100,6 +71,8 @@ void StorageImpl::Alloc(Storage::Handle* handle) {
           case Context::kCPUShared: {
 #if !defined(ANDROID) && !defined(__ANDROID__)
             ptr = new storage::CPUSharedStorageManager();
+#else
+            LOG(FATAL) << "Unimplemented device";
 #endif  // !defined(ANDROID) && !defined(__ANDROID__)
             break;
           }
@@ -149,13 +122,6 @@ void StorageImpl::Alloc(Storage::Handle* handle) {
         return ptr;
       });
 
-#if MXNET_USE_CUDA
-  // Will restore gpu device to before ActivateDevice if necessary
-  bool restore = handle->ctx.dev_type == Context::kCPUPinned ||
-                 handle->ctx.dev_type == Context::kGPU;
-  mxnet::common::cuda::DeviceStore device_store(restore);
-#endif
-  this->ActivateDevice(handle->ctx);
   manager->Alloc(handle);
   profiler_.OnAlloc(*handle);
 }
@@ -169,12 +135,6 @@ void StorageImpl::Free(Storage::Handle handle) {
         return nullptr;
       });
 
-#if MXNET_USE_CUDA
-  // Will restore gpu device to before ActivateDevice if necessary
-  bool restore = ctx.dev_type == Context::kCPUPinned || ctx.dev_type == Context::kGPU;
-  mxnet::common::cuda::DeviceStore device_store(restore);
-#endif
-  this->ActivateDevice(ctx);
   manager->Free(handle);
   profiler_.OnFree(handle);
 }
@@ -188,12 +148,6 @@ void StorageImpl::DirectFree(Storage::Handle handle) {
         return nullptr;
       });
 
-#if MXNET_USE_CUDA
-  // Will restore gpu device to before ActivateDevice if necessary
-  bool restore = ctx.dev_type == Context::kCPUPinned || ctx.dev_type == Context::kGPU;
-  mxnet::common::cuda::DeviceStore device_store(restore);
-#endif
-  this->ActivateDevice(ctx);
   manager->DirectFree(handle);
   profiler_.OnFree(handle);
 }
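
Net effect in storage.cc: with device selection pushed down into the
storage managers, StorageImpl drops ActivateDevice and the per-call
DeviceStore entirely, so CPU allocations no longer pay for a
cudaGetDevice on every Alloc/Free/DirectFree. The dispatcher reduces to
roughly this shape (a toy sketch reusing the simplified Handle above, not
the real class):

    struct StorageManager {                    // toy stand-in
      virtual void Alloc(Handle* handle) = 0;  // the manager switches
      virtual void Free(Handle handle) = 0;    // devices itself, if needed
      virtual ~StorageManager() = default;
    };

    void DispatchAlloc(StorageManager* manager, Handle* handle) {
      manager->Alloc(handle);        // device-agnostic routing
      // profiler_.OnAlloc(*handle); // profiling hook, as in storage.cc
    }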

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services