You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by ha...@apache.org on 2019/01/05 01:55:42 UTC
[incubator-mxnet] branch master updated: Less cudaGet/SetDevice
calls in Gluon execution (#13764)
This is an automated email from the ASF dual-hosted git repository.
haibin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new 863fb86 Less cudaGet/SetDevice calls in Gluon execution (#13764)
863fb86 is described below
commit 863fb8626d47d472cb773acc2b4cd28c3f4caa48
Author: Przemyslaw Tredak <pt...@gmail.com>
AuthorDate: Fri Jan 4 17:55:23 2019 -0800
Less cudaGet/SetDevice calls in Gluon execution (#13764)
* Remove unnecessary cudaGetDevice/cudaSetDevice calls
* Fixes for the DeviceGuard
* Retrigger CI
* Fix for possible invalid device ordinal when using DeviceStore while
driver is unloading
* Fix for RTC when the driver API call is the first call
* Added DeviceStore to pooled engine
---
src/common/cuda_utils.h | 19 +++++++++++---
src/engine/stream_manager.h | 10 ++------
src/engine/threaded_engine_pooled.cc | 8 +++++-
src/kvstore/comm.h | 5 ++--
src/kvstore/comm_tree.h | 3 +--
src/storage/cpu_device_storage.h | 10 +++++---
src/storage/gpu_device_storage.h | 12 ++++++---
src/storage/naive_storage_manager.h | 6 ++---
src/storage/pinned_memory_storage.h | 12 ++++++---
src/storage/pooled_storage_manager.h | 4 +++
src/storage/storage.cc | 50 ++----------------------------------
11 files changed, 59 insertions(+), 80 deletions(-)
diff --git a/src/common/cuda_utils.h b/src/common/cuda_utils.h
index 047edde..0dd9d2d 100644
--- a/src/common/cuda_utils.h
+++ b/src/common/cuda_utils.h
@@ -286,22 +286,35 @@ inline DType __device__ CudaMin(DType a, DType b) {
class DeviceStore {
public:
/*! \brief default constructor- only optionally restores previous device */
- explicit DeviceStore(bool restore = true) : restore_(restore) {
+ explicit DeviceStore(int requested_device = -1, bool restore = true) :
+ restore_device_(-1),
+ current_device_(requested_device),
+ restore_(restore) {
if (restore_)
CUDA_CALL(cudaGetDevice(&restore_device_));
+ if (requested_device != restore_device_) {
+ SetDevice(requested_device);
+ }
}
~DeviceStore() {
- if (restore_)
+ if (restore_ &&
+ current_device_ != restore_device_ &&
+ current_device_ != -1 &&
+ restore_device_ != -1)
CUDA_CALL(cudaSetDevice(restore_device_));
}
void SetDevice(int device) {
- CUDA_CALL(cudaSetDevice(device));
+ if (device != -1) {
+ CUDA_CALL(cudaSetDevice(device));
+ current_device_ = device;
+ }
}
private:
int restore_device_;
+ int current_device_;
bool restore_;
};
diff --git a/src/engine/stream_manager.h b/src/engine/stream_manager.h
index d4ac042..516e04b 100644
--- a/src/engine/stream_manager.h
+++ b/src/engine/stream_manager.h
@@ -65,9 +65,6 @@ template <std::size_t kNumGpus, std::size_t kStreams>
RunContext StreamManager<kNumGpus, kStreams>::GetRunContext(
Context const& ctx) {
RunContext ret;
-#if MXNET_USE_CUDA
- mxnet::common::cuda::DeviceStore device_store;
-#endif
switch (ctx.dev_mask()) {
case cpu::kDevMask:
ret = RunContext{ctx, nullptr};
@@ -75,11 +72,11 @@ RunContext StreamManager<kNumGpus, kStreams>::GetRunContext(
case gpu::kDevMask: {
#if MXNET_USE_CUDA
std::size_t use_counter;
- device_store.SetDevice(ctx.dev_id);
{
std::lock_guard<std::mutex> lock{mutex_};
auto&& counter = gpu_cnt_.at(ctx.dev_id);
if (counter == -1) {
+ mxnet::common::cuda::DeviceStore device_store(ctx.dev_id);
for (auto&& i : gpu_streams_.at(ctx.dev_id)) {
i = mshadow::NewStream<gpu>(true, MXNET_USE_CUDNN != 0, ctx.dev_id);
}
@@ -104,19 +101,16 @@ template <std::size_t kNumGpus, std::size_t kStreams>
RunContext StreamManager<kNumGpus, kStreams>::GetIORunContext(
Context const& ctx) {
RunContext ret;
-#if MXNET_USE_CUDA
- mxnet::common::cuda::DeviceStore device_store;
-#endif
switch (ctx.dev_mask()) {
case cpu::kDevMask:
ret = RunContext{ctx, nullptr};
break;
case gpu::kDevMask: {
#if MXNET_USE_CUDA
- device_store.SetDevice(ctx.dev_id);
{
std::lock_guard<std::mutex> lock{mutex_};
if (gpu_io_streams_.at(ctx.dev_id) == nullptr) {
+ mxnet::common::cuda::DeviceStore device_store(ctx.dev_id);
gpu_io_streams_.at(ctx.dev_id) = mshadow::NewStream<gpu>(false, false, ctx.dev_id);
}
}
diff --git a/src/engine/threaded_engine_pooled.cc b/src/engine/threaded_engine_pooled.cc
index 1abb82f..c6eb995 100644
--- a/src/engine/threaded_engine_pooled.cc
+++ b/src/engine/threaded_engine_pooled.cc
@@ -31,6 +31,9 @@
#include "./threaded_engine.h"
#include "./thread_pool.h"
#include "./stream_manager.h"
+#if MXNET_USE_CUDA
+#include "../common/cuda_utils.h"
+#endif
namespace mxnet {
namespace engine {
@@ -130,10 +133,13 @@ class ThreadedEnginePooled : public ThreadedEngine {
* \param opr_block The operator block.
*/
void DoExecute(OprBlock* opr_block) {
+#if MXNET_USE_CUDA
+ mxnet::common::cuda::DeviceStore device_store(-1, false);
+#endif
assert(opr_block->wait.load() == 0);
if (opr_block->ctx.dev_mask() == gpu::kDevMask) {
#if MXNET_USE_CUDA
- CUDA_CALL(cudaSetDevice(opr_block->ctx.dev_id));
+ device_store.SetDevice(opr_block->ctx.dev_id);
#else // MXNET_USE_CUDA
LOG(FATAL) << "Please compile with CUDA enabled";
#endif // MXNET_USE_CUDA
diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h
index 7090aaf..08f6155 100644
--- a/src/kvstore/comm.h
+++ b/src/kvstore/comm.h
@@ -724,10 +724,9 @@ class CommDevice : public Comm {
int enabled = 0;
std::vector<int> p2p(n*n);
- // Restores active device to what it was before EnableP2P
- mxnet::common::cuda::DeviceStore device_store;
for (int i = 0; i < n; ++i) {
- device_store.SetDevice(gpus[i]);
+ // Restores active device to what it was before EnableP2P
+ mxnet::common::cuda::DeviceStore device_store(gpus[i]);
for (int j = 0; j < n; j++) {
int access;
cudaDeviceCanAccessPeer(&access, gpus[i], gpus[j]);
diff --git a/src/kvstore/comm_tree.h b/src/kvstore/comm_tree.h
index e3b2ad7..b62228c 100644
--- a/src/kvstore/comm_tree.h
+++ b/src/kvstore/comm_tree.h
@@ -339,9 +339,8 @@ class CommDeviceTree : public CommDevice {
int n = static_cast<int>(gpus.size());
int enabled = 0;
std::vector<int> p2p(n*n);
- mxnet::common::cuda::DeviceStore device_store;
for (int i = 0; i < n; ++i) {
- device_store.SetDevice(gpus[i]);
+ mxnet::common::cuda::DeviceStore device_store(gpus[i]);
for (int j = 0; j < n; j++) {
int access;
cudaDeviceCanAccessPeer(&access, gpus[i], gpus[j]);
diff --git a/src/storage/cpu_device_storage.h b/src/storage/cpu_device_storage.h
index 43e98fe..25ad61e 100644
--- a/src/storage/cpu_device_storage.h
+++ b/src/storage/cpu_device_storage.h
@@ -43,12 +43,12 @@ class CPUDeviceStorage {
* \param size Size to allocate.
* \return Pointer to the storage.
*/
- inline static void* Alloc(size_t size);
+ inline static void* Alloc(Storage::Handle* handle);
/*!
* \brief Deallocation.
* \param ptr Pointer to deallocate.
*/
- inline static void Free(void* ptr);
+ inline static void Free(Storage::Handle handle);
private:
/*!
@@ -63,7 +63,8 @@ class CPUDeviceStorage {
#endif
}; // class CPUDeviceStorage
-inline void* CPUDeviceStorage::Alloc(size_t size) {
+inline void* CPUDeviceStorage::Alloc(Storage::Handle* handle) {
+ const size_t size = handle->size;
void* ptr;
#if _MSC_VER
ptr = _aligned_malloc(size, alignment_);
@@ -75,7 +76,8 @@ inline void* CPUDeviceStorage::Alloc(size_t size) {
return ptr;
}
-inline void CPUDeviceStorage::Free(void* ptr) {
+inline void CPUDeviceStorage::Free(Storage::Handle handle) {
+ void * ptr = handle.dptr;
#if _MSC_VER
_aligned_free(ptr);
#else
diff --git a/src/storage/gpu_device_storage.h b/src/storage/gpu_device_storage.h
index 435c7e8..562badb 100644
--- a/src/storage/gpu_device_storage.h
+++ b/src/storage/gpu_device_storage.h
@@ -46,17 +46,19 @@ class GPUDeviceStorage {
* \param size Size to allocate.
* \return Pointer to the storage.
*/
- inline static void* Alloc(size_t size);
+ inline static void* Alloc(Storage::Handle* handle);
/*!
* \brief Deallocation.
* \param ptr Pointer to deallocate.
*/
- inline static void Free(void* ptr);
+ inline static void Free(Storage::Handle handle);
}; // class GPUDeviceStorage
-inline void* GPUDeviceStorage::Alloc(size_t size) {
+inline void* GPUDeviceStorage::Alloc(Storage::Handle* handle) {
+ const size_t size = handle->size;
void* ret = nullptr;
#if MXNET_USE_CUDA
+ mxnet::common::cuda::DeviceStore device_store(handle->ctx.real_dev_id(), true);
#if MXNET_USE_NCCL
std::lock_guard<std::mutex> l(Storage::Get()->GetMutex(Context::kGPU));
#endif // MXNET_USE_NCCL
@@ -69,8 +71,10 @@ inline void* GPUDeviceStorage::Alloc(size_t size) {
return ret;
}
-inline void GPUDeviceStorage::Free(void* ptr) {
+inline void GPUDeviceStorage::Free(Storage::Handle handle) {
#if MXNET_USE_CUDA
+ void * ptr = handle.dptr;
+ mxnet::common::cuda::DeviceStore device_store(handle.ctx.real_dev_id(), true);
#if MXNET_USE_NCCL
std::lock_guard<std::mutex> l(Storage::Get()->GetMutex(Context::kGPU));
#endif // MXNET_USE_NCCL
diff --git a/src/storage/naive_storage_manager.h b/src/storage/naive_storage_manager.h
index b05b242..55112b5 100644
--- a/src/storage/naive_storage_manager.h
+++ b/src/storage/naive_storage_manager.h
@@ -49,7 +49,7 @@ class NaiveStorageManager final : public StorageManager {
void Free(Storage::Handle handle) override;
void DirectFree(Storage::Handle handle) override {
- DeviceStorage::Free(handle.dptr);
+ DeviceStorage::Free(handle);
}
private:
@@ -58,12 +58,12 @@ class NaiveStorageManager final : public StorageManager {
template <class DeviceStorage>
void NaiveStorageManager<DeviceStorage>::Alloc(Storage::Handle* handle) {
- handle->dptr = DeviceStorage::Alloc(handle->size);
+ handle->dptr = DeviceStorage::Alloc(handle);
}
template <class DeviceStorage>
void NaiveStorageManager<DeviceStorage>::Free(Storage::Handle handle) {
- DeviceStorage::Free(handle.dptr);
+ DeviceStorage::Free(handle);
}
} // namespace storage
diff --git a/src/storage/pinned_memory_storage.h b/src/storage/pinned_memory_storage.h
index e3fec2f..c4ababb 100644
--- a/src/storage/pinned_memory_storage.h
+++ b/src/storage/pinned_memory_storage.h
@@ -41,29 +41,33 @@ class PinnedMemoryStorage {
* \param size Size to allocate.
* \return Pointer to the storage.
*/
- inline static void* Alloc(size_t size);
+ inline static void* Alloc(Storage::Handle* handle);
/*!
* \brief Deallocation.
* \param ptr Pointer to deallocate.
*/
- inline static void Free(void* ptr);
+ inline static void Free(Storage::Handle handle);
};
-inline void* PinnedMemoryStorage::Alloc(size_t size) {
+inline void* PinnedMemoryStorage::Alloc(Storage::Handle* handle) {
void* ret = nullptr;
+ const size_t size = handle->size;
#if MXNET_USE_NCCL
std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU));
#endif
+ mxnet::common::cuda::DeviceStore device_store(handle->ctx.real_dev_id(), true);
// make the memory available across all devices
CUDA_CALL(cudaHostAlloc(&ret, size, cudaHostAllocPortable));
return ret;
}
-inline void PinnedMemoryStorage::Free(void* ptr) {
+inline void PinnedMemoryStorage::Free(Storage::Handle handle) {
+ void * ptr = handle.dptr;
#if MXNET_USE_NCCL
std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU));
#endif
+ mxnet::common::cuda::DeviceStore device_store(handle.ctx.real_dev_id(), true);
cudaError_t err = cudaFreeHost(ptr);
// ignore unloading error, as memory has already been recycled
if (err != cudaSuccess && err != cudaErrorCudartUnloading) {
diff --git a/src/storage/pooled_storage_manager.h b/src/storage/pooled_storage_manager.h
index cade8d9..c407a9f 100644
--- a/src/storage/pooled_storage_manager.h
+++ b/src/storage/pooled_storage_manager.h
@@ -84,6 +84,7 @@ class GPUPooledStorageManager final : public StorageManager {
private:
void DirectFreeNoLock(Storage::Handle handle) {
+ mxnet::common::cuda::DeviceStore device_store(handle.ctx.real_dev_id(), true);
cudaError_t err = cudaFree(handle.dptr);
size_t size = RoundAllocSize(handle.size);
// ignore unloading error, as memory has already been recycled
@@ -132,6 +133,7 @@ void GPUPooledStorageManager::Alloc(Storage::Handle* handle) {
size_t size = RoundAllocSize(handle->size);
auto&& reuse_it = memory_pool_.find(size);
if (reuse_it == memory_pool_.end() || reuse_it->second.size() == 0) {
+ mxnet::common::cuda::DeviceStore device_store(handle->ctx.real_dev_id(), true);
size_t free, total;
cudaMemGetInfo(&free, &total);
if (free <= total * reserve_ / 100 || size > free - total * reserve_ / 100)
@@ -252,6 +254,7 @@ class GPUPooledRoundedStorageManager final : public StorageManager {
}
void DirectFreeNoLock(Storage::Handle handle) {
+ mxnet::common::cuda::DeviceStore device_store(handle.ctx.real_dev_id(), true);
cudaError_t err = cudaFree(handle.dptr);
size_t size = get_size(get_bucket(handle.size));
// ignore unloading error, as memory has already been recycled
@@ -288,6 +291,7 @@ void GPUPooledRoundedStorageManager::Alloc(Storage::Handle* handle) {
size_t size = get_size(bucket);
auto&& reuse_pool = memory_pool_[bucket];
if (reuse_pool.size() == 0) {
+ mxnet::common::cuda::DeviceStore device_store(handle->ctx.real_dev_id(), true);
size_t free, total;
cudaMemGetInfo(&free, &total);
if (free <= total * reserve_ / 100 || size > free - total * reserve_ / 100)
diff --git a/src/storage/storage.cc b/src/storage/storage.cc
index c7100a4..911d30c 100644
--- a/src/storage/storage.cc
+++ b/src/storage/storage.cc
@@ -48,35 +48,6 @@ class StorageImpl : public Storage {
static int num_gpu_device;
#endif // MXNET_USE_CUDA
- static void ActivateDevice(Context ctx) {
- switch (ctx.dev_type) {
- case Context::kCPU:
- break;
- case Context::kCPUPinned:
-#if MXNET_USE_CUDA
- if (num_gpu_device > 0) {
- CUDA_CALL(cudaSetDevice(ctx.real_dev_id()));
- }
-#endif // MXNET_USE_CUDA
- break;
- case Context::kCPUShared: {
-#if defined(ANDROID) || defined(__ANDROID__)
- LOG(FATAL) << "Unimplemented device";
-#endif // defined(ANDROID) || defined(__ANDROID__)
- }
- break;
- case Context::kGPU: {
-#if MXNET_USE_CUDA
- if (num_gpu_device > 0) {
- CUDA_CALL(cudaSetDevice(ctx.real_dev_id()));
- }
-#endif // MXNET_USE_CUDA
- break;
- }
- default:
- LOG(FATAL) << "Unimplemented device";
- }
- }
// internal storage managers
std::array<common::LazyAllocArray<storage::StorageManager>,
kMaxNumberOfDevices> storage_managers_;
@@ -100,6 +71,8 @@ void StorageImpl::Alloc(Storage::Handle* handle) {
case Context::kCPUShared: {
#if !defined(ANDROID) && !defined(__ANDROID__)
ptr = new storage::CPUSharedStorageManager();
+#else
+ LOG(FATAL) << "Unimplemented device";
#endif // !defined(ANDROID) && !defined(__ANDROID__)
break;
}
@@ -149,13 +122,6 @@ void StorageImpl::Alloc(Storage::Handle* handle) {
return ptr;
});
-#if MXNET_USE_CUDA
- // Will restore gpu device to before ActivateDevice if necessary
- bool restore = handle->ctx.dev_type == Context::kCPUPinned ||
- handle->ctx.dev_type == Context::kGPU;
- mxnet::common::cuda::DeviceStore device_store(restore);
-#endif
- this->ActivateDevice(handle->ctx);
manager->Alloc(handle);
profiler_.OnAlloc(*handle);
}
@@ -169,12 +135,6 @@ void StorageImpl::Free(Storage::Handle handle) {
return nullptr;
});
-#if MXNET_USE_CUDA
- // Will restore gpu device to before ActivateDevice if necessary
- bool restore = ctx.dev_type == Context::kCPUPinned || ctx.dev_type == Context::kGPU;
- mxnet::common::cuda::DeviceStore device_store(restore);
-#endif
- this->ActivateDevice(ctx);
manager->Free(handle);
profiler_.OnFree(handle);
}
@@ -188,12 +148,6 @@ void StorageImpl::DirectFree(Storage::Handle handle) {
return nullptr;
});
-#if MXNET_USE_CUDA
- // Will restore gpu device to before ActivateDevice if necessary
- bool restore = ctx.dev_type == Context::kCPUPinned || ctx.dev_type == Context::kGPU;
- mxnet::common::cuda::DeviceStore device_store(restore);
-#endif
- this->ActivateDevice(ctx);
manager->DirectFree(handle);
profiler_.OnFree(handle);
}