You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by zh...@apache.org on 2017/11/23 23:40:28 UTC
[incubator-mxnet] branch v1.0.0 updated: Fix weird hang bug due to
cuInit sometimes calls fork (#8790)
This is an automated email from the ASF dual-hosted git repository.
zhasheng pushed a commit to branch v1.0.0
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/v1.0.0 by this push:
new 7b40c03 Fix weird hang bug due to cuInit sometimes calls fork (#8790)
7b40c03 is described below
commit 7b40c031c0147310a5adb0646052508dd4534647
Author: Eric Junyuan Xie <pi...@users.noreply.github.com>
AuthorDate: Thu Nov 23 12:53:55 2017 -0800
Fix weird hang bug due to cuInit sometimes calls fork (#8790)
---
src/engine/threaded_engine_perdevice.cc | 19 ++++++++++++++++---
1 file changed, 16 insertions(+), 3 deletions(-)
diff --git a/src/engine/threaded_engine_perdevice.cc b/src/engine/threaded_engine_perdevice.cc
index c01de75..28bc92f 100644
--- a/src/engine/threaded_engine_perdevice.cc
+++ b/src/engine/threaded_engine_perdevice.cc
@@ -55,7 +55,6 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
#ifndef _WIN32
pthread_atfork(
[]() {
- Engine::Get()->WaitForAll();
Engine::Get()->Stop();
},
[]() {
@@ -71,10 +70,10 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
#endif
}
~ThreadedEnginePerDevice() noexcept(false) {
- this->Stop();
+ this->StopNoWait();
}
- void Stop() override {
+ void StopNoWait() {
SignalQueuesForKill();
gpu_normal_workers_.Clear();
gpu_copy_workers_.Clear();
@@ -82,7 +81,14 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
cpu_priority_worker_.reset(nullptr);
}
+ void Stop() override {
+ if (is_worker_) return;
+ WaitForAll();
+ StopNoWait();
+ }
+
void Start() override {
+ if (is_worker_) return;
gpu_worker_nthreads_ = common::GetNumThreadPerGPU();
cpu_worker_nthreads_ = dmlc::GetEnv("MXNET_CPU_WORKER_NTHREADS", 1);
// create CPU task
@@ -196,6 +202,8 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
~ThreadWorkerBlock() noexcept(false) {}
};
+ /*! \brief whether this is a worker thread. */
+ static MX_THREAD_LOCAL bool is_worker_;
/*! \brief number of concurrent thread cpu worker uses */
int cpu_worker_nthreads_;
/*! \brief number of concurrent thread each gpu worker uses */
@@ -219,6 +227,7 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
bool is_copy_worker,
ThreadWorkerBlock<type> *block,
std::shared_ptr<ThreadPool::SimpleEvent> ready_event) {
+ this->is_worker_ = true;
#if MXNET_USE_CUDA
mshadow::Stream<gpu> *stream;
do {
@@ -251,6 +260,7 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
template<dmlc::ConcurrentQueueType type>
inline void CPUWorker(Context ctx,
ThreadWorkerBlock<type> *block) {
+ this->is_worker_ = true;
auto* task_queue = &(block->task_queue);
RunContext run_ctx{ctx, nullptr};
// execute task
@@ -303,5 +313,8 @@ class ThreadedEnginePerDevice : public ThreadedEngine {
Engine *CreateThreadedEnginePerDevice() {
return new ThreadedEnginePerDevice();
}
+
+MX_THREAD_LOCAL bool ThreadedEnginePerDevice::is_worker_ = false;
+
} // namespace engine
} // namespace mxnet
--
To stop receiving notification emails like this one, please contact
['"commits@mxnet.apache.org" <co...@mxnet.apache.org>'].