Posted to commits@mxnet.apache.org by jx...@apache.org on 2017/12/01 18:47:32 UTC
[incubator-mxnet] branch master updated: fix race when temp space is used in copy & fix instance overwrite in g2c (#8867)
This is an automated email from the ASF dual-hosted git repository.
jxie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new d7da05b fix race when temp space is used in copy & fix instance overwrite in g2c (#8867)
d7da05b is described below
commit d7da05b61adc9e4aba3e9995809b0d06965ae3bb
Author: Ziyue Huang <zy...@gmail.com>
AuthorDate: Sat Dec 2 02:47:22 2017 +0800
fix race when temp space is used in copy & fix instance overwrite in g2c (#8867)
* fix race when temp space is used in copy
* fix instance overwrite in g2c
* example of g2c
* address comments
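
The "instance overwrite in g2c" is the group2ctxs handling fixed in executor_group.py below: building the per-device list with [{}] * ctx_len fills every slot with a reference to the same dict, so contexts assigned for one device silently show up on all of them. A minimal, self-contained sketch of that Python aliasing behavior (plain Python, not MXNet API):

    # [{}] * n repeats one dict object n times, it does not create n dicts
    shared = [{}] * 3
    shared[0]['dev1'] = 'ctx-for-device-0'
    print(shared)    # all three slots now show {'dev1': 'ctx-for-device-0'}

    # a comprehension builds an independent dict per device, as in the fix
    separate = [{} for _ in range(3)]
    separate[0]['dev1'] = 'ctx-for-device-0'
    print(separate)  # only the first slot is populated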
---
.../{matrix_fact_parallel_model.py => model.py} | 0
...ix_factorization_model_parallel.py => train.py} | 2 +-
python/mxnet/module/executor_group.py | 2 +-
src/ndarray/ndarray.cc | 59 +++++++++++++---------
tests/python/gpu/test_operator_gpu.py | 2 +-
tests/python/unittest/test_module.py | 11 ++--
6 files changed, 45 insertions(+), 31 deletions(-)
diff --git a/example/model-parallel/matrix_factorization/matrix_fact_parallel_model.py b/example/model-parallel/matrix_factorization/model.py
similarity index 100%
rename from example/model-parallel/matrix_factorization/matrix_fact_parallel_model.py
rename to example/model-parallel/matrix_factorization/model.py
diff --git a/example/model-parallel/matrix_factorization/matrix_factorization_model_parallel.py b/example/model-parallel/matrix_factorization/train.py
similarity index 97%
rename from example/model-parallel/matrix_factorization/matrix_factorization_model_parallel.py
rename to example/model-parallel/matrix_factorization/train.py
index ab607f1..7a2073b 100644
--- a/example/model-parallel/matrix_factorization/matrix_factorization_model_parallel.py
+++ b/example/model-parallel/matrix_factorization/train.py
@@ -76,7 +76,7 @@ if __name__ == '__main__':
# construct the module
# map the ctx_group attribute to the context assignment
- group2ctxs={'dev1':mx.cpu(), 'dev2':[mx.gpu(i) for i in range(num_gpus)]}
+ group2ctxs={'dev1':[mx.cpu()]*num_gpus, 'dev2':[mx.gpu(i) for i in range(num_gpus)]}
mod = mx.module.Module(symbol=net, context=[mx.cpu()]*num_gpus, data_names=['user', 'item'],
label_names=['score'], group2ctxs=group2ctxs)
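
With this change the example follows the group2ctxs convention enforced below: when the module's context argument is a list of N devices, each group in group2ctxs supplies one context (or list of contexts) per device. A toy-sized sketch of that usage, modeled on the updated train.py and on test_module_ctx_group further down (the symbol and device ids here are illustrative only, not part of the commit):

    import mxnet as mx

    # toy symbol with two ctx groups
    with mx.AttrScope(ctx_group='dev1'):
        a = mx.symbol.Variable('a') * 2
    with mx.AttrScope(ctx_group='dev2'):
        b = mx.symbol.Variable('b')
    net = a + b

    num_devices = 2
    # one entry per device in the context list, for each group
    group2ctxs = {'dev1': [mx.cpu(1)] * num_devices,
                  'dev2': [mx.cpu(2)] * num_devices}
    mod = mx.module.Module(symbol=net, context=[mx.cpu(0)] * num_devices,
                           data_names=['a', 'b'], label_names=None,
                           group2ctxs=group2ctxs)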
diff --git a/python/mxnet/module/executor_group.py b/python/mxnet/module/executor_group.py
index 2d680d4..6d24919 100755
--- a/python/mxnet/module/executor_group.py
+++ b/python/mxnet/module/executor_group.py
@@ -106,7 +106,7 @@ def _prepare_group2ctxs(group2ctxs, ctx_len):
should be %d" % ctx_len
return group2ctxs
elif isinstance(group2ctxs, dict):
- ret = [{}] * ctx_len
+ ret = [{} for i in range(ctx_len)]
for k, v in group2ctxs.items():
ctxs = None
if isinstance(v, ctx.Context):
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index f06196d..4a1963a 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -454,25 +454,22 @@ inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext
// Make a copy of an NDArray based on storage type
template<typename from_xpu, typename to_xpu>
-void CopyFromToImpl(const NDArray& from, const NDArray& to, RunContext rctx) {
+void CopyFromToImpl(const NDArray& from, const NDArray& to,
+ RunContext rctx, const std::vector<Resource>& requested) {
using namespace std;
using namespace mshadow;
// if storage type doesn't match, cast the storage first
- auto from_stype = from.storage_type();
- auto to_stype = to.storage_type();
+ const NDArrayStorageType from_stype = from.storage_type();
+ const NDArrayStorageType to_stype = to.storage_type();
CHECK(from_stype == kDefaultStorage
|| to_stype == kDefaultStorage
|| from_stype == to_stype)
<< "Copying ndarray of stype = " << from_stype
<< " to stype = " << to_stype << " is not supported";
- const auto from_ctx = from.ctx();
- const auto to_ctx = to.ctx();
+ const Context from_ctx = from.ctx();
+ const Context to_ctx = to.ctx();
bool is_train = Imperative::Get()->is_training();
- std::vector<Resource> requested;
- if (is_same<from_xpu, mshadow::gpu>::value && from_stype != to_stype) {
- requested.push_back(ResourceManager::Get()->Request(from_ctx,
- ResourceRequest(ResourceRequest::kTempSpace)));
- }
+
OpContext opctx{is_train,
rctx,
engine::CallbackOnComplete(),
@@ -518,43 +515,57 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority) {
CHECK(from.shape().ndim() != 0)
<< "source operands have zero dimension shape";
// important: callback must always capture by value
- int a = from.ctx().dev_mask();
- int b = to.ctx().dev_mask();
+ const Context from_ctx = from.ctx();
+ const int a = from_ctx.dev_mask();
+ const int b = to.ctx().dev_mask();
std::vector<Engine::VarHandle> const_vars;
if (from.var() != to.var()) const_vars.push_back(from.var());
+ const NDArrayStorageType from_stype = from.storage_type();
+ const NDArrayStorageType to_stype = to.storage_type();
+
+ std::vector<Engine::VarHandle> mutable_vars(1, to.var());
+
+ std::vector<Resource> requested;
+ if (a == gpu::kDevMask && from_stype != to_stype) {
+ Resource rsc = ResourceManager::Get()->Request(from_ctx,
+ ResourceRequest(ResourceRequest::kTempSpace));
+ requested.push_back(rsc);
+ mutable_vars.push_back(rsc.var);
+ }
+
if (a == cpu::kDevMask && b == cpu::kDevMask) {
Engine::Get()->PushAsync(
- [from, to](RunContext ctx, Engine::CallbackOnComplete on_complete) {
- CopyFromToImpl<cpu, cpu>(from, to, ctx);
+ [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) {
+ CopyFromToImpl<cpu, cpu>(from, to, ctx, requested);
on_complete();
- }, from.ctx(), const_vars, {to.var()},
+ }, from.ctx(), const_vars, mutable_vars,
FnProperty::kNormal, priority, PROFILER_MESSAGE("CopyCPU2CPU"));
} else {
#if MXNET_USE_CUDA
if (a == cpu::kDevMask && b == gpu::kDevMask) {
Engine::Get()->PushAsync(
- [from, to](RunContext ctx, Engine::CallbackOnComplete on_complete) {
- CopyFromToImpl<cpu, gpu>(from, to, ctx);
+ [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) {
+ CopyFromToImpl<cpu, gpu>(from, to, ctx, requested);
ctx.get_stream<gpu>()->Wait();
on_complete();
- }, to.ctx(), const_vars, {to.var()},
+ }, to.ctx(), const_vars, mutable_vars,
FnProperty::kCopyToGPU, priority, PROFILER_MESSAGE("CopyCPU2GPU"));
} else if (a == gpu::kDevMask && b == cpu::kDevMask) {
Engine::Get()->PushAsync(
- [from, to](RunContext ctx, Engine::CallbackOnComplete on_complete) {
- CopyFromToImpl<gpu, cpu>(from, to, ctx);
+ [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) {
+ CopyFromToImpl<gpu, cpu>(from, to, ctx, requested);
ctx.get_stream<gpu>()->Wait();
on_complete();
- }, from.ctx(), const_vars, {to.var()},
+ }, from.ctx(), const_vars, mutable_vars,
FnProperty::kCopyFromGPU, priority, PROFILER_MESSAGE("CopyGPU2CPU"));
} else if (a == gpu::kDevMask && b == gpu::kDevMask) {
Engine::Get()->PushAsync(
- [from, to](RunContext ctx, Engine::CallbackOnComplete on_complete) {
- CopyFromToImpl<gpu, gpu>(from, to, ctx);
+ [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) {
+ CopyFromToImpl<gpu, gpu>(from, to, ctx, requested);
ctx.get_stream<gpu>()->Wait();
on_complete();
- }, from.ctx(), const_vars, {to.var()},
+ }, from.ctx(), const_vars, mutable_vars,
from.dtype() != to.dtype() ? FnProperty::kNormal : FnProperty::kCopyFromGPU,
priority, PROFILER_MESSAGE("CopyGPU2GPU"));
} else {
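
The race fixed above: when the source of a copy is on GPU and the storage types differ, a kTempSpace resource is needed for the storage cast. It used to be requested inside CopyFromToImpl, after the operation had already been scheduled, so the engine had no variable to serialize on and two such copies on the same device could clobber the shared temp space. The change requests the resource up front in CopyFromTo and adds rsc.var to mutable_vars, letting the engine order the copies. A hedged Python sketch of the kind of workload that exercises this path (shapes and contexts are illustrative only; the commit's actual regression coverage is running test_sparse_nd_copy on GPU, see below):

    import mxnet as mx

    # two row_sparse sources on the same GPU, dense destinations on CPU;
    # each copy needs a storage cast, which uses temp space on the source GPU
    src1 = mx.nd.zeros((1000, 1000), ctx=mx.gpu(0), stype='row_sparse')
    src2 = mx.nd.zeros((1000, 1000), ctx=mx.gpu(0), stype='row_sparse')
    dst1 = mx.nd.zeros((1000, 1000), ctx=mx.cpu(0))
    dst2 = mx.nd.zeros((1000, 1000), ctx=mx.cpu(0))

    # both copies are queued asynchronously; before this fix the temp-space
    # variable was not tracked, so the engine could overlap them on that buffer
    src1.copyto(dst1)
    src2.copyto(dst2)
    mx.nd.waitall()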
diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index fad5940..cecda21 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -37,7 +37,7 @@ from test_gluon_rnn import *
from test_sparse_ndarray import test_create_csr, test_create_row_sparse, test_sparse_nd_slice
from test_sparse_ndarray import test_create_sparse_nd_empty, test_create_sparse_nd_from_sparse
from test_sparse_ndarray import test_create_sparse_nd_from_dense, test_create_sparse_nd_infer_shape
-from test_sparse_ndarray import test_sparse_nd_check_format
+from test_sparse_ndarray import test_sparse_nd_check_format, test_sparse_nd_copy
from test_sparse_operator import *
from test_ndarray import *
diff --git a/tests/python/unittest/test_module.py b/tests/python/unittest/test_module.py
index ed5fe26..08302b8 100644
--- a/tests/python/unittest/test_module.py
+++ b/tests/python/unittest/test_module.py
@@ -71,7 +71,7 @@ def test_module_input_grads():
def test_module_ctx_group():
- def check_module_ctx_group(ctxs, group2ctxs):
+ def check_module_ctx_group(ctxs, group2ctxs, grad_ctxs=None):
with mx.AttrScope(ctx_group='dev1'):
a = mx.symbol.Variable('a')
a = a * 2
@@ -94,10 +94,13 @@ def test_module_ctx_group():
mod2.backward([mx.nd.ones(shape)])
mod2_input_grads = mod2.get_input_grads()
- assert np.all(mod1_input_grads[0].asnumpy() == mod2_input_grads[0].asnumpy())
- assert np.all(mod1_input_grads[1].asnumpy() == mod2_input_grads[1].asnumpy())
+ if grad_ctxs is not None:
+ assert(mod1_input_grads[0].context == grad_ctxs[0])
+ assert(mod1_input_grads[1].context == grad_ctxs[1])
+ assert(np.all(mod1_input_grads[0].asnumpy() == mod2_input_grads[0].asnumpy()))
+ assert(np.all(mod1_input_grads[1].asnumpy() == mod2_input_grads[1].asnumpy()))
- check_module_ctx_group([mx.cpu(0)], {'dev1': mx.cpu(1), 'dev2': mx.cpu(2)})
+ check_module_ctx_group([mx.cpu(0)], {'dev1': mx.cpu(1), 'dev2': mx.cpu(2)}, grad_ctxs=[mx.cpu(1), mx.cpu(2)])
check_module_ctx_group([mx.cpu(0), mx.cpu(1)],
[{'dev1': mx.cpu(2), 'dev2': mx.cpu(3)}, {'dev1': mx.cpu(4), 'dev2': mx.cpu(5)}])
check_module_ctx_group([mx.cpu(0), mx.cpu(1)], {'dev1': mx.cpu(2), 'dev2': mx.cpu(3)})
--
To stop receiving notification emails like this one, please contact "commits@mxnet.apache.org" <co...@mxnet.apache.org>.