You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by pt...@apache.org on 2022/08/04 08:42:05 UTC
[incubator-mxnet] branch master updated: [BUGFIX] Reenable fwd conv engine 5 on test_group_conv2d_16c (#21104)
This is an automated email from the ASF dual-hosted git repository.
ptrendx pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new 9975ab41a6 [BUGFIX] Reenable fwd conv engine 5 on test_group_conv2d_16c (#21104)
9975ab41a6 is described below
commit 9975ab41a6ecbc8338213315405c780c6444cb99
Author: Dick Carter <dc...@nvidia.com>
AuthorDate: Thu Aug 4 01:41:47 2022 -0700
[BUGFIX] Reenable fwd conv engine 5 on test_group_conv2d_16c (#21104)
* Reenable fwd conv engine 5 on test_group_conv2d_16c
* Test conv plan fix devel
* Fix clang format
* More clang format fixes
* switch to using std::vector::data()
---
src/common/cuda/cudnn_cxx.cc | 2 +-
src/common/cuda/cudnn_cxx.h | 4 +-
src/operator/cudnn_ops.cc | 148 ++++++++++++++++++++++++++++++------
src/operator/cudnn_ops.h | 83 +++++++++++++++++---
tests/python/unittest/test_gluon.py | 1 -
5 files changed, 201 insertions(+), 37 deletions(-)
diff --git a/src/common/cuda/cudnn_cxx.cc b/src/common/cuda/cudnn_cxx.cc
index 2259c85dee..d2f36fd98d 100644
--- a/src/common/cuda/cudnn_cxx.cc
+++ b/src/common/cuda/cudnn_cxx.cc
@@ -63,7 +63,7 @@ void SetAttr(const Descriptor& desc,
std::vector<cudnnBackendDescriptor_t> raw(val.size());
std::transform(val.begin(), val.end(), raw.begin(), [](const Descriptor& d) { return d.get(); });
CUDNN_CALL(cudnnBackendSetAttribute(
- desc.get(), name, CUDNN_TYPE_BACKEND_DESCRIPTOR, raw.size(), &raw[0]));
+ desc.get(), name, CUDNN_TYPE_BACKEND_DESCRIPTOR, raw.size(), raw.data()));
}
Descriptor GetAttr(const Descriptor& desc,
diff --git a/src/common/cuda/cudnn_cxx.h b/src/common/cuda/cudnn_cxx.h
index 07cd93d67a..5e5dfc90b7 100644
--- a/src/common/cuda/cudnn_cxx.h
+++ b/src/common/cuda/cudnn_cxx.h
@@ -162,14 +162,14 @@ void SetAttr(const Descriptor& desc, cudnnBackendAttributeName_t name, T val) {
template <typename T>
void SetAttr(const Descriptor& desc, cudnnBackendAttributeName_t name, const std::vector<T>& val) {
- CUDNN_CALL(cudnnBackendSetAttribute(desc.get(), name, AttrType<T>::type, val.size(), &val[0]));
+ CUDNN_CALL(cudnnBackendSetAttribute(desc.get(), name, AttrType<T>::type, val.size(), val.data()));
}
template <typename T, size_t N>
void SetAttr(const Descriptor& desc,
cudnnBackendAttributeName_t name,
const std::array<T, N>& val) {
- CUDNN_CALL(cudnnBackendSetAttribute(desc.get(), name, AttrType<T>::type, val.size(), &val[0]));
+ CUDNN_CALL(cudnnBackendSetAttribute(desc.get(), name, AttrType<T>::type, val.size(), val.data()));
}
inline void SetAttrs(const Descriptor& desc) {}
diff --git a/src/operator/cudnn_ops.cc b/src/operator/cudnn_ops.cc
index 2b99dc7fd8..aa002df016 100644
--- a/src/operator/cudnn_ops.cc
+++ b/src/operator/cudnn_ops.cc
@@ -241,6 +241,20 @@ Descriptor MakeConvFwdOp(const Descriptor& conv,
return ret;
}
+Descriptor Conv::MakeConvFwdOp(const OpContext& ctx,
+ const Param& param,
+ const TBlob& x,
+ const TBlob& w,
+ const TBlob& y) {
+ auto dtype = static_cast<mshadow::TypeFlag>(x.type_flag_);
+ auto conv = MakeConvDesc(param, dtype);
+ auto li = GetLayoutInfo(static_cast<mshadow::LayoutFlag>(param.layout.value()));
+ auto x_desc = MakeTensorDesc(ID_X, x, li, true, false);
+ auto w_desc = MakeTensorDesc(ID_W, w, li, true, false);
+ auto y_desc = MakeTensorDesc(ID_Y, y, li, true, false);
+ return cudnn::MakeConvFwdOp(conv, x_desc, w_desc, y_desc, param.add_to);
+}
+
Descriptor MakeConvDgradOp(const Descriptor& conv,
const Descriptor& w,
const Descriptor& dy,
@@ -272,6 +286,20 @@ Descriptor MakeConvDgradOp(const Descriptor& conv,
return ret;
}
+Descriptor ConvDgrad::MakeConvDgradOp(const OpContext& ctx,
+ const Param& param,
+ const TBlob& w,
+ const TBlob& dy,
+ const TBlob& dx) {
+ auto dtype = static_cast<mshadow::TypeFlag>(w.type_flag_);
+ auto conv = MakeConvDesc(param, dtype);
+ auto li = GetLayoutInfo(static_cast<mshadow::LayoutFlag>(param.layout.value()));
+ auto w_desc = MakeTensorDesc(ID_W, w, li, true, false);
+ auto dy_desc = MakeTensorDesc(ID_DY, dy, li, true, false);
+ auto dx_desc = MakeTensorDesc(ID_DX, dx, li, true, false);
+ return cudnn::MakeConvDgradOp(conv, w_desc, dy_desc, dx_desc, param.add_to);
+}
+
Descriptor MakeConvWgradOp(const Descriptor& conv,
const Descriptor& x,
const Descriptor& dy,
@@ -303,6 +331,20 @@ Descriptor MakeConvWgradOp(const Descriptor& conv,
return ret;
}
+Descriptor ConvWgrad::MakeConvWgradOp(const OpContext& ctx,
+ const Param& param,
+ const TBlob& x,
+ const TBlob& dy,
+ const TBlob& dw) {
+ auto dtype = static_cast<mshadow::TypeFlag>(x.type_flag_);
+ auto conv = MakeConvDesc(param, dtype);
+ auto li = GetLayoutInfo(static_cast<mshadow::LayoutFlag>(param.layout.value()));
+ auto x_desc = MakeTensorDesc(ID_X, x, li, true, false);
+ auto dy_desc = MakeTensorDesc(ID_DY, dy, li, true, false);
+ auto dw_desc = MakeTensorDesc(ID_DW, dw, li, true, false);
+ return cudnn::MakeConvWgradOp(conv, x_desc, dy_desc, dw_desc, param.add_to);
+}
+
Descriptor MakeOpGraph(cudnnHandle_t handle, const std::vector<Descriptor>& ops) {
return MakeFinalized(CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR,
CUDNN_ATTR_OPERATIONGRAPH_HANDLE,
@@ -311,6 +353,44 @@ Descriptor MakeOpGraph(cudnnHandle_t handle, const std::vector<Descriptor>& ops)
ops);
}
+Descriptor MakeOpGraph(cudnnHandle_t handle, Descriptor op) {
+ std::vector<Descriptor> ops;
+ ops.push_back(std::move(op));
+ return MakeOpGraph(handle, ops);
+}
+
+Descriptor ClonePlan(cudnnHandle_t handle, Descriptor op_graph, const Descriptor& plan) {
+ auto cfg =
+ GetAttr(plan, CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG, CUDNN_BACKEND_ENGINECFG_DESCRIPTOR);
+ auto engine = GetAttr(cfg, CUDNN_ATTR_ENGINECFG_ENGINE, CUDNN_BACKEND_ENGINE_DESCRIPTOR);
+ auto engine_idx = GetAttr<int64_t>(engine, CUDNN_ATTR_ENGINE_GLOBAL_INDEX);
+
+ auto choices = GetSomeAttrs(CUDNN_KNOB_TYPE_COUNTS,
+ cfg,
+ CUDNN_ATTR_ENGINECFG_KNOB_CHOICES,
+ CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR);
+
+ auto cloned_engine = MakeFinalized(CUDNN_BACKEND_ENGINE_DESCRIPTOR,
+ CUDNN_ATTR_ENGINE_GLOBAL_INDEX,
+ engine_idx,
+ CUDNN_ATTR_ENGINE_OPERATION_GRAPH,
+ op_graph);
+
+ auto cloned_cfg = MakeFinalized(CUDNN_BACKEND_ENGINECFG_DESCRIPTOR,
+ CUDNN_ATTR_ENGINECFG_ENGINE,
+ cloned_engine,
+ CUDNN_ATTR_ENGINECFG_KNOB_CHOICES,
+ choices);
+
+ auto cloned_plan = cudnn_cxx::Make(CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR,
+ CUDNN_ATTR_EXECUTION_PLAN_HANDLE,
+ handle,
+ CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG,
+ cloned_cfg);
+ CUDNN_CALL(cudnnBackendFinalize(cloned_plan.get()));
+ return cloned_plan;
+}
+
ConvParam::ConvParam(const ConvolutionParam& p, bool add_to)
: kernel(p.kernel),
stride(p.stride),
@@ -476,9 +556,7 @@ Descriptor SelectPlan(const OpContext& ctx,
int64_t out_size,
const std::string& excl_engines_var) {
auto s = ctx.get_stream<gpu>();
- std::vector<Descriptor> ops;
- ops.push_back(std::move(op));
- auto op_graph = MakeOpGraph(s->dnn_handle_, ops);
+ auto op_graph = MakeOpGraph(s->dnn_handle_, std::move(op));
int verbose = dmlc::GetEnv("MXNET_CUDNN_ALGO_VERBOSE_LEVEL", 0);
if (verbose > 0)
@@ -592,12 +670,7 @@ cudnn_cxx::Descriptor Conv::Make(const OpContext& ctx,
const TBlob& x,
const TBlob& w,
const TBlob& y) {
- auto conv = MakeConvDesc(param, static_cast<mshadow::TypeFlag>(x.type_flag_));
- auto li = GetLayoutInfo(static_cast<mshadow::LayoutFlag>(param.layout.value()));
- auto x_desc = MakeTensorDesc(ID_X, x, li, true, false);
- auto w_desc = MakeTensorDesc(ID_W, w, li, true, false);
- auto y_desc = MakeTensorDesc(ID_Y, y, li, true, false);
- auto conv_fwd = MakeConvFwdOp(conv, x_desc, w_desc, y_desc, param.add_to);
+ auto conv_fwd = MakeConvFwdOp(ctx, param, x, w, y);
auto make_op_str = [&param, &x]() {
std::ostringstream ss;
@@ -619,6 +692,19 @@ cudnn_cxx::Descriptor Conv::Make(const OpContext& ctx,
"MXNET_CUDNN_DISABLED_CONV_FWD_ENGINES");
}
+cudnn_cxx::Descriptor Conv::Clone(const cudnn_cxx::Descriptor& plan,
+ const OpContext& ctx,
+ const Param& param,
+ const TBlob& x,
+ const TBlob& w,
+ const TBlob& y) {
+ auto conv_fwd = MakeConvFwdOp(ctx, param, x, w, y);
+ auto handle = ctx.get_stream<gpu>()->dnn_handle_;
+ auto op_graph = MakeOpGraph(handle, std::move(conv_fwd));
+ auto cloned_plan = ClonePlan(handle, std::move(op_graph), plan);
+ return cloned_plan;
+}
+
void Conv::Exec(const cudnn_cxx::Descriptor& plan,
const OpContext& ctx,
const TBlob& x,
@@ -645,12 +731,7 @@ cudnn_cxx::Descriptor ConvDgrad::Make(const OpContext& ctx,
const TBlob& w,
const TBlob& dy,
const TBlob& dx) {
- auto conv = MakeConvDesc(param, static_cast<mshadow::TypeFlag>(w.type_flag_));
- auto li = GetLayoutInfo(static_cast<mshadow::LayoutFlag>(param.layout.value()));
- auto w_desc = MakeTensorDesc(ID_W, w, li, true, false);
- auto dy_desc = MakeTensorDesc(ID_DY, dy, li, true, false);
- auto dx_desc = MakeTensorDesc(ID_DX, dx, li, true, false);
- auto dgrad = MakeConvDgradOp(conv, w_desc, dy_desc, dx_desc, param.add_to);
+ auto conv_dgrad = MakeConvDgradOp(ctx, param, w, dy, dx);
auto make_op_str = [&param, &dx]() {
std::ostringstream ss;
@@ -663,7 +744,7 @@ cudnn_cxx::Descriptor ConvDgrad::Make(const OpContext& ctx,
return SelectPlan(ctx,
param,
- std::move(dgrad),
+ std::move(conv_dgrad),
kMaxDgradFallbacks,
make_op_str,
ids,
@@ -672,6 +753,19 @@ cudnn_cxx::Descriptor ConvDgrad::Make(const OpContext& ctx,
"MXNET_CUDNN_DISABLED_CONV_DGRAD_ENGINES");
}
+cudnn_cxx::Descriptor ConvDgrad::Clone(const cudnn_cxx::Descriptor& plan,
+ const OpContext& ctx,
+ const Param& param,
+ const TBlob& w,
+ const TBlob& dy,
+ const TBlob& dx) {
+ auto conv_dgrad = MakeConvDgradOp(ctx, param, w, dy, dx);
+ auto handle = ctx.get_stream<gpu>()->dnn_handle_;
+ auto op_graph = MakeOpGraph(handle, std::move(conv_dgrad));
+ auto cloned_plan = ClonePlan(handle, std::move(op_graph), plan);
+ return cloned_plan;
+}
+
void ConvDgrad::Exec(const cudnn_cxx::Descriptor& plan,
const OpContext& ctx,
const TBlob& w,
@@ -698,12 +792,7 @@ cudnn_cxx::Descriptor ConvWgrad::Make(const OpContext& ctx,
const TBlob& x,
const TBlob& dy,
const TBlob& dw) {
- auto conv = MakeConvDesc(param, static_cast<mshadow::TypeFlag>(x.type_flag_));
- auto li = GetLayoutInfo(static_cast<mshadow::LayoutFlag>(param.layout.value()));
- auto x_desc = MakeTensorDesc(ID_X, x, li, true, false);
- auto dy_desc = MakeTensorDesc(ID_DY, dy, li, true, false);
- auto dw_desc = MakeTensorDesc(ID_DW, dw, li, true, false);
- auto wgrad = MakeConvWgradOp(conv, x_desc, dy_desc, dw_desc, param.add_to);
+ auto conv_wgrad = MakeConvWgradOp(ctx, param, x, dy, dw);
auto make_op_str = [&param, &x]() {
std::ostringstream ss;
@@ -716,7 +805,7 @@ cudnn_cxx::Descriptor ConvWgrad::Make(const OpContext& ctx,
return SelectPlan(ctx,
param,
- std::move(wgrad),
+ std::move(conv_wgrad),
kMaxWgradFallbacks,
make_op_str,
ids,
@@ -725,6 +814,19 @@ cudnn_cxx::Descriptor ConvWgrad::Make(const OpContext& ctx,
"MXNET_CUDNN_DISABLED_CONV_WGRAD_ENGINES");
}
+cudnn_cxx::Descriptor ConvWgrad::Clone(const cudnn_cxx::Descriptor& plan,
+ const OpContext& ctx,
+ const Param& param,
+ const TBlob& x,
+ const TBlob& dy,
+ const TBlob& dw) {
+ auto conv_wgrad = MakeConvWgradOp(ctx, param, x, dy, dw);
+ auto handle = ctx.get_stream<gpu>()->dnn_handle_;
+ auto op_graph = MakeOpGraph(handle, std::move(conv_wgrad));
+ auto cloned_plan = ClonePlan(handle, std::move(op_graph), plan);
+ return cloned_plan;
+}
+
void ConvWgrad::Exec(const cudnn_cxx::Descriptor& plan,
const OpContext& ctx,
const TBlob& x,
diff --git a/src/operator/cudnn_ops.h b/src/operator/cudnn_ops.h
index 5f24a7e8c5..f5c5861ac7 100644
--- a/src/operator/cudnn_ops.h
+++ b/src/operator/cudnn_ops.h
@@ -112,6 +112,13 @@ struct LayoutInfo {
LayoutInfo GetLayoutInfo(mshadow::LayoutFlag layout);
+cudnn_cxx::Descriptor MakeOpGraph(cudnnHandle_t handle, cudnn_cxx::Descriptor op);
+
+// Make a copy of an existing execution plan with a new cuDNN handle. Op graph re-supplied.
+cudnn_cxx::Descriptor ClonePlan(cudnnHandle_t handle,
+ cudnn_cxx::Descriptor op_graph,
+ const cudnn_cxx::Descriptor& plan);
+
TShape ExpandChannelDims(mshadow::LayoutFlag layout, int c);
void MaybeLogSelectedPlan(const cudnn_cxx::Descriptor& plan);
@@ -121,25 +128,39 @@ void MaybeLogSelectedPlan(const cudnn_cxx::Descriptor& plan);
// Op::Param - a type, collecting all data, required to create cuDNN descriptor(s), but not needed
// for execution.
// Op::MakeKey() - a static function, which maps its arguments to a tuple - a key in the op cache.
-// Op::Make() - a static function, creating the necessary cuDNN descriptor.
-// Op::Exec() - a static function, calling cudnnBackendExecute() with the prepared descriptor and
+// Op::Make() - a static function, creating all necessary cuDNN descriptors.
+// Op::Clone() - a static function, creating a copy of the op's descriptors with a new cudNN handle.
+// Op::Exec() - a static function, calling cudnnBackendExecute() with the prepared descriptor(s) and
// the passed arguments.
template <typename Op, typename... Args>
bool Exec(const OpContext& ctx, const typename Op::Param& param, Args&&... args) {
auto key = std::tuple_cat(std::make_tuple(ctx.run_ctx.ctx.dev_id),
Op::MakeKey(param, std::forward<Args>(args)...));
- static std::unordered_map<decltype(key), cudnn_cxx::Descriptor> op_map;
static std::mutex mx;
std::unique_lock<std::mutex> lk(mx);
- auto it = op_map.find(key);
- if (it == op_map.end()) {
- auto op = Op::Make(ctx, param, std::forward<Args>(args)...);
- it = op_map.emplace(key, std::move(op)).first;
- }
+ static std::unordered_multimap<decltype(key), const cudnn_cxx::Descriptor> op_map;
+ auto match_it = [&]() {
+ // Some cuDNN Op implementations require that the thread's cuDNN handle
+ // (used in cudnnBackendExecute()) matches the one used in making the plan.
+ const bool ignore_handles = false;
+ auto range = op_map.equal_range(key);
+ auto handle = ctx.get_stream<gpu>()->dnn_handle_;
+ for (auto it = range.first; it != range.second; ++it) {
+ if (ignore_handles || handle == cudnn_cxx::GetAttr<cudnnHandle_t>(
+ it->second, CUDNN_ATTR_EXECUTION_PLAN_HANDLE)) {
+ return it;
+ }
+ }
+ // No Op exists with this handle. Make a new op, cloning from an existing op if possible.
+ auto op = (range.first == range.second) ?
+ Op::Make(ctx, param, std::forward<Args>(args)...) :
+ Op::Clone(range.first->second, ctx, param, std::forward<Args>(args)...);
+ return op_map.emplace(key, std::move(op));
+ }();
lk.unlock();
- if (!it->second)
+ if (!match_it->second)
return false;
- Op::Exec(it->second, ctx, std::forward<Args>(args)...);
+ Op::Exec(match_it->second, ctx, std::forward<Args>(args)...);
return true;
}
@@ -189,11 +210,25 @@ struct Conv {
const TBlob& w,
const TBlob& y);
+ static cudnn_cxx::Descriptor Clone(const cudnn_cxx::Descriptor& plan,
+ const OpContext& ctx,
+ const Param& param,
+ const TBlob& x,
+ const TBlob& w,
+ const TBlob& y);
+
static void Exec(const cudnn_cxx::Descriptor& plan,
const OpContext& ctx,
const TBlob& x,
const TBlob& w,
const TBlob& y);
+
+ private:
+ static cudnn_cxx::Descriptor MakeConvFwdOp(const OpContext& ctx,
+ const Param& param,
+ const TBlob& x,
+ const TBlob& w,
+ const TBlob& y);
};
struct ConvDgrad {
@@ -223,11 +258,25 @@ struct ConvDgrad {
const TBlob& dy,
const TBlob& dx);
+ static cudnn_cxx::Descriptor Clone(const cudnn_cxx::Descriptor& plan,
+ const OpContext& ctx,
+ const Param& param,
+ const TBlob& x,
+ const TBlob& w,
+ const TBlob& y);
+
static void Exec(const cudnn_cxx::Descriptor& plan,
const OpContext& ctx,
const TBlob& w,
const TBlob& dy,
const TBlob& dx);
+
+ private:
+ static cudnn_cxx::Descriptor MakeConvDgradOp(const OpContext& ctx,
+ const Param& param,
+ const TBlob& w,
+ const TBlob& dy,
+ const TBlob& dx);
};
struct ConvWgrad {
@@ -257,11 +306,25 @@ struct ConvWgrad {
const TBlob& dy,
const TBlob& dw);
+ static cudnn_cxx::Descriptor Clone(const cudnn_cxx::Descriptor& plan,
+ const OpContext& ctx,
+ const Param& param,
+ const TBlob& x,
+ const TBlob& w,
+ const TBlob& y);
+
static void Exec(const cudnn_cxx::Descriptor& plan,
const OpContext& ctx,
const TBlob& x,
const TBlob& dy,
const TBlob& dw);
+
+ private:
+ static cudnn_cxx::Descriptor MakeConvWgradOp(const OpContext& ctx,
+ const Param& param,
+ const TBlob& x,
+ const TBlob& dy,
+ const TBlob& dw);
};
bool LegacyAddBias(const OpContext& ctx, const LayoutInfo& li, const TBlob& y, const TBlob& b);
diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py
index 33fd48a256..591f9690a7 100644
--- a/tests/python/unittest/test_gluon.py
+++ b/tests/python/unittest/test_gluon.py
@@ -1834,7 +1834,6 @@ def test_conv2d_16c(chn_num, kernel):
@use_np
@pytest.mark.parametrize('grp', [16])
@pytest.mark.parametrize('kernel_size', [1, 3])
-@with_environment('MXNET_CUDNN_DISABLED_CONV_FWD_ENGINES', '5') # eng:5 causes test failure on M60
def test_group_conv2d_16c(grp, kernel_size):
input_size_list = onp.random.randint(low=3, high=65, size=10).tolist()
batch_size = 4