You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by jx...@apache.org on 2018/01/02 18:47:44 UTC
[incubator-mxnet] branch master updated: 1 d conv with cudnn (#9184)
This is an automated email from the ASF dual-hosted git repository.
jxie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new 5b99b25 1 d conv with cudnn (#9184)
5b99b25 is described below
commit 5b99b25e5f6ab3a20c7bcf4821a6af0a1a95f823
Author: Dick Carter <di...@comcast.net>
AuthorDate: Tue Jan 2 10:47:41 2018 -0800
1 d conv with cudnn (#9184)
* 1D conv/deconv handling by cudnn, with tests.
* Fix python3 test issue.
* Fix lint issues.
* Fixed CI and doc.
---
src/operator/nn/convolution-inl.h | 8 +-
src/operator/nn/convolution.cu | 7 -
src/operator/nn/cudnn/cudnn_convolution-inl.h | 273 +++++++++++-------------
src/operator/nn/cudnn/cudnn_deconvolution-inl.h | 261 +++++++++++-----------
src/operator/nn/deconvolution-inl.h | 161 ++++++++------
src/operator/nn/deconvolution.cc | 2 +-
src/operator/nn/deconvolution.cu | 8 +-
tests/python/gpu/test_operator_gpu.py | 64 ++++--
tests/python/unittest/test_operator.py | 221 ++++++++++++-------
9 files changed, 529 insertions(+), 476 deletions(-)
diff --git a/src/operator/nn/convolution-inl.h b/src/operator/nn/convolution-inl.h
index 38971ae..1613da6 100644
--- a/src/operator/nn/convolution-inl.h
+++ b/src/operator/nn/convolution-inl.h
@@ -67,13 +67,13 @@ struct ConvolutionParam : public dmlc::Parameter<ConvolutionParam> {
bool cudnn_off;
dmlc::optional<int> layout;
DMLC_DECLARE_PARAMETER(ConvolutionParam) {
- DMLC_DECLARE_FIELD(kernel).describe("Convolution kernel size: (h, w) or (d, h, w)");
+ DMLC_DECLARE_FIELD(kernel).describe("Convolution kernel size: (w,), (h, w) or (d, h, w)");
DMLC_DECLARE_FIELD(stride).set_default(TShape())
- .describe("Convolution stride: (h, w) or (d, h, w). Defaults to 1 for each dimension.");
+ .describe("Convolution stride: (w,), (h, w) or (d, h, w). Defaults to 1 for each dimension.");
DMLC_DECLARE_FIELD(dilate).set_default(TShape())
- .describe("Convolution dilate: (h, w) or (d, h, w). Defaults to 1 for each dimension.");
+ .describe("Convolution dilate: (w,), (h, w) or (d, h, w). Defaults to 1 for each dimension.");
DMLC_DECLARE_FIELD(pad).set_default(TShape())
- .describe("Zero pad for convolution: (h, w) or (d, h, w). Defaults to no padding.");
+ .describe("Zero pad for convolution: (w,), (h, w) or (d, h, w). Defaults to no padding.");
DMLC_DECLARE_FIELD(num_filter).set_range(1, 100000)
.describe("Convolution filter(channel) number");
DMLC_DECLARE_FIELD(num_group).set_default(1)
diff --git a/src/operator/nn/convolution.cu b/src/operator/nn/convolution.cu
index c31d78c..7234daf 100644
--- a/src/operator/nn/convolution.cu
+++ b/src/operator/nn/convolution.cu
@@ -41,13 +41,6 @@ Operator* CreateOp<gpu>(ConvolutionParam param, int dtype,
std::vector<TShape> *out_shape,
Context ctx) {
Operator *op = NULL;
- // If 1D convolution, use MXNet implementation
- if (param.kernel.ndim() == 1) {
- MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
- op = new ConvolutionOp<gpu, DType>(param);
- })
- return op;
- }
// depth wise conv
if (param.num_filter == param.num_group &&
diff --git a/src/operator/nn/cudnn/cudnn_convolution-inl.h b/src/operator/nn/cudnn/cudnn_convolution-inl.h
index f372039..8ffe97d 100644
--- a/src/operator/nn/cudnn/cudnn_convolution-inl.h
+++ b/src/operator/nn/cudnn/cudnn_convolution-inl.h
@@ -64,12 +64,22 @@ class CuDNNConvolutionOp : public Operator {
cudnn_tensor_core_ = DataType<DType>::kFlag == kFloat16 && GetEnvAllowTensorCore();
#if CUDNN_MAJOR >= 5
- MSHADOW_LAYOUT_SWITCH(param_.layout.value(), Layout, {
+ auto effective_layout = param_.layout.value();
+ switch (effective_layout) {
+ // 1D convolutions will be executed as 2D convolutions with a height of 1.
+ case mshadow::kNCW: effective_layout = mshadow::kNCHW; break;
+ case mshadow::kNWC: effective_layout = mshadow::kNHWC; break;
+ case mshadow::kCWN: effective_layout = mshadow::kCHWN; break;
+ default: break;
+ }
+
+ MSHADOW_LAYOUT_SWITCH(effective_layout, Layout, {
format_ = LayoutType<Layout>::kCudnnFlag;
});
#else
- CHECK(param_.layout.value() == kNCHW || param_.layout.value() == kNCDHW)
- << "Need CuDNN > 5.0 for layout support";
+ CHECK(param_.layout.value() == kNCW ||
+ param_.layout.value() == kNCHW ||
+ param_.layout.value() == kNCDHW) << "Need CuDNN > 5.0 for layout support";
#endif
// Double check to make sure this class supports the operation
if (!Supports(param, forward_compute_type, backward_compute_type, ctx))
@@ -110,9 +120,6 @@ class CuDNNConvolutionOp : public Operator {
const std::vector<TBlob> &aux_args) {
using namespace mshadow;
size_t expected = param_.no_bias ? 2 : 3;
- DType *data_ptr = NULL;
- DType *wmat_ptr = NULL;
- DType *out_ptr = NULL;
CHECK_EQ(in_data.size(), expected);
CHECK_EQ(out_data.size(), 1U);
Stream<gpu> *s = ctx.get_stream<gpu>();
@@ -120,27 +127,11 @@ class CuDNNConvolutionOp : public Operator {
Tensor<gpu, 1, DType> workspace = AllocateTempWorkspace(ctx, forward_workspace_byte_);
size_t workspace_size = TensorSizeBytes(workspace);
- if (param_.kernel.ndim() == 2) {
- Tensor<gpu, 4, DType> data = in_data[conv::kData].get<gpu, 4, DType>(s);
- Tensor<gpu, 4, DType> wmat = in_data[conv::kWeight].get<gpu, 4, DType>(s);
- Tensor<gpu, 4, DType> out = out_data[conv::kOut].get<gpu, 4, DType>(s);
- CHECK_EQ(data.CheckContiguous(), true);
- CHECK_EQ(wmat.CheckContiguous(), true);
- CHECK_EQ(out.CheckContiguous(), true);
- data_ptr = data.dptr_;
- wmat_ptr = wmat.dptr_;
- out_ptr = out.dptr_;
- } else {
- Tensor<gpu, 5, DType> data = in_data[conv::kData].get<gpu, 5, DType>(s);
- Tensor<gpu, 5, DType> wmat = in_data[conv::kWeight].get<gpu, 5, DType>(s);
- Tensor<gpu, 5, DType> out = out_data[conv::kOut].get<gpu, 5, DType>(s);
- CHECK_EQ(data.CheckContiguous(), true);
- CHECK_EQ(wmat.CheckContiguous(), true);
- CHECK_EQ(out.CheckContiguous(), true);
- data_ptr = data.dptr_;
- wmat_ptr = wmat.dptr_;
- out_ptr = out.dptr_;
- }
+ // I/O's should have 2 more dims than the kernel dim
+ DType *data_ptr = GetNdPtr(in_data[conv::kData], param_.kernel.ndim() + 2, s);
+ DType *wmat_ptr = GetNdPtr(in_data[conv::kWeight], param_.kernel.ndim() + 2, s);
+ DType *out_ptr = GetNdPtr(out_data[conv::kOut], param_.kernel.ndim() + 2, s);
+
for (uint32_t g = 0; g < param_.num_group; ++g) {
typename DataType<DType>::ScaleType alpha = 1.0f;
typename DataType<DType>::ScaleType beta = 0.0f;
@@ -193,37 +184,17 @@ class CuDNNConvolutionOp : public Operator {
using namespace mshadow;
using namespace mshadow::expr;
size_t expected = param_.no_bias == 0 ? 3 : 2;
- DType *grad_ptr = NULL;
- DType *wmat_ptr = NULL;
- DType *gwmat_ptr = NULL;
- DType *data_ptr = NULL;
- DType *gdata_ptr = NULL;
CHECK_EQ(out_grad.size(), 1U);
CHECK(in_data.size() == expected && in_grad.size() == expected);
Stream<gpu> *s = ctx.get_stream<gpu>();
- if (param_.kernel.ndim() == 2) {
- Tensor<gpu, 4, DType> grad = out_grad[conv::kOut].get<gpu, 4, DType>(s);
- Tensor<gpu, 4, DType> wmat = in_data[conv::kWeight].get<gpu, 4, DType>(s);
- Tensor<gpu, 4, DType> gwmat = in_grad[conv::kWeight].get<gpu, 4, DType>(s);
- Tensor<gpu, 4, DType> data = in_data[conv::kData].get<gpu, 4, DType>(s);
- Tensor<gpu, 4, DType> gdata = in_grad[conv::kData].get<gpu, 4, DType>(s);
- grad_ptr = grad.dptr_;
- wmat_ptr = wmat.dptr_;
- gwmat_ptr = gwmat.dptr_;
- data_ptr = data.dptr_;
- gdata_ptr = gdata.dptr_;
- } else {
- Tensor<gpu, 5, DType> grad = out_grad[conv::kOut].get<gpu, 5, DType>(s);
- Tensor<gpu, 5, DType> wmat = in_data[conv::kWeight].get<gpu, 5, DType>(s);
- Tensor<gpu, 5, DType> gwmat = in_grad[conv::kWeight].get<gpu, 5, DType>(s);
- Tensor<gpu, 5, DType> data = in_data[conv::kData].get<gpu, 5, DType>(s);
- Tensor<gpu, 5, DType> gdata = in_grad[conv::kData].get<gpu, 5, DType>(s);
- grad_ptr = grad.dptr_;
- wmat_ptr = wmat.dptr_;
- gwmat_ptr = gwmat.dptr_;
- data_ptr = data.dptr_;
- gdata_ptr = gdata.dptr_;
- }
+
+ // I/O's should have 2 more dims than the kernel dim
+ DType *grad_ptr = GetNdPtr(out_grad[conv::kOut], param_.kernel.ndim() + 2, s);
+ DType *wmat_ptr = GetNdPtr(in_data[conv::kWeight], param_.kernel.ndim() + 2, s);
+ DType *gwmat_ptr = GetNdPtr(in_grad[conv::kWeight], param_.kernel.ndim() + 2, s);
+ DType *data_ptr = GetNdPtr(in_data[conv::kData], param_.kernel.ndim() + 2, s);
+ DType *gdata_ptr = GetNdPtr(in_grad[conv::kData], param_.kernel.ndim() + 2, s);
+
Tensor<gpu, 1, DType> workspace = AllocateTempWorkspace(ctx, backward_workspace_byte_);
size_t workspace_size = TensorSizeBytes(workspace);
for (uint32_t g = 0; g < param_.num_group; ++g) {
@@ -320,7 +291,8 @@ class CuDNNConvolutionOp : public Operator {
auto layout_val = param.layout.value();
auto true_fp16 = DataType<DType>::kFlag == kFloat16 &&
(forward_compute_type == kFloat16 || backward_compute_type == kFloat16);
- if (layout_val == kNDHWC || layout_val == kNHWC && true_fp16)
+ if (layout_val == kNDHWC || layout_val == kNWC ||
+ layout_val == kNHWC && true_fp16)
return false;
// Permits graceful fallback to pseudo-fp16 on heterogenous systems
@@ -374,100 +346,78 @@ class CuDNNConvolutionOp : public Operator {
TShape oshape = out_shape[conv::kOut];
TShape dstride, ostride;
wshape[0] /= param_.num_group;
- if (param_.kernel.ndim() == 2) {
- // 2d conv
-
+#if CUDNN_MAJOR <= 5
// As of cuDNN_v6, the unsuffixed version of cudnnSetConvolution2dDescriptor()
- // requires an additional 'computeType' parameter to set the precision of the
- // convolution calculation. This facility was available as of v5 in
- // cudnnSetConvolution2dDescriptor_v5(), but was never accessed.
- #if CUDNN_MAJOR >= 6
+ // takes an additional 'computeType' parameter to set the precision of the
+ // convolution calculation. Supply this method signature for cuDNN versions < 6.
+#define cudnnSetConvolution2dDescriptor(cdesc, p0, p1, s0, s1, d0, d1, m, ct) \
+ cudnnSetConvolution2dDescriptor(cdesc, p0, p1, s0, s1, d0, d1, m)
+#endif
+ if (param_.kernel.ndim() == 1 || param_.kernel.ndim() == 2) {
+ // 1d or 2d conv
+ auto pad = param_.kernel.ndim() == 2 ? param_.pad : TShape({0, param_.pad[0]});
+ auto stride = param_.kernel.ndim() == 2 ? param_.stride : TShape({1, param_.stride[0]});
+ auto dilate = param_.kernel.ndim() == 2 ? param_.dilate : TShape({1, param_.dilate[0]});
CUDNN_CALL(cudnnSetConvolution2dDescriptor(forward_conv_desc_,
- param_.pad[0],
- param_.pad[1],
- param_.stride[0],
- param_.stride[1],
- param_.dilate[0],
- param_.dilate[1],
+ pad[0],
+ pad[1],
+ stride[0],
+ stride[1],
+ dilate[0],
+ dilate[1],
CUDNN_CROSS_CORRELATION,
cudnn_forward_compute_type));
CUDNN_CALL(cudnnSetConvolution2dDescriptor(back_conv_desc_,
- param_.pad[0],
- param_.pad[1],
- param_.stride[0],
- param_.stride[1],
- param_.dilate[0],
- param_.dilate[1],
+ pad[0],
+ pad[1],
+ stride[0],
+ stride[1],
+ dilate[0],
+ dilate[1],
CUDNN_CROSS_CORRELATION,
cudnn_backward_compute_type));
CUDNN_CALL(cudnnSetConvolution2dDescriptor(back_conv_desc_w_,
- param_.pad[0],
- param_.pad[1],
- param_.stride[0],
- param_.stride[1],
- param_.dilate[0],
- param_.dilate[1],
+ pad[0],
+ pad[1],
+ stride[0],
+ stride[1],
+ dilate[0],
+ dilate[1],
CUDNN_CROSS_CORRELATION,
cudnn_backward_compute_type));
- #else
- CUDNN_CALL(cudnnSetConvolution2dDescriptor(forward_conv_desc_,
- param_.pad[0],
- param_.pad[1],
- param_.stride[0],
- param_.stride[1],
- param_.dilate[0],
- param_.dilate[1],
- CUDNN_CROSS_CORRELATION));
- CUDNN_CALL(cudnnSetConvolution2dDescriptor(back_conv_desc_,
- param_.pad[0],
- param_.pad[1],
- param_.stride[0],
- param_.stride[1],
- param_.dilate[0],
- param_.dilate[1],
- CUDNN_CROSS_CORRELATION));
- CUDNN_CALL(cudnnSetConvolution2dDescriptor(back_conv_desc_w_,
- param_.pad[0],
- param_.pad[1],
- param_.stride[0],
- param_.stride[1],
- param_.dilate[0],
- param_.dilate[1],
- CUDNN_CROSS_CORRELATION));
- #endif
-
- #if CUDNN_MAJOR >= 5
- wshape = ConvertLayout(wshape.get<4>(), param_.layout.value(), kNCHW);
- CUDNN_CALL(cudnnSetFilter4dDescriptor(filter_desc_,
- dtype_,
- format_,
- wshape[0],
- wshape[1],
- wshape[2],
- wshape[3]));
- #else
- CHECK_EQ(param_.layout.value(), kNCHW) << "CuDNN V4 only support NCHW layout";
+#if CUDNN_MAJOR < 5
+ // As of cuDNN_v5, cudnnSetFilter4dDescriptor() takes a format parameter.
+ // Supply this method signature for cuDNN versions < 5.
+#define cudnnSetFilter4dDescriptor(fdesc, dt, f, w0, w1, w2, w3) \
+ cudnnSetFilter4dDescriptor(fdesc, dt, w0, w1, w2, w3)
+ CHECK_EQ(format_, CUDNN_TENSOR_NCHW) << "CuDNN V4 and earlier only supports NCHW layout";
+#endif
+ if (param_.kernel.ndim() == 2) {
+ wshape = ConvertLayout(wshape.get<4>(), param_.layout.value(), kNCHW);
+ dstride = ConvertLayout(Strides<4>(dshape), param_.layout.value(), kNCHW);
+ dshape = ConvertLayout(dshape.get<4>(), param_.layout.value(), kNCHW);
+ ostride = ConvertLayout(Strides<4>(oshape), param_.layout.value(), kNCHW);
+ oshape = ConvertLayout(oshape.get<4>(), param_.layout.value(), kNCHW);
+ } else {
+ wshape = ConvertLayout(wshape.get<3>(), param_.layout.value(), kNCW);
+ wshape = TShape({wshape[0], wshape[1], 1, wshape[2]});
+ dstride = ConvertLayout(Strides<3>(dshape), param_.layout.value(), kNCW);
+ dstride = TShape({dstride[0], dstride[1], dstride[1], dstride[2]});
+ dshape = ConvertLayout(dshape.get<3>(), param_.layout.value(), kNCW);
+ dshape = TShape({dshape[0], dshape[1], 1, dshape[2]});
+ ostride = ConvertLayout(Strides<3>(oshape), param_.layout.value(), kNCW);
+ ostride = TShape({ostride[0], ostride[1], ostride[1], ostride[2]});
+ oshape = ConvertLayout(oshape.get<3>(), param_.layout.value(), kNCW);
+ oshape = TShape({oshape[0], oshape[1], 1, oshape[2]});
+ }
CUDNN_CALL(cudnnSetFilter4dDescriptor(filter_desc_,
- dtype_,
- wshape[0],
- wshape[1],
- wshape[2],
- wshape[3]));
- #endif
+ dtype_,
+ format_,
+ wshape[0],
+ wshape[1],
+ wshape[2],
+ wshape[3]));
- dstride = ConvertLayout(Shape4(dshape[1] * dshape[2] * dshape[3],
- dshape[2] * dshape[3],
- dshape[3],
- 1),
- param_.layout.value(), kNCHW);
- dshape = ConvertLayout(dshape.get<4>(), param_.layout.value(), kNCHW);
-
- ostride = ConvertLayout(Shape4(oshape[1] * oshape[2] * oshape[3],
- oshape[2] * oshape[3],
- oshape[3],
- 1),
- param_.layout.value(), kNCHW);
- oshape = ConvertLayout(oshape.get<4>(), param_.layout.value(), kNCHW);
} else if (param_.kernel.ndim() == 3) {
// 3d conv
#if CUDNN_MAJOR >= 5
@@ -505,20 +455,9 @@ class CuDNNConvolutionOp : public Operator {
CUDNN_CROSS_CORRELATION,
cudnn_backward_compute_type));
- dstride = ConvertLayout(Shape5(dshape[1] * dshape[2] * dshape[3] * dshape[4],
- dshape[2] * dshape[3] * dshape[4],
- dshape[3] * dshape[4],
- dshape[4],
- 1),
- param_.layout.value(), kNCDHW);
+ dstride = ConvertLayout(Strides<5>(dshape), param_.layout.value(), kNCDHW);
dshape = ConvertLayout(dshape.get<5>(), param_.layout.value(), kNCDHW);
-
- ostride = ConvertLayout(Shape5(oshape[1] * oshape[2] * oshape[3] * oshape[4],
- oshape[2] * oshape[3] * oshape[4],
- oshape[3] * oshape[4],
- oshape[4],
- 1),
- param_.layout.value(), kNCDHW);
+ ostride = ConvertLayout(Strides<5>(oshape), param_.layout.value(), kNCDHW);
oshape = ConvertLayout(oshape.get<5>(), param_.layout.value(), kNCDHW);
}
// Set "allow tensor core" flag in convolution descriptors, if available.
@@ -852,6 +791,38 @@ class CuDNNConvolutionOp : public Operator {
return buffer->data();
}
+ // Converts a TBlob to a dptr, checking for the expected dim and that it's contiguous.
+ DType *GetNdPtr(const TBlob& tb, int dim, Stream<gpu> *s) {
+ DType *data_ptr = NULL;
+ if (dim == 3) {
+ Tensor<gpu, 3, DType> data = tb.get<gpu, 3, DType>(s);
+ CHECK_EQ(data.CheckContiguous(), true);
+ data_ptr = data.dptr_;
+ } else if (dim == 4) {
+ Tensor<gpu, 4, DType> data = tb.get<gpu, 4, DType>(s);
+ CHECK_EQ(data.CheckContiguous(), true);
+ data_ptr = data.dptr_;
+ } else if (dim == 5) {
+ Tensor<gpu, 5, DType> data = tb.get<gpu, 5, DType>(s);
+ CHECK_EQ(data.CheckContiguous(), true);
+ data_ptr = data.dptr_;
+ } else {
+ LOG(FATAL) << "Unexpected Tensor size " << dim << ", supporting only 3, 4 or 5.";
+ }
+ return data_ptr;
+ }
+
+ // Converts a TShape to a Shape<> of strides.
+ // e.g. {shape[0], shape[1], shape[2]} -> {shape[1]*shape[2], shape[2], 1}
+ template <int dim>
+ inline Shape<dim> Strides(const TShape &s) {
+ uint32_t ndim = s.ndim();
+ TShape strides(ndim);
+ for (uint32_t i = 0; i != ndim; ++i)
+ strides[i] = s.ProdShape(i+1, ndim);
+ return strides.get<dim>();
+ }
+
void InitBufferForParam() {
CastTShapeToIntPtr(param_.stride, ¶m_stride_);
CastTShapeToIntPtr(param_.dilate, ¶m_dilate_);
diff --git a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
index 09e89c2..bc02d1b 100644
--- a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
+++ b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
@@ -61,16 +61,26 @@ class CuDNNDeconvolutionOp : public Operator {
cudnn_tensor_core_ = DataType<DType>::kFlag == kFloat16 && GetEnvAllowTensorCore();
#if CUDNN_MAJOR >= 5
- MSHADOW_LAYOUT_SWITCH(param_.layout.value(), Layout, {
+ auto effective_layout = param_.layout.value();
+ switch (effective_layout) {
+ // 1D convolutions will be executed as 2D convolutions with a height of 1.
+ case mshadow::kNCW: effective_layout = mshadow::kNCHW; break;
+ case mshadow::kNWC: effective_layout = mshadow::kNHWC; break;
+ case mshadow::kCWN: effective_layout = mshadow::kCHWN; break;
+ default: break;
+ }
+
+ MSHADOW_LAYOUT_SWITCH(effective_layout, Layout, {
format_ = LayoutType<Layout>::kCudnnFlag;
});
#else
- CHECK(param_.layout.value() == kNCHW || param_.layout.value() == kNCDHW)
- << "Need CuDNN > 5.0 for layout support";
+ CHECK(param_.layout.value() == kNCW ||
+ param_.layout.value() == kNCHW ||
+ param_.layout.value() == kNCDHW) << "Need CuDNN > 5.0 for layout support";
#endif
// Double check to make sure this class supports the operation
if (!Supports(param, forward_compute_type, backward_compute_type, ctx))
- LOG(FATAL) << "Need CuDNN >= 6.0 for dilated convolution.";
+ LOG(FATAL) << "Need CuDNN >= 6.0 for dilated deconvolution.";
InitDescriptors(ctx, in_shape, out_shape,
cudnn_forward_compute_type, cudnn_backward_compute_type);
@@ -107,9 +117,6 @@ class CuDNNDeconvolutionOp : public Operator {
const std::vector<TBlob> &aux_args) {
using namespace mshadow;
size_t expected = param_.no_bias ? 2 : 3;
- DType *data_ptr = NULL;
- DType *wmat_ptr = NULL;
- DType *out_ptr = NULL;
CHECK_EQ(in_data.size(), expected);
CHECK_EQ(out_data.size(), 1U);
Stream<gpu> *s = ctx.get_stream<gpu>();
@@ -117,27 +124,10 @@ class CuDNNDeconvolutionOp : public Operator {
Tensor<gpu, 1, DType> workspace = AllocateTempWorkspace(ctx, forward_workspace_byte_);
size_t workspace_size = TensorSizeBytes(workspace);
- if (param_.kernel.ndim() == 2) {
- Tensor<gpu, 4, DType> data = in_data[deconv::kData].get<gpu, 4, DType>(s);
- Tensor<gpu, 4, DType> wmat = in_data[deconv::kWeight].get<gpu, 4, DType>(s);
- Tensor<gpu, 4, DType> out = out_data[deconv::kOut].get<gpu, 4, DType>(s);
- CHECK_EQ(data.CheckContiguous(), true);
- CHECK_EQ(wmat.CheckContiguous(), true);
- CHECK_EQ(out.CheckContiguous(), true);
- data_ptr = data.dptr_;
- wmat_ptr = wmat.dptr_;
- out_ptr = out.dptr_;
- } else {
- Tensor<gpu, 5, DType> data = in_data[deconv::kData].get<gpu, 5, DType>(s);
- Tensor<gpu, 5, DType> wmat = in_data[deconv::kWeight].get<gpu, 5, DType>(s);
- Tensor<gpu, 5, DType> out = out_data[deconv::kOut].get<gpu, 5, DType>(s);
- CHECK_EQ(data.CheckContiguous(), true);
- CHECK_EQ(wmat.CheckContiguous(), true);
- CHECK_EQ(out.CheckContiguous(), true);
- data_ptr = data.dptr_;
- wmat_ptr = wmat.dptr_;
- out_ptr = out.dptr_;
- }
+ // I/O's should have 2 more dims than the kernel dim
+ DType *data_ptr = GetNdPtr(in_data[deconv::kData], param_.kernel.ndim() + 2, s);
+ DType *wmat_ptr = GetNdPtr(in_data[deconv::kWeight], param_.kernel.ndim() + 2, s);
+ DType *out_ptr = GetNdPtr(out_data[deconv::kOut], param_.kernel.ndim() + 2, s);
for (uint32_t g = 0; g < param_.num_group; ++g) {
typename DataType<DType>::ScaleType alpha = 1.0f;
@@ -207,37 +197,17 @@ class CuDNNDeconvolutionOp : public Operator {
using namespace mshadow;
using namespace mshadow::expr;
size_t expected = param_.no_bias == 0 ? 3 : 2;
- DType *grad_ptr = NULL;
- DType *wmat_ptr = NULL;
- DType *gwmat_ptr = NULL;
- DType *data_ptr = NULL;
- DType *gdata_ptr = NULL;
CHECK_EQ(out_grad.size(), 1U);
CHECK(in_data.size() == expected && in_grad.size() == expected);
Stream<gpu> *s = ctx.get_stream<gpu>();
- if (param_.kernel.ndim() == 2) {
- Tensor<gpu, 4, DType> grad = out_grad[deconv::kOut].get<gpu, 4, DType>(s);
- Tensor<gpu, 4, DType> wmat = in_data[deconv::kWeight].get<gpu, 4, DType>(s);
- Tensor<gpu, 4, DType> gwmat = in_grad[deconv::kWeight].get<gpu, 4, DType>(s);
- Tensor<gpu, 4, DType> data = in_data[deconv::kData].get<gpu, 4, DType>(s);
- Tensor<gpu, 4, DType> gdata = in_grad[deconv::kData].get<gpu, 4, DType>(s);
- grad_ptr = grad.dptr_;
- wmat_ptr = wmat.dptr_;
- gwmat_ptr = gwmat.dptr_;
- data_ptr = data.dptr_;
- gdata_ptr = gdata.dptr_;
- } else {
- Tensor<gpu, 5, DType> grad = out_grad[deconv::kOut].get<gpu, 5, DType>(s);
- Tensor<gpu, 5, DType> wmat = in_data[deconv::kWeight].get<gpu, 5, DType>(s);
- Tensor<gpu, 5, DType> gwmat = in_grad[deconv::kWeight].get<gpu, 5, DType>(s);
- Tensor<gpu, 5, DType> data = in_data[deconv::kData].get<gpu, 5, DType>(s);
- Tensor<gpu, 5, DType> gdata = in_grad[deconv::kData].get<gpu, 5, DType>(s);
- grad_ptr = grad.dptr_;
- wmat_ptr = wmat.dptr_;
- gwmat_ptr = gwmat.dptr_;
- data_ptr = data.dptr_;
- gdata_ptr = gdata.dptr_;
- }
+
+ // I/O's should have 2 more dims than the kernel dim
+ DType *grad_ptr = GetNdPtr(out_grad[deconv::kOut], param_.kernel.ndim() + 2, s);
+ DType *wmat_ptr = GetNdPtr(in_data[deconv::kWeight], param_.kernel.ndim() + 2, s);
+ DType *gwmat_ptr = GetNdPtr(in_grad[deconv::kWeight], param_.kernel.ndim() + 2, s);
+ DType *data_ptr = GetNdPtr(in_data[deconv::kData], param_.kernel.ndim() + 2, s);
+ DType *gdata_ptr = GetNdPtr(in_grad[deconv::kData], param_.kernel.ndim() + 2, s);
+
CHECK_NE(req[deconv::kWeight], kWriteInplace);
if (!param_.no_bias) {
CHECK_NE(req[deconv::kBias], kWriteInplace);
@@ -331,7 +301,8 @@ class CuDNNDeconvolutionOp : public Operator {
auto layout_val = param.layout.value();
auto true_fp16 = DataType<DType>::kFlag == kFloat16 &&
(forward_compute_type == kFloat16 || backward_compute_type == kFloat16);
- if (layout_val == kNDHWC || layout_val == kNHWC && true_fp16)
+ if (layout_val == kNDHWC || layout_val == kNWC ||
+ layout_val == kNHWC && true_fp16)
return false;
// Permits graceful fallback to pseudo-fp16 on heterogenous systems
@@ -374,9 +345,6 @@ class CuDNNDeconvolutionOp : public Operator {
cudnnDataType_t cudnn_forward_compute_type,
cudnnDataType_t cudnn_backward_compute_type) {
using namespace mshadow;
- #if CUDNN_MAJOR >= 5
- format_ = CUDNN_TENSOR_NCHW;
- #endif
size_t expected = param_.no_bias ? 2 : 3;
CHECK_EQ(in_shape.size(), expected);
CHECK_EQ(out_shape.size(), 1U);
@@ -393,70 +361,81 @@ class CuDNNDeconvolutionOp : public Operator {
TShape oshape = out_shape[deconv::kOut];
TShape dstride, ostride;
wshape[0] /= param_.num_group;
-
- if (param_.kernel.ndim() == 2) {
- // 2d conv
+#if CUDNN_MAJOR <= 5
+ // As of cuDNN_v6, the unsuffixed version of cudnnSetConvolution2dDescriptor()
+ // takes an additional 'computeType' parameter to set the precision of the
+ // convolution calculation. Supply this method signature for cuDNN versions < 6.
+#define cudnnSetConvolution2dDescriptor(cdesc, p0, p1, s0, s1, d0, d1, m, ct) \
+ cudnnSetConvolution2dDescriptor(cdesc, p0, p1, s0, s1, d0, d1, m)
+#endif
+ if (param_.kernel.ndim() == 1 || param_.kernel.ndim() == 2) {
+ // 1d or 2d conv
index_t o_pad[2];
index_t o_adj[2];
- param_.InferPad(dshape, o_pad, o_adj);
+ if (param_.kernel.ndim() == 2) {
+ param_.InferPad(dshape, o_pad, o_adj);
+ } else {
+ index_t o_pad_1D[1];
+ index_t o_adj_1D[1];
+ param_.InferPad(dshape, o_pad_1D, o_adj_1D);
+ o_pad[0] = 0;
+ o_pad[1] = o_pad_1D[0];
+ }
+ auto stride = param_.kernel.ndim() == 2 ? param_.stride : TShape({1, param_.stride[0]});
+ auto dilate = param_.kernel.ndim() == 2 ? param_.dilate : TShape({1, param_.dilate[0]});
- #if CUDNN_MAJOR >= 6
CUDNN_CALL(cudnnSetConvolution2dDescriptor(forward_conv_desc_,
o_pad[0],
o_pad[1],
- param_.stride[0],
- param_.stride[1],
- param_.dilate[0],
- param_.dilate[1],
+ stride[0],
+ stride[1],
+ dilate[0],
+ dilate[1],
CUDNN_CROSS_CORRELATION,
cudnn_forward_compute_type));
CUDNN_CALL(cudnnSetConvolution2dDescriptor(back_conv_desc_,
o_pad[0],
o_pad[1],
- param_.stride[0],
- param_.stride[1],
- param_.dilate[0],
- param_.dilate[1],
+ stride[0],
+ stride[1],
+ dilate[0],
+ dilate[1],
CUDNN_CROSS_CORRELATION,
cudnn_backward_compute_type));
CUDNN_CALL(cudnnSetConvolution2dDescriptor(back_conv_desc_w_,
o_pad[0],
o_pad[1],
- param_.stride[0],
- param_.stride[1],
- param_.dilate[0],
- param_.dilate[1],
+ stride[0],
+ stride[1],
+ dilate[0],
+ dilate[1],
CUDNN_CROSS_CORRELATION,
cudnn_backward_compute_type));
- #else
- CUDNN_CALL(cudnnSetConvolution2dDescriptor(forward_conv_desc_,
- o_pad[0],
- o_pad[1],
- param_.stride[0],
- param_.stride[1],
- param_.dilate[0],
- param_.dilate[1],
- CUDNN_CROSS_CORRELATION));
- CUDNN_CALL(cudnnSetConvolution2dDescriptor(back_conv_desc_,
- o_pad[0],
- o_pad[1],
- param_.stride[0],
- param_.stride[1],
- param_.dilate[0],
- param_.dilate[1],
- CUDNN_CROSS_CORRELATION));
- CUDNN_CALL(cudnnSetConvolution2dDescriptor(back_conv_desc_w_,
- o_pad[0],
- o_pad[1],
- param_.stride[0],
- param_.stride[1],
- param_.dilate[0],
- param_.dilate[1],
- CUDNN_CROSS_CORRELATION));
- #endif
-
- #if CUDNN_MAJOR >= 5
- wshape = ConvertLayout(wshape.get<4>(), param_.layout.value(), kNCHW);
+#if CUDNN_MAJOR < 5
+ // As of cuDNN_v5, cudnnSetFilter4dDescriptor() takes a format parameter.
+ // Supply this method signature for cuDNN versions < 5.
+#define cudnnSetFilter4dDescriptor(fdesc, dt, f, w0, w1, w2, w3) \
+ cudnnSetFilter4dDescriptor(fdesc, dt, w0, w1, w2, w3)
+ CHECK_EQ(format_, CUDNN_TENSOR_NCHW) << "CuDNN V4 and earlier only supports NCHW layout";
+#endif
+ if (param_.kernel.ndim() == 2) {
+ wshape = ConvertLayout(wshape.get<4>(), param_.layout.value(), kNCHW);
+ dstride = ConvertLayout(Strides<4>(dshape), param_.layout.value(), kNCHW);
+ dshape = ConvertLayout(dshape.get<4>(), param_.layout.value(), kNCHW);
+ ostride = ConvertLayout(Strides<4>(oshape), param_.layout.value(), kNCHW);
+ oshape = ConvertLayout(oshape.get<4>(), param_.layout.value(), kNCHW);
+ } else {
+ wshape = ConvertLayout(wshape.get<3>(), param_.layout.value(), kNCW);
+ wshape = TShape({wshape[0], wshape[1], 1, wshape[2]});
+ dstride = ConvertLayout(Strides<3>(dshape), param_.layout.value(), kNCW);
+ dstride = TShape({dstride[0], dstride[1], dstride[1], dstride[2]});
+ dshape = ConvertLayout(dshape.get<3>(), param_.layout.value(), kNCW);
+ dshape = TShape({dshape[0], dshape[1], 1, dshape[2]});
+ ostride = ConvertLayout(Strides<3>(oshape), param_.layout.value(), kNCW);
+ ostride = TShape({ostride[0], ostride[1], ostride[1], ostride[2]});
+ oshape = ConvertLayout(oshape.get<3>(), param_.layout.value(), kNCW);
+ oshape = TShape({oshape[0], oshape[1], 1, oshape[2]});
+ }
CUDNN_CALL(cudnnSetFilter4dDescriptor(filter_desc_,
dtype_,
format_,
@@ -464,29 +443,6 @@ class CuDNNDeconvolutionOp : public Operator {
wshape[1],
wshape[2],
wshape[3]));
- #else
- CHECK_EQ(param_.layout.value(), kNCHW) << "CuDNN V4 only support NCHW layout";
- CUDNN_CALL(cudnnSetFilter4dDescriptor(filter_desc_,
- dtype_,
- wshape[0],
- wshape[1],
- wshape[2],
- wshape[3]));
- #endif
-
- dstride = ConvertLayout(Shape4(dshape[1] * dshape[2] * dshape[3],
- dshape[2] * dshape[3],
- dshape[3],
- 1),
- param_.layout.value(), kNCHW);
- dshape = ConvertLayout(dshape.get<4>(), param_.layout.value(), kNCHW);
-
- ostride = ConvertLayout(Shape4(oshape[1] * oshape[2] * oshape[3],
- oshape[2] * oshape[3],
- oshape[3],
- 1),
- param_.layout.value(), kNCHW);
- oshape = ConvertLayout(oshape.get<4>(), param_.layout.value(), kNCHW);
} else if (param_.kernel.ndim() == 3) {
// 3d conv
index_t o_pad[3];
@@ -528,20 +484,9 @@ class CuDNNDeconvolutionOp : public Operator {
CUDNN_CROSS_CORRELATION,
cudnn_backward_compute_type));
- dstride = ConvertLayout(Shape5(dshape[1] * dshape[2] * dshape[3] * dshape[4],
- dshape[2] * dshape[3] * dshape[4],
- dshape[3] * dshape[4],
- dshape[4],
- 1),
- param_.layout.value(), kNCDHW);
+ dstride = ConvertLayout(Strides<5>(dshape), param_.layout.value(), kNCDHW);
dshape = ConvertLayout(dshape.get<5>(), param_.layout.value(), kNCDHW);
-
- ostride = ConvertLayout(Shape5(oshape[1] * oshape[2] * oshape[3] * oshape[4],
- oshape[2] * oshape[3] * oshape[4],
- oshape[3] * oshape[4],
- oshape[4],
- 1),
- param_.layout.value(), kNCDHW);
+ ostride = ConvertLayout(Strides<5>(oshape), param_.layout.value(), kNCDHW);
oshape = ConvertLayout(oshape.get<5>(), param_.layout.value(), kNCDHW);
}
// Set "allow tensor core" flag in convolution descriptors, if available.
@@ -883,6 +828,38 @@ class CuDNNDeconvolutionOp : public Operator {
return buffer->data();
}
+ // Converts a TBlob to a dptr, checking for the expected dim and that it's contiguous.
+ DType *GetNdPtr(const TBlob& tb, int dim, Stream<gpu> *s) {
+ DType *data_ptr = NULL;
+ if (dim == 3) {
+ Tensor<gpu, 3, DType> data = tb.get<gpu, 3, DType>(s);
+ CHECK_EQ(data.CheckContiguous(), true);
+ data_ptr = data.dptr_;
+ } else if (dim == 4) {
+ Tensor<gpu, 4, DType> data = tb.get<gpu, 4, DType>(s);
+ CHECK_EQ(data.CheckContiguous(), true);
+ data_ptr = data.dptr_;
+ } else if (dim == 5) {
+ Tensor<gpu, 5, DType> data = tb.get<gpu, 5, DType>(s);
+ CHECK_EQ(data.CheckContiguous(), true);
+ data_ptr = data.dptr_;
+ } else {
+ LOG(FATAL) << "Unexpected Tensor size " << dim << ", supporting only 3, 4 or 5.";
+ }
+ return data_ptr;
+ }
+
+ // Converts a TShape to a Shape<> of strides.
+ // e.g. {shape[0], shape[1], shape[2]} -> {shape[1]*shape[2], shape[2], 1}
+ template <int dim>
+ inline Shape<dim> Strides(const TShape &s) {
+ uint32_t ndim = s.ndim();
+ TShape strides(ndim);
+ for (uint32_t i = 0; i != ndim; ++i)
+ strides[i] = s.ProdShape(i+1, ndim);
+ return strides.get<dim>();
+ }
+
void InitBufferForParam() {
CastTShapeToIntPtr(param_.stride, ¶m_stride_);
CastTShapeToIntPtr(param_.dilate, ¶m_dilate_);
diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h
index b7d2676..fbdfaa8 100644
--- a/src/operator/nn/deconvolution-inl.h
+++ b/src/operator/nn/deconvolution-inl.h
@@ -63,28 +63,28 @@ struct DeconvolutionParam : public dmlc::Parameter<DeconvolutionParam> {
bool cudnn_off;
dmlc::optional<int> layout;
DMLC_DECLARE_PARAMETER(DeconvolutionParam) {
- DMLC_DECLARE_FIELD(kernel).describe("Deconvolution kernel size: (h, w) or (d, h, w). "
+ DMLC_DECLARE_FIELD(kernel).describe("Deconvolution kernel size: (w,), (h, w) or (d, h, w). "
"This is same as the kernel size used for the corresponding convolution");
DMLC_DECLARE_FIELD(stride).set_default(TShape())
- .describe("The stride used for the corresponding convolution: (h, w) or (d, h, w). "
+ .describe("The stride used for the corresponding convolution: (w,), (h, w) or (d, h, w). "
"Defaults to 1 for each dimension.");
DMLC_DECLARE_FIELD(dilate).set_default(TShape())
- .describe("Dilation factor for each dimension of the input: (h, w) or (d, h, w). "
+ .describe("Dilation factor for each dimension of the input: (w,), (h, w) or (d, h, w). "
"Defaults to 1 for each dimension.");
DMLC_DECLARE_FIELD(pad).set_default(TShape())
.describe("The amount of implicit zero padding added during convolution for each "
"dimension of the input: "
- "(h, w) or (d, h, w). "
+ "(w,), (h, w) or (d, h, w). "
"``(kernel-1)/2`` is usually a good choice. "
"If `target_shape` is set, "
"`pad` will be ignored and a padding that will generate the target shape "
"will be used. Defaults to no padding.");
DMLC_DECLARE_FIELD(adj).set_default(TShape())
- .describe("Adjustment for output shape: (h, w) or (d, h, w). "
+ .describe("Adjustment for output shape: (w,), (h, w) or (d, h, w). "
"If `target_shape` is set, "
"`adj` will be ignored and computed accordingly.");
DMLC_DECLARE_FIELD(target_shape).set_default(TShape())
- .describe("Shape of the output tensor: (h, w) or (d, h, w).");
+ .describe("Shape of the output tensor: (w,), (h, w) or (d, h, w).");
DMLC_DECLARE_FIELD(num_filter).set_range(1, 100000)
.describe("Number of output filters.");
DMLC_DECLARE_FIELD(num_group).set_default(1)
@@ -211,8 +211,8 @@ class DeconvolutionOp : public Operator {
using namespace mshadow;
using namespace mshadow::expr;
- if (param_.kernel.ndim() != 2) {
- LOG(FATAL) << "If not using CUDNN only 2D-Deconvolution is supported";
+ if (param_.kernel.ndim() > 2) {
+ LOG(FATAL) << "If not using CUDNN, only 1D or 2D Deconvolution is supported";
}
CHECK_EQ(req[deconv::kOut], kWriteTo);
@@ -220,18 +220,29 @@ class DeconvolutionOp : public Operator {
CHECK_EQ(in_data.size(), expected);
CHECK_EQ(out_data.size(), 1U);
Stream<xpu> *s = ctx.get_stream<xpu>();
- Tensor<xpu, 4, DType> data = in_data[deconv::kData].get<xpu, 4, DType>(s);
- Tensor<xpu, 4, DType> out = out_data[deconv::kOut].get<xpu, 4, DType>(s);
-
+ auto in_data_shape = in_data[deconv::kData].shape_;
+ Tensor<xpu, 4, DType> data = TBlobTo4DTensor(in_data[deconv::kData], s);
+ Tensor<xpu, 4, DType> out = TBlobTo4DTensor(out_data[deconv::kOut], s);
index_t o_pad[2], o_adj[2];
- TShape dshape = {static_cast<nnvm::dim_t>(data.size(2)),
- static_cast<nnvm::dim_t>(data.size(3))};
- param_.InferPad(dshape, o_pad, o_adj);
+ if (param_.kernel.ndim() == 2) {
+ param_.InferPad(TShape({in_data_shape[2], in_data_shape[3]}), o_pad, o_adj);
+ } else {
+ index_t o_pad_1D[1], o_adj_1D[1];
+ param_.InferPad({in_data_shape[2]}, o_pad_1D, o_adj_1D);
+ o_pad[0] = 0;
+ o_pad[1] = o_pad_1D[0];
+ o_adj[0] = 0;
+ o_adj[1] = o_adj_1D[0];
+ }
+ auto stride = param_.kernel.ndim() == 2 ? param_.stride : TShape({1, param_.stride[0]});
+ auto dilate = param_.kernel.ndim() == 2 ? param_.dilate : TShape({1, param_.dilate[0]});
+ auto kernel = param_.kernel.ndim() == 2 ? param_.kernel : TShape({1, param_.kernel[0]});
+ auto kernel_size = kernel.Size();
Shape<3> wmat_shape =
Shape3(param_.num_group,
data.shape_[1] / param_.num_group,
- param_.num_filter / param_.num_group * param_.kernel[0] * param_.kernel[1]);
+ param_.num_filter / param_.num_group * kernel_size);
Tensor<xpu, 3, DType> wmat =
in_data[deconv::kWeight].get_with_shape<xpu, 3, DType>(wmat_shape, s);
#if defined(__CUDACC__)
@@ -256,21 +267,21 @@ class DeconvolutionOp : public Operator {
temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_);
if (o_pad[0] == 0 && o_pad[1] == 0) {
temp_col = unpack_patch2col(out.Slice(i, i + step),
- param_.kernel[0],
- param_.kernel[1],
- param_.stride[0],
- param_.stride[1],
- param_.dilate[0],
- param_.dilate[1]);
+ kernel[0],
+ kernel[1],
+ stride[0],
+ stride[1],
+ dilate[0],
+ dilate[1]);
} else {
temp_col = unpack_patch2col(pad(out.Slice(i, i + step),
o_pad[0], o_pad[1]),
- param_.kernel[0],
- param_.kernel[1],
- param_.stride[0],
- param_.stride[1],
- param_.dilate[0],
- param_.dilate[1]);
+ kernel[0],
+ kernel[1],
+ stride[0],
+ stride[1],
+ dilate[0],
+ dilate[1]);
}
const index_t gstride = temp_col.size(0) / param_.num_group;
for (uint32_t gid = 0; gid < param_.num_group; ++gid) {
@@ -283,24 +294,24 @@ class DeconvolutionOp : public Operator {
if (o_pad[0] == 0 && o_pad[1] == 0) {
out.Slice(i, i + step) = pack_col2patch(temp_col,
out.Slice(i, i + step).shape_,
- param_.kernel[0],
- param_.kernel[1],
- param_.stride[0],
- param_.stride[1],
- param_.dilate[0],
- param_.dilate[1]);
+ kernel[0],
+ kernel[1],
+ stride[0],
+ stride[1],
+ dilate[0],
+ dilate[1]);
} else {
Shape<4> pshape = out.Slice(i, i + step).shape_;
pshape[2] += 2 * o_pad[0];
pshape[3] += 2 * o_pad[1];
out.Slice(i, i + step) = crop(pack_col2patch(temp_col,
pshape,
- param_.kernel[0],
- param_.kernel[1],
- param_.stride[0],
- param_.stride[1],
- param_.dilate[0],
- param_.dilate[1]),
+ kernel[0],
+ kernel[1],
+ stride[0],
+ stride[1],
+ dilate[0],
+ dilate[1]),
out[i][0].shape_);
}
}
@@ -328,13 +339,31 @@ class DeconvolutionOp : public Operator {
CHECK_EQ(in_data[deconv::kWeight].CheckContiguous(), true);
// get data
Stream<xpu> *s = ctx.get_stream<xpu>();
- Tensor<xpu, 4, DType> data = in_data[deconv::kData].get<xpu, 4, DType>(s);
- Tensor<xpu, 4, DType> grad = out_grad[deconv::kOut].get<xpu, 4, DType>(s);
- Tensor<xpu, 4, DType> gdata = in_grad[deconv::kData].get<xpu, 4, DType>(s);
+ auto in_data_shape = in_data[deconv::kData].shape_;
+ Tensor<xpu, 4, DType> data = TBlobTo4DTensor(in_data[deconv::kData], s);
+ Tensor<xpu, 4, DType> grad = TBlobTo4DTensor(out_grad[deconv::kOut], s);
+ Tensor<xpu, 4, DType> gdata = TBlobTo4DTensor(in_grad[deconv::kData], s);
+
+ index_t o_pad[2], o_adj[2];
+ if (param_.kernel.ndim() == 2) {
+ param_.InferPad(TShape({in_data_shape[2], in_data_shape[3]}), o_pad, o_adj);
+ } else {
+ index_t o_pad_1D[1], o_adj_1D[1];
+ param_.InferPad({in_data_shape[2]}, o_pad_1D, o_adj_1D);
+ o_pad[0] = 0;
+ o_pad[1] = o_pad_1D[0];
+ o_adj[0] = 0;
+ o_adj[1] = o_adj_1D[0];
+ }
+ auto stride = param_.kernel.ndim() == 2 ? param_.stride : TShape({1, param_.stride[0]});
+ auto dilate = param_.kernel.ndim() == 2 ? param_.dilate : TShape({1, param_.dilate[0]});
+ auto kernel = param_.kernel.ndim() == 2 ? param_.kernel : TShape({1, param_.kernel[0]});
+ auto kernel_size = kernel.Size();
+
Shape<3> wmat_shape =
Shape3(param_.num_group,
data.shape_[1] / param_.num_group,
- param_.num_filter / param_.num_group * param_.kernel[0] * param_.kernel[1]);
+ param_.num_filter / param_.num_group * kernel_size);
Tensor<xpu, 3, DType> wmat =
in_data[deconv::kWeight].get_with_shape<xpu, 3, DType>(wmat_shape, s);
Tensor<xpu, 3, DType> gwmat =
@@ -343,10 +372,6 @@ class DeconvolutionOp : public Operator {
CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle)
<< "Must init CuBLAS handle in stream";
#endif
- index_t o_pad[2], o_adj[2];
- TShape dshape = {static_cast<nnvm::dim_t>(data.size(2)),
- static_cast<nnvm::dim_t>(data.size(3))};
- param_.InferPad(dshape, o_pad, o_adj);
const index_t nbatch = data.size(0);
Tensor<xpu, 1, DType> workspace =
@@ -366,20 +391,20 @@ class DeconvolutionOp : public Operator {
temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_);
if (o_pad[0] == 0 && o_pad[1] == 0) {
temp_col = unpack_patch2col(grad.Slice(i, i + step),
- param_.kernel[0],
- param_.kernel[1],
- param_.stride[0],
- param_.stride[1],
- param_.dilate[0],
- param_.dilate[1]);
+ kernel[0],
+ kernel[1],
+ stride[0],
+ stride[1],
+ dilate[0],
+ dilate[1]);
} else {
temp_col = unpack_patch2col(pad(grad.Slice(i, i + step), o_pad[0], o_pad[1]),
- param_.kernel[0],
- param_.kernel[1],
- param_.stride[0],
- param_.stride[1],
- param_.dilate[0],
- param_.dilate[1]);
+ kernel[0],
+ kernel[1],
+ stride[0],
+ stride[1],
+ dilate[0],
+ dilate[1]);
}
const index_t gstride = temp_col.size(0) / param_.num_group;
for (uint32_t gid = 0; gid < param_.num_group; ++gid) {
@@ -422,9 +447,8 @@ class DeconvolutionOp : public Operator {
private:
inline index_t InitTemp(const mshadow::Shape<4> &ishape,
const mshadow::Shape<4> &oshape) {
- const int ksize_y = param_.kernel[0];
- const int ksize_x = param_.kernel[1];
- shape_colunit_ = mshadow::Shape2(ishape[1] * ksize_y * ksize_x,
+ const int ksize = param_.kernel.Size();
+ shape_colunit_ = mshadow::Shape2(ishape[1] * ksize,
oshape[2] * oshape[3]);
shape_dstunit_ = mshadow::Shape3(param_.num_group,
oshape[1] / param_.num_group,
@@ -449,6 +473,15 @@ class DeconvolutionOp : public Operator {
return required_size;
}
+ inline Tensor<xpu, 4, DType> TBlobTo4DTensor(const TBlob &tb, Stream<xpu> *s) {
+ using namespace mshadow;
+ if (param_.kernel.ndim() == 2)
+ return tb.get<xpu, 4, DType>(s);
+ else
+ return tb.get_with_shape<xpu, 4, DType>(
+ Shape4(tb.shape_[0], tb.shape_[1], 1, tb.shape_[2]), s);
+ }
+
DeconvolutionParam param_;
mshadow::Shape<2> shape_colunit_;
mshadow::Shape<3> shape_dstunit_;
@@ -505,8 +538,8 @@ class DeconvolutionProp : public OperatorProperty {
std::vector<TShape> *out_shape,
std::vector<TShape> *aux_shape) const override {
#if MXNET_USE_CUDNN == 0
- if (param_.kernel.ndim() != 2) {
- LOG(FATAL) << "If not using CUDNN only 2D-Deconvolution is supported";
+ if (param_.kernel.ndim() > 2) {
+ LOG(FATAL) << "If not using CUDNN, only 1D or 2D Deconvolution is supported";
return false;
}
#endif // CUDNN
diff --git a/src/operator/nn/deconvolution.cc b/src/operator/nn/deconvolution.cc
index 45867f7..9d3c040 100644
--- a/src/operator/nn/deconvolution.cc
+++ b/src/operator/nn/deconvolution.cc
@@ -55,7 +55,7 @@ MXNET_REGISTER_OP_PROPERTY(Deconvolution, DeconvolutionProp)
.add_argument("bias", "NDArray-or-Symbol", "Bias added to the result after the deconvolution "
"operation.")
.add_arguments(DeconvolutionParam::__FIELDS__())
-.describe("Computes 2D transposed convolution (aka fractionally strided convolution) of the "
+.describe("Computes 1D or 2D transposed convolution (aka fractionally strided convolution) of the "
"input tensor. This operation can be seen as the gradient of Convolution operation with "
"respect to its input. Convolution usually reduces the size of the input. Transposed "
"convolution works the other way, going from a smaller input to a larger output while "
diff --git a/src/operator/nn/deconvolution.cu b/src/operator/nn/deconvolution.cu
index 6d07876..6237701 100644
--- a/src/operator/nn/deconvolution.cu
+++ b/src/operator/nn/deconvolution.cu
@@ -38,13 +38,7 @@ Operator* CreateOp<gpu>(DeconvolutionParam param, int dtype,
Context ctx) {
// Logic here parallels that in Convolution.cu
Operator *op = NULL;
- // If 1D deconvolution, use MXNet implementation
- if (param.kernel.ndim() == 1) {
- MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
- op = new DeconvolutionOp<gpu, DType>(param);
- })
- return op;
- }
+
#if MXNET_USE_CUDNN == 1
// On fp16-I/O instances, use fp32 compute (i.e. pseudo-fp16).
int compute_type = (dtype == mshadow::kFloat16) ? mshadow::kFloat32 : dtype;
diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index 7706bce..31e888b 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -460,19 +460,19 @@ def test_convolution_options():
{'ctx': mx.cpu(0), 'conv_data': (2, 2, 7), 'type_dict': {'conv_data': np.float64}},
{'ctx': mx.cpu(0), 'conv_data': (2, 2, 7), 'type_dict': {'conv_data': np.float32}}]
# Pad > 0
- sym = mx.sym.Convolution(num_filter=3, kernel=(3,), pad=(1,), name='conv')
+ sym = mx.sym.Convolution(layout='NCW', num_filter=3, kernel=(3,), pad=(1,), name='conv')
sym_no_cudnn = mx.sym.Convolution(num_filter=3, kernel=(3,), pad=(1,), cudnn_off=True, name='conv')
check_consistency_NxM([sym, sym_no_cudnn], ctx_list)
# Stride > 1
- sym = mx.sym.Convolution(num_filter=3, kernel=(3,), stride=(2,), name='conv')
+ sym = mx.sym.Convolution(layout='NCW', num_filter=3, kernel=(3,), stride=(2,), name='conv')
sym_no_cudnn = mx.sym.Convolution(num_filter=3, kernel=(3,), stride=(2,), cudnn_off=True, name='conv')
check_consistency_NxM([sym, sym_no_cudnn], ctx_list)
# Dilate > 1
- sym = mx.sym.Convolution(num_filter=3, kernel=(3,), dilate=(2,), name='conv')
+ sym = mx.sym.Convolution(layout='NCW', num_filter=3, kernel=(3,), dilate=(2,), name='conv')
sym_no_cudnn = mx.sym.Convolution(num_filter=3, kernel=(3,), dilate=(2,), cudnn_off=True, name='conv')
check_consistency_NxM([sym, sym_no_cudnn], ctx_list)
# 1x1 convolution
- sym = mx.sym.Convolution(num_filter=3, kernel=(1,), pad=(0,), name='conv')
+ sym = mx.sym.Convolution(layout='NCW', num_filter=3, kernel=(1,), pad=(0,), name='conv')
sym_no_cudnn = mx.sym.Convolution(num_filter=3, kernel=(1,), pad=(0,), cudnn_off=True, name='conv')
check_consistency_NxM([sym, sym_no_cudnn], ctx_list)
@@ -558,6 +558,24 @@ def test_pooling_with_type():
check_consistency(sym, ctx_list)
def test_deconvolution_with_type():
+ # Test basic deconvolution without exercising stride, pad or dilation.
+ # 1D deconvolution
+ sym = mx.sym.Deconvolution(num_filter=3, kernel=(3,), name='deconv')
+ ctx_list = [{'ctx': mx.gpu(0), 'deconv_data': (2, 2, 7), 'type_dict': {'deconv_data': np.float64}},
+ {'ctx': mx.gpu(0), 'deconv_data': (2, 2, 7), 'type_dict': {'deconv_data': np.float32}},
+ {'ctx': mx.gpu(0), 'deconv_data': (2, 2, 7), 'type_dict': {'deconv_data': np.float16}},
+ {'ctx': mx.cpu(0), 'deconv_data': (2, 2, 7), 'type_dict': {'deconv_data': np.float64}},
+ {'ctx': mx.cpu(0), 'deconv_data': (2, 2, 7), 'type_dict': {'deconv_data': np.float32}}]
+ # wider tolerance needed for true-fp16 test above
+ tol = {np.dtype(np.float16): 0.3,
+ np.dtype(np.float32): 1e-3,
+ np.dtype(np.float64): 1e-5,
+ np.dtype(np.uint8): 0,
+ np.dtype(np.int32): 0}
+ check_consistency(sym, ctx_list, tol=tol)
+ check_consistency(sym, ctx_list, tol=tol, grad_req="add")
+
+ # 2D deconvolution
sym = mx.sym.Deconvolution(num_filter=2, kernel=(3,3), name='deconv')
ctx_list = [{'ctx': mx.gpu(0), 'deconv_data': (2, 2, 10, 10), 'type_dict': {'deconv_data': np.float64}},
{'ctx': mx.gpu(0), 'deconv_data': (2, 2, 10, 10), 'type_dict': {'deconv_data': np.float32}},
@@ -575,24 +593,24 @@ def test_deconvolution_with_type():
def test_deconvolution_options():
-# # 1D convolution (not yet enabled)
-# ctx_list = [{'ctx': mx.gpu(0), 'conv_data': (2, 2, 7), 'type_dict': {'conv_data': np.float64}},
-# {'ctx': mx.gpu(0), 'conv_data': (2, 2, 7), 'type_dict': {'conv_data': np.float32}},
-# {'ctx': mx.gpu(0), 'conv_data': (2, 2, 7), 'type_dict': {'conv_data': np.float16}},
-# {'ctx': mx.cpu(0), 'conv_data': (2, 2, 7), 'type_dict': {'conv_data': np.float64}},
-# {'ctx': mx.cpu(0), 'conv_data': (2, 2, 7), 'type_dict': {'conv_data': np.float32}}]
-# # Pad > 0
-# sym = mx.sym.Convolution(num_filter=3, kernel=(3,), pad=(1,), name='conv')
-# sym_no_cudnn = mx.sym.Convolution(num_filter=3, kernel=(3,), pad=(1,), cudnn_off=True, name='conv')
-# check_consistency_NxM([sym, sym_no_cudnn], ctx_list)
-# # Stride > 1
-# sym = mx.sym.Convolution(num_filter=3, kernel=(3,), stride=(2,), name='conv')
-# sym_no_cudnn = mx.sym.Convolution(num_filter=3, kernel=(3,), stride=(2,), cudnn_off=True, name='conv')
-# check_consistency_NxM([sym, sym_no_cudnn], ctx_list)
-# # Dilate > 1
-# sym = mx.sym.Convolution(num_filter=3, kernel=(3,), dilate=(2,), name='conv')
-# sym_no_cudnn = mx.sym.Convolution(num_filter=3, kernel=(3,), dilate=(2,), cudnn_off=True, name='conv')
-# check_consistency_NxM([sym, sym_no_cudnn], ctx_list)
+ # 1D deconvolution
+ ctx_list = [{'ctx': mx.gpu(0), 'deconv_data': (2, 2, 7), 'type_dict': {'deconv_data': np.float64}},
+ {'ctx': mx.gpu(0), 'deconv_data': (2, 2, 7), 'type_dict': {'deconv_data': np.float32}},
+ {'ctx': mx.gpu(0), 'deconv_data': (2, 2, 7), 'type_dict': {'deconv_data': np.float16}},
+ {'ctx': mx.cpu(0), 'deconv_data': (2, 2, 7), 'type_dict': {'deconv_data': np.float64}},
+ {'ctx': mx.cpu(0), 'deconv_data': (2, 2, 7), 'type_dict': {'deconv_data': np.float32}}]
+ # Pad > 0
+ sym = mx.sym.Deconvolution(layout='NCW', num_filter=3, kernel=(3,), pad=(1,), name='deconv')
+ sym_no_cudnn = mx.sym.Deconvolution(num_filter=3, kernel=(3,), pad=(1,), cudnn_off=True, name='deconv')
+ check_consistency_NxM([sym, sym_no_cudnn], ctx_list)
+ # Stride > 1
+ sym = mx.sym.Deconvolution(layout='NCW', num_filter=3, kernel=(3,), stride=(2,), name='deconv')
+ sym_no_cudnn = mx.sym.Deconvolution(num_filter=3, kernel=(3,), stride=(2,), cudnn_off=True, name='deconv')
+ check_consistency_NxM([sym, sym_no_cudnn], ctx_list)
+ # Dilate > 1
+ sym = mx.sym.Deconvolution(layout='NCW', num_filter=3, kernel=(3,), dilate=(2,), name='deconv')
+ sym_no_cudnn = mx.sym.Deconvolution(num_filter=3, kernel=(3,), dilate=(2,), cudnn_off=True, name='deconv')
+ check_consistency_NxM([sym, sym_no_cudnn], ctx_list)
# 2D deconvolution
ctx_list = [{'ctx': mx.gpu(0), 'deconv_data': (2, 2, 10, 10), 'type_dict': {'deconv_data': np.float64}},
@@ -613,7 +631,7 @@ def test_deconvolution_options():
sym_no_cudnn = mx.sym.Deconvolution(num_filter=2, kernel=(3,3), dilate=(2,2), cudnn_off=True, name='deconv')
check_consistency_NxM([sym, sym_no_cudnn], ctx_list)
-# # 3D convolution (not yet enabled)
+# # 3D deconvolution (not yet enabled)
# ctx_list = [{'ctx': mx.cpu(0), 'conv_data': (2, 2, 5, 7, 7), 'type_dict': {'conv_data': np.float64}},
# {'ctx': mx.cpu(0), 'conv_data': (2, 2, 5, 7, 7), 'type_dict': {'conv_data': np.float64}},
# {'ctx': mx.gpu(0), 'conv_data': (2, 2, 5, 7, 7), 'type_dict': {'conv_data': np.float64}},
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 0230d5f..3fbf98b 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -788,8 +788,9 @@ def check_deconvolution_gradient(input_shape, num_filter, pad):
During backward(), if the input of A equals output of B, and the output
of A equals input of B, then the grad of weight should be the same;
"""
- stride = (1, 1)
- kernel = (2*pad[0]+1, 2*pad[1]+1)
+ ndim = len(pad)
+ stride = (1,) * ndim
+ kernel = tuple(2 * np.array(pad) + 1)
data_conv = mx.sym.Variable(name="data_conv")
conv = mx.sym.Convolution(
data=data_conv, kernel=kernel, stride=stride, pad=pad,
@@ -848,10 +849,14 @@ def check_deconvolution_target_shape(input_shape, kernel, stride, pad, adj, targ
data=data, kernel=kernel, stride=stride, pad=pad, adj=adj, num_filter=5)
arg_names = deconv.list_arguments()
arg_shapes, out_shapes, _ = deconv.infer_shape(data=input_shape)
- assert out_shapes[0] == (input_shape[0], 5, 8, 8)
+ default_target_size = 8
+ if target_shape is None:
+ target_shape = (default_target_size,) * len(kernel)
+ assert out_shapes[0] == (input_shape[0], 5) + target_shape
def test_deconvolution():
+ # 2D
check_deconvolution_target_shape(
input_shape = (2,3,4,4),
kernel = (3,3),
@@ -898,6 +903,53 @@ def test_deconvolution():
num_filter = 3,
pad = (3,3)
)
+ # 1D
+ check_deconvolution_target_shape(
+ input_shape = (2,3,4),
+ kernel = (3,),
+ stride = (2,),
+ target_shape = (8,),
+ pad = (99,), # will be ignored
+ adj = (101,), # will be ignored
+ )
+ check_deconvolution_target_shape(
+ input_shape = (2,3,4),
+ kernel = (3,),
+ stride = (2,),
+ pad = (1,),
+ adj = (1,),
+ )
+ check_deconvolution_forward_backward(
+ input_shape = (1,1,5),
+ num_filter = 1,
+ kernel = (3,),
+ stride = (1,),
+ pad = (1,)
+ )
+ check_deconvolution_forward_backward(
+ input_shape = (32,3,28),
+ num_filter = 3,
+ kernel = (3,),
+ stride = (1,),
+ pad = (1,)
+ )
+ check_deconvolution_forward_backward(
+ input_shape = (10, 3, 403),
+ num_filter = 3,
+ kernel = (7,),
+ stride = (5,),
+ pad = (2,)
+ )
+ check_deconvolution_gradient(
+ input_shape = (1,3,5),
+ num_filter = 3,
+ pad = (1,)
+ )
+ check_deconvolution_gradient(
+ input_shape = (5,3,100),
+ num_filter = 3,
+ pad = (3,)
+ )
def check_nearest_upsampling_with_shape(shapes, scale, root_scale):
@@ -1022,74 +1074,79 @@ def test_batchnorm_training():
def test_convolution_grouping():
- num_filter = 4
- num_group = 2
- kernel = (3, 3)
- shape = (1, 4, 9, 9)
+ for dim in [1, 2, 3]:
+ num_filter = 4
+ num_group = 2
+ kernel = (3,) * dim
+ shape = (1, 4) + (9,) * dim
- x = mx.sym.Variable('x')
- w = mx.sym.Variable('w')
- b = mx.sym.Variable('b')
- y1 = mx.sym.Convolution(data=x, weight=w, bias=b, num_filter=num_filter, num_group=num_group, kernel=kernel)
- xslice = mx.sym.SliceChannel(data=x, num_outputs=num_group, axis=1)
- wslice = mx.sym.SliceChannel(data=w, num_outputs=num_group, axis=0)
- bslice = mx.sym.SliceChannel(data=b, num_outputs=num_group, axis=0)
- y2 = mx.sym.Concat(*[mx.sym.Convolution(data=xslice[i], weight=wslice[i], bias=bslice[i],
- num_filter=num_filter//num_group, kernel=kernel)
- for i in range(num_group)])
-
- exe1 = y1.simple_bind(default_context(), x=shape)
- exe2 = y2.simple_bind(default_context(), x=shape, w=(num_filter, shape[1]//num_group, kernel[0], kernel[1]), b=(num_filter,))
- for arr1, arr2 in zip(exe1.arg_arrays, exe2.arg_arrays):
- arr1[:] = np.random.normal(size=arr1.shape)
- arr2[:] = arr1
- exe1.forward(is_train=True)
- exe1.backward(exe1.outputs[0])
- exe2.forward(is_train=True)
- exe2.backward(exe2.outputs[0])
-
- for arr1, arr2 in zip(exe1.outputs + exe1.grad_arrays, exe2.outputs + exe2.grad_arrays):
- np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), rtol=1e-3, atol=1e-4)
+ x = mx.sym.Variable('x')
+ w = mx.sym.Variable('w')
+ b = mx.sym.Variable('b')
+ y1 = mx.sym.Convolution(data=x, weight=w, bias=b, num_filter=num_filter, num_group=num_group, kernel=kernel)
+ xslice = mx.sym.SliceChannel(data=x, num_outputs=num_group, axis=1)
+ wslice = mx.sym.SliceChannel(data=w, num_outputs=num_group, axis=0)
+ bslice = mx.sym.SliceChannel(data=b, num_outputs=num_group, axis=0)
+ y2 = mx.sym.Concat(*[mx.sym.Convolution(data=xslice[i], weight=wslice[i], bias=bslice[i],
+ num_filter=num_filter//num_group, kernel=kernel)
+ for i in range(num_group)])
+
+ exe1 = y1.simple_bind(default_context(), x=shape)
+ exe2 = y2.simple_bind(default_context(), x=shape, w=(num_filter, shape[1]//num_group) + kernel, b=(num_filter,))
+ for arr1, arr2 in zip(exe1.arg_arrays, exe2.arg_arrays):
+ arr1[:] = np.random.normal(size=arr1.shape)
+ arr2[:] = arr1
+ exe1.forward(is_train=True)
+ exe1.backward(exe1.outputs[0])
+ exe2.forward(is_train=True)
+ exe2.backward(exe2.outputs[0])
+
+ for arr1, arr2 in zip(exe1.outputs + exe1.grad_arrays, exe2.outputs + exe2.grad_arrays):
+ np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), rtol=1e-3, atol=1e-4)
@unittest.skip("test fails intermittently. temporarily disabled till it gets fixed. tracked at https://github.com/apache/incubator-mxnet/issues/8712")
def test_depthwise_convolution():
- for num_base in [1, 4, 16, 32, 64]:
- for kernel in [(3,3), (5,5)]:
- for stride in [(1,1), (2,2)]:
- for pad in [(0,0), (1,1)]:
- for in_size in [7, 32]:
- num_filter = num_base
- num_group = num_base
- shape = (2, num_base, in_size, in_size)
-
- x = mx.sym.Variable('x')
- w = mx.sym.Variable('w')
- b = mx.sym.Variable('b')
- y1 = mx.sym.Convolution(data=x, weight=w, bias=b, num_filter=num_filter, num_group=num_group,
- kernel=kernel, stride=stride, pad=pad)
- xslice = mx.sym.SliceChannel(data=x, num_outputs=num_group, axis=1)
- wslice = mx.sym.SliceChannel(data=w, num_outputs=num_group, axis=0)
- bslice = mx.sym.SliceChannel(data=b, num_outputs=num_group, axis=0)
- y2 = mx.sym.Concat(*[mx.sym.Convolution(data=xslice[i], weight=wslice[i], bias=bslice[i],
- num_filter=num_filter//num_group, kernel=kernel,
- stride=stride, pad=pad)
- for i in range(num_group)])
-
- dev = default_context()
- exe1 = y1.simple_bind(dev, x=shape)
- exe2 = y2.simple_bind(mx.cpu(), x=shape, w=(num_filter, shape[1]//num_group, kernel[0], kernel[1]),
- b=(num_filter,))
- for arr1, arr2 in zip(exe1.arg_arrays, exe2.arg_arrays):
- arr1[:] = np.random.normal(size=arr1.shape)
- arr2[:] = arr1
- exe1.forward(is_train=True)
- exe1.backward(exe1.outputs[0])
- exe2.forward(is_train=True)
- exe2.backward(exe2.outputs[0])
-
- for arr1, arr2 in zip(exe1.outputs + exe1.grad_arrays, exe2.outputs + exe2.grad_arrays):
- np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), rtol=1e-3, atol=1e-4)
+ for dim in [1,2]:
+ for num_base in [1, 4, 16, 32, 64]:
+ for kernel_x in [3, 5]:
+ for stride_x in [1, 2]:
+ for pad_x in [0, 1]:
+ for in_size in [7, 32]:
+ kernel = (kernel_x,) * dim
+ stride = (stride_x,) * dim
+ pad = (pad_x,) * dim
+ num_filter = num_base
+ num_group = num_base
+ shape = (2, num_base) + (in_size,) * dim
+
+ x = mx.sym.Variable('x')
+ w = mx.sym.Variable('w')
+ b = mx.sym.Variable('b')
+ y1 = mx.sym.Convolution(data=x, weight=w, bias=b, num_filter=num_filter, num_group=num_group,
+ kernel=kernel, stride=stride, pad=pad)
+ xslice = mx.sym.SliceChannel(data=x, num_outputs=num_group, axis=1)
+ wslice = mx.sym.SliceChannel(data=w, num_outputs=num_group, axis=0)
+ bslice = mx.sym.SliceChannel(data=b, num_outputs=num_group, axis=0)
+ y2 = mx.sym.Concat(*[mx.sym.Convolution(data=xslice[i], weight=wslice[i], bias=bslice[i],
+ num_filter=num_filter//num_group, kernel=kernel,
+ stride=stride, pad=pad)
+ for i in range(num_group)])
+
+ dev = default_context()
+ exe1 = y1.simple_bind(dev, x=shape)
+ exe2 = y2.simple_bind(mx.cpu(), x=shape, w=(num_filter, shape[1]//num_group)+kernel,
+ b=(num_filter,))
+ for arr1, arr2 in zip(exe1.arg_arrays, exe2.arg_arrays):
+ arr1[:] = np.random.normal(size=arr1.shape)
+ arr2[:] = arr1
+ exe1.forward(is_train=True)
+ exe1.backward(exe1.outputs[0])
+ exe2.forward(is_train=True)
+ exe2.backward(exe2.outputs[0])
+
+ for arr1, arr2 in zip(exe1.outputs + exe1.grad_arrays, exe2.outputs + exe2.grad_arrays):
+ np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), rtol=1e-3, atol=1e-3)
def gen_broadcast_data(idx):
@@ -1361,9 +1418,14 @@ def test_broadcast_binary_op():
def test_run_convolution_dilated_impulse_response(dil=(1,1), kernel_shape=(3,3), verbose=False):
+ dim = len(dil)
+ assert(len(kernel_shape) == dim)
# Input for spike response
- spike_imgs = np.zeros(shape=(1,1,33,33), dtype=np.float32)
- spike_imgs[0,0,16,16] = 1.0
+ data_size = 33
+ data_shape = (1, 1) + (data_size,) * dim
+ center = (0,0) + (data_size // 2,) * dim
+ spike_imgs = np.zeros(shape=data_shape, dtype=np.float32)
+ spike_imgs[center] = 1.0
spike_img = mx.nd.array(spike_imgs)
spike_img2 = mx.nd.array(spike_imgs)
@@ -1381,14 +1443,14 @@ def test_run_convolution_dilated_impulse_response(dil=(1,1), kernel_shape=(3,3),
ndo = be.outputs[0]
out_grads = np.zeros(shape=be.outputs[0].shape, dtype=np.float32)
- out_grads[0,0, 16,16] = 1.0
+ out_grads[center] = 1.0
out_grad = mx.nd.array(out_grads)
be.backward([out_grad])
vgrad = be.grad_arrays[0].asnumpy()
- out = out_o.reshape((out_o.shape[2],out_o.shape[3]))
- nzx,nzy = np.nonzero(out)
- assert(np.sum(out)==np.prod(kernel_shape))
- assert(np.sum(vgrad)==np.prod(kernel_shape))
+ out = out_o.reshape(out_o.shape[2:])
+ nz_loc = np.nonzero(out)
+ assert_allclose(np.sum(out),np.prod(kernel_shape),atol=1e-5)
+ assert_allclose(np.sum(vgrad),np.prod(kernel_shape),atol=1e-5)
# Now check whether the input gradient was computed correctly
input_grad = mx.nd.array(vgrad)
@@ -1396,15 +1458,15 @@ def test_run_convolution_dilated_impulse_response(dil=(1,1), kernel_shape=(3,3),
be = net.bind(default_context(), args={ 'input' : input_grad, 'test_convolution_weight' : kernel_weights})
be.forward(True)
out_o = be.outputs[0].asnumpy()
- assert(out_o[0,0,16,16]==np.prod(kernel_shape))
+ assert_allclose(out_o[center],np.prod(kernel_shape),atol=1e-5)
rnd_kernel_s = np.random.uniform(low=0.0, high=1.0, size=tuple([1,1]+list(kernel_shape))).astype(np.float32)
impulse_error = mx.nd.array(out_o/np.sum(out_o)) # This should be 1.0 at [0,0,16,16]
rnd_kernel = mx.nd.array(rnd_kernel_s)
rnd_kernel2 = mx.nd.array(rnd_kernel_s)
- white_in = mx.nd.ones(shape=(1,1,33,33))
- white_in2 = mx.nd.ones(shape=(1,1,33,33))
+ white_in = mx.nd.ones(shape=data_shape)
+ white_in2 = mx.nd.ones(shape=data_shape)
be = net.bind(default_context(), args={ 'input' : white_in, 'test_convolution_weight' : rnd_kernel},
args_grad={'input' : white_in2, 'test_convolution_weight' : rnd_kernel2 } )
@@ -1421,10 +1483,15 @@ def test_run_convolution_dilated_impulse_response(dil=(1,1), kernel_shape=(3,3),
be.forward(True)
out = be.outputs[0].asnumpy()
# Now do a simple check of the kernel gradient
- assert(out[0,0,16,16] - np.sum(kernel_gradient) - out_orig[0,0,16,16] < 0.001)
+ assert(out[center] - np.sum(kernel_gradient) - out_orig[center] < 0.001)
def test_convolution_dilated_impulse_response():
+ # 1D
+ for dil in [ (1,), (2,), (3,) ]:
+ for ks in [ (1,), (2,), (3,), (4,)]:
+ test_run_convolution_dilated_impulse_response(dil=dil, kernel_shape=ks)
+ # 2D
for dil in [ (1,1), (2,2), (3,3) ]:
for ks in [ (3,3), (4,4), (2,3), (3,2), (1,1) ]:
test_run_convolution_dilated_impulse_response(dil=dil, kernel_shape=ks)
--
To stop receiving notification emails like this one, please contact
['"commits@mxnet.apache.org" <co...@mxnet.apache.org>'].