You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by jx...@apache.org on 2017/08/16 17:32:16 UTC
[incubator-mxnet] branch master updated: add depthwise
convolution's gpu version optimization (#7393)
This is an automated email from the ASF dual-hosted git repository.
jxie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new d7d31b2 add depthwise convolution's gpu version optimization (#7393)
d7d31b2 is described below
commit d7d31b2d9ce2eee98a3b9f41bc8c526a3125ce78
Author: shuqian.qu <qs...@gmail.com>
AuthorDate: Thu Aug 17 01:32:14 2017 +0800
add depthwise convolution's gpu version optimization (#7393)
* add depthwise convolution's gpu version optimization
* add more config for test_depthwise_convolution
* remove CUDA_1D_KERNEL_LOOP
* fix windows compiling error
* add support for kAddTo when cal input's backward
* remove depthwise_conv_off params
* Update convolution.cu
* Update test_operator.py
---
src/common/cuda_utils.h | 28 +-
src/operator/convolution.cu | 14 +
src/operator/depthwise_convolution-inl.h | 349 +++++++++++++++
src/operator/depthwise_convolution_tf.cuh | 703 ++++++++++++++++++++++++++++++
tests/python/unittest/test_operator.py | 38 ++
5 files changed, 1131 insertions(+), 1 deletion(-)
diff --git a/src/common/cuda_utils.h b/src/common/cuda_utils.h
index 8897007..483390f 100644
--- a/src/common/cuda_utils.h
+++ b/src/common/cuda_utils.h
@@ -153,6 +153,16 @@ inline const char* CurandGetErrorString(curandStatus_t status) {
return "Unknown cuRAND status";
}
+template <typename DType>
+inline DType __device__ CudaMax(DType a, DType b) {
+ return a > b ? a : b;
+}
+
+template <typename DType>
+inline DType __device__ CudaMin(DType a, DType b) {
+ return a < b ? a : b;
+}
+
} // namespace cuda
} // namespace common
} // namespace mxnet
@@ -219,6 +229,14 @@ inline const char* CurandGetErrorString(curandStatus_t status) {
<< "cuRAND: " << common::cuda::CurandGetErrorString(e); \
}
+#if !defined(_MSC_VER)
+#define CUDA_UNROLL _Pragma("unroll")
+#define CUDA_NOUNROLL _Pragma("nounroll")
+#else
+#define CUDA_UNROLL
+#define CUDA_NOUNROLL
+#endif
+
/*!
* \brief Determine major version number of the gpu's cuda compute architecture.
* \param device_id The device index of the cuda-capable gpu of interest.
@@ -291,7 +309,6 @@ inline bool GetEnvAllowTensorCore() {
return dmlc::GetEnv("MXNET_CUDA_ALLOW_TENSOR_CORE",
dmlc::optional<bool>(default_value)).value();
}
-
#endif // MXNET_USE_CUDA
#if MXNET_USE_CUDNN
@@ -401,6 +418,15 @@ static inline __device__ void atomicAdd(mshadow::half::half_t *address,
old = atomicCAS(address_as_ui, assumed, old);
} while (assumed != old);
}
+
+template <typename DType>
+__device__ inline DType ldg(const DType* address) {
+#if __CUDA_ARCH__ >= 350
+ return __ldg(address);
+#else
+ return *address;
+#endif
+}
#endif
#endif // MXNET_COMMON_CUDA_UTILS_H_
diff --git a/src/operator/convolution.cu b/src/operator/convolution.cu
index ab35484..f5777c1 100644
--- a/src/operator/convolution.cu
+++ b/src/operator/convolution.cu
@@ -29,6 +29,8 @@
#include "./cudnn_convolution-inl.h"
#endif // MXNET_USE_CUDNN
+#include "./depthwise_convolution-inl.h"
+
namespace mxnet {
namespace op {
@@ -45,6 +47,18 @@ Operator* CreateOp<gpu>(ConvolutionParam param, int dtype,
})
return op;
}
+
+ // depth wise conv
+ if (param.num_filter == param.num_group &&
+ param.layout.value() == mshadow::kNCHW &&
+ param.num_filter == (*in_shape)[conv::kData][1] &&
+ param.kernel.ndim() == 2 &&
+ param.dilate == mshadow::Shape2(1, 1) &&
+ dtype == mshadow::kFloat32) {
+ op = new DepthwiseConvolutionOp<float>(param, *in_shape, *out_shape);
+ return op;
+ }
+
#if MXNET_USE_CUDNN == 1
// The NVIDIA Pascal architecture was the first to include 16-bit ALUs.
// Thus, when the framework is compiled with MSHADOW_USE_PASCAL == 1, we
diff --git a/src/operator/depthwise_convolution-inl.h b/src/operator/depthwise_convolution-inl.h
new file mode 100644
index 0000000..5beea45
--- /dev/null
+++ b/src/operator/depthwise_convolution-inl.h
@@ -0,0 +1,349 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file depthwise_convolution-inl.h
+ * \brief CUDA depthwise convolution code
+ * \author shuqian.qu@hobot.cc
+*/
+#ifndef MXNET_OPERATOR_DEPTHWISE_CONVOLUTION_INL_H_
+#define MXNET_OPERATOR_DEPTHWISE_CONVOLUTION_INL_H_
+#include <algorithm>
+#include <vector>
+#include "./convolution-inl.h"
+#include "../common/cuda_utils.h"
+
+#if MXNET_USE_CUDA
+#include <cub/cub.cuh>
+#include "./depthwise_convolution_tf.cuh"
+
+namespace mxnet {
+namespace op {
+using namespace tf::depthwise_conv;
+template<typename DType>
+class DepthwiseConvolutionOp : public Operator {
+ public:
+ explicit DepthwiseConvolutionOp(const ConvolutionParam& param,
+ const std::vector<TShape>& in_shape,
+ const std::vector<TShape>& out_shape) {
+ args_.batch = in_shape[conv::kData][0];
+ args_.in_channel = in_shape[conv::kData][1];
+ args_.in_height = in_shape[conv::kData][2];
+ args_.in_width = in_shape[conv::kData][3];
+ args_.filter_height = in_shape[conv::kWeight][2];
+ args_.filter_width = in_shape[conv::kWeight][3];
+ args_.stride_height = param.stride[0];
+ args_.stride_width = param.stride[1];
+ args_.pad_height = param.pad[0];
+ args_.pad_width = param.pad[1];
+ args_.out_channel = out_shape[conv::kOut][1];
+ args_.out_height = out_shape[conv::kOut][2];
+ args_.out_width = out_shape[conv::kOut][3];
+ bias_term_ = !param.no_bias;
+ }
+
+ ~DepthwiseConvolutionOp() {}
+
+ virtual void Forward(const OpContext &ctx,
+ const std::vector<TBlob> &in_data,
+ const std::vector<OpReqType> &req,
+ const std::vector<TBlob> &out_data,
+ const std::vector<TBlob> &aux_args);
+
+ virtual void Backward(const OpContext &ctx,
+ const std::vector<TBlob> &out_grad,
+ const std::vector<TBlob> &in_data,
+ const std::vector<TBlob> &out_data,
+ const std::vector<OpReqType> &req,
+ const std::vector<TBlob> &in_grad,
+ const std::vector<TBlob> &aux_args);
+
+ private:
+ DepthwiseArgs args_;
+ bool bias_term_;
+}; // class DepthwiseConvolutionOp
+
+namespace depthwise_conv {
+namespace cuda {
+template<typename DType, int kFilterWidth, int kFilterHeight>
+__global__ void __launch_bounds__(1024, 2)
+DepthwiseConv2dBackwardFilterKernel(const DepthwiseArgs args,
+ const DType* out_grad,
+ const DType* input,
+ DType* filter_grad) {
+ const int in_height = args.in_height;
+ const int in_width = args.in_width;
+ const int channel = args.in_channel;
+ const int filter_height = kFilterHeight > 0 ? kFilterHeight : args.filter_height;
+ const int filter_width = kFilterWidth > 0 ? kFilterWidth : args.filter_width;
+ const int stride_height = args.stride_height;
+ const int stride_width = args.stride_width;
+ const int pad_height = args.pad_height;
+ const int pad_width = args.pad_width;
+ const int out_height = args.out_height;
+ const int out_width = args.out_width;
+
+ const int filter_pixels = filter_width * filter_height;
+ const int out_pixels = out_height * out_width;
+ const int in_pixels = in_height * in_width;
+ const int batch_channel_num = channel * args.batch;
+ const int candidate_reduce_thread_num = out_pixels % blockDim.x;
+
+ for (int b = blockIdx.x; b < batch_channel_num; b += gridDim.x) {
+ const int local_batch = b / channel;
+ const int local_channel = b % channel;
+ const int filter_offset_temp = local_channel * filter_pixels;
+ const int out_grad_offset_temp = (local_batch * channel * out_pixels) +
+ (local_channel * out_pixels);
+
+ for (int out_id = threadIdx.x; out_id < out_pixels; out_id += blockDim.x) {
+ const int reduce_thread_num = ((out_pixels - out_id) > candidate_reduce_thread_num) ?
+ blockDim.x : candidate_reduce_thread_num;
+
+ const int out_w = out_id % out_width;
+ const int out_h = (out_id / out_width) % out_height;
+ const int out_grad_offset = out_grad_offset_temp + (out_h * out_width) + (out_w);
+ const DType out_g = ldg(out_grad + out_grad_offset);
+
+ const int in_h_start = out_h * stride_height - pad_height;
+ const int in_w_start = out_w * stride_width - pad_width;
+ CUDA_UNROLL for (int f_h = 0; f_h < filter_height; ++f_h) {
+ const int in_h = in_h_start + f_h;
+ const int input_offset_temp = (local_batch * channel * in_pixels) +
+ (local_channel * in_pixels) + (in_h * in_width);
+ const int filter_offset_h = filter_width * f_h;
+
+ CUDA_UNROLL for (int f_w = 0; f_w < filter_width; ++f_w) {
+ const int in_w = in_w_start + f_w;
+ DType partial_grad = DType(0.0f);
+ if (in_h >= 0 && in_h < in_height && in_w >= 0 && in_w < in_width) {
+ const int input_offset = input_offset_temp + in_w;
+ partial_grad = ldg(input + input_offset) * out_g;
+ }
+ // reduce all valid partial grad in a block
+ typedef cub::BlockReduce<DType, mshadow::cuda::kBaseThreadNum> BlockReduceT;
+ __shared__ typename BlockReduceT::TempStorage temp_storage_reduce;
+ DType aggregate = BlockReduceT(temp_storage_reduce).Sum(partial_grad, reduce_thread_num);
+ if (threadIdx.x == 0) {
+ DType* addr = filter_grad + f_w + filter_offset_h + filter_offset_temp;
+ atomicAdd(addr, aggregate);
+ }
+ __syncthreads();
+ } // for filter_width
+ } // for filter_height
+ } // for out_pixels
+ __syncthreads();
+ } // for batch_channel_num
+}
+} // namespace cuda
+
+template<typename DType>
+void DepthwiseConv2dForwardGpu(mshadow::Stream<gpu> *stream,
+ const DepthwiseArgs& args,
+ const std::vector<TBlob> &in_data,
+ const std::vector<TBlob> &out_data) {
+ using namespace mshadow;
+ using namespace mshadow::expr;
+ using namespace tf::depthwise_conv;
+ using namespace tf::depthwise_conv::cuda;
+ Tensor<gpu, 4, DType> data = in_data[conv::kData].get<gpu, 4, DType>(stream);
+ Tensor<gpu, 4, DType> weight = in_data[conv::kWeight].get<gpu, 4, DType>(stream);
+ Tensor<gpu, 4, DType> out = out_data[conv::kOut].get<gpu, 4, DType>(stream);
+
+ // select kernel
+ if (CanLaunchDepthwiseConv2dGPUSmall(args)) {
+ LaunchDepthwiseConv2dGPUSmall<DType, DIRECTION_FORWARD>(
+ stream,
+ args,
+ data.dptr_,
+ weight.dptr_,
+ out.dptr_);
+ } else {
+ int num_output = out_data[conv::kOut].shape_.Size();
+ int block_num = std::min(num_output/mshadow::cuda::kBaseThreadNum + 1,
+ mshadow::cuda::kMaxGridNum);
+ auto s = mshadow::Stream<gpu>::GetStream(stream);
+ if (args.filter_height == 3 && args.filter_width == 3) {
+ DepthwiseConv2dForwardKernel<DType, 3, 3>
+ <<<block_num, mshadow::cuda::kBaseThreadNum, 0, s>>>(data.dptr_,
+ weight.dptr_,
+ args,
+ num_output,
+ out.dptr_);
+ } else {
+ DepthwiseConv2dForwardKernel<DType, -1, -1>
+ <<<block_num, mshadow::cuda::kBaseThreadNum, 0, s>>>(data.dptr_,
+ weight.dptr_,
+ args,
+ num_output,
+ out.dptr_);
+ }
+ MSHADOW_CUDA_POST_KERNEL_CHECK(DepthwiseConv2dForwardKernel);
+ }
+}
+
+template<typename DType>
+void DepthwiseConv2dBackwardDataGpu(mshadow::Stream<gpu> *stream,
+ const DepthwiseArgs& args,
+ const std::vector<TBlob> &out_grad,
+ const std::vector<TBlob> &in_data,
+ const std::vector<TBlob> &in_grad) {
+ using namespace mshadow;
+ using namespace mshadow::expr;
+ using namespace tf::depthwise_conv;
+ using namespace tf::depthwise_conv::cuda;
+ Tensor<gpu, 4, DType> out_g = out_grad[conv::kOut].get<gpu, 4, DType>(stream);
+ Tensor<gpu, 4, DType> weight = in_data[conv::kWeight].get<gpu, 4, DType>(stream);
+ Tensor<gpu, 4, DType> in_data_g = in_grad[conv::kData].get<gpu, 4, DType>(stream);
+ // select kernel
+ if (CanLaunchDepthwiseConv2dGPUSmall(args)) {
+ LaunchDepthwiseConv2dGPUSmall<DType, DIRECTION_BACKWARD>(
+ stream,
+ args,
+ out_g.dptr_,
+ weight.dptr_,
+ in_data_g.dptr_);
+ } else {
+ int num_in_grad = in_grad[conv::kData].shape_.Size();
+ auto s = mshadow::Stream<gpu>::GetStream(stream);
+ int block_num = std::min(num_in_grad/mshadow::cuda::kBaseThreadNum + 1,
+ mshadow::cuda::kMaxGridNum);
+ DepthwiseConv2dBackwardDataKernel<DType>
+ <<<block_num, mshadow::cuda::kBaseThreadNum, 0, s>>>(args,
+ out_g.dptr_,
+ weight.dptr_,
+ in_data_g.dptr_,
+ num_in_grad);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(DepthwiseConv2dBackwardDataKernel);
+ }
+}
+
+template<typename DType>
+void DepthwiseConv2dBackwardFilterGpu(mshadow::Stream<gpu> *stream,
+ const DepthwiseArgs& args,
+ const std::vector<TBlob> &out_grad,
+ const std::vector<TBlob> &in_data,
+ const std::vector<TBlob> &in_grad) {
+ using namespace mshadow;
+ using namespace mshadow::expr;
+ using namespace tf::depthwise_conv;
+ Tensor<gpu, 4, DType> out_g = out_grad[conv::kOut].get<gpu, 4, DType>(stream);
+ Tensor<gpu, 4, DType> in_d = in_data[conv::kData].get<gpu, 4, DType>(stream);
+ Tensor<gpu, 4, DType> weight_grad = in_grad[conv::kWeight].get<gpu, 4, DType>(stream);
+ // select kernel
+ if (TryLaunchDepthwiseConv2dBackwardFilterGPUSmall<DType>(stream, args,
+ out_g.dptr_,
+ in_d.dptr_,
+ weight_grad.dptr_)) {
+ return;
+ } else {
+ int num_out_grad = out_grad[conv::kOut].shape_.Size();
+ auto s = mshadow::Stream<gpu>::GetStream(stream);
+ int block_num = std::min(args.out_channel * args.batch, mshadow::cuda::kMaxGridNum);
+ if (args.filter_width == 3 && args.filter_height == 3) {
+ cuda::DepthwiseConv2dBackwardFilterKernel<DType, 3, 3>
+ <<<block_num, mshadow::cuda::kBaseThreadNum, 0, s>>>(args,
+ out_g.dptr_,
+ in_d.dptr_,
+ weight_grad.dptr_);
+ } else {
+ cuda::DepthwiseConv2dBackwardFilterKernel<DType, -1, -1>
+ <<<block_num, mshadow::cuda::kBaseThreadNum, 0, s>>>(args,
+ out_g.dptr_,
+ in_d.dptr_,
+ weight_grad.dptr_);
+ }
+ MSHADOW_CUDA_POST_KERNEL_CHECK(DepthwiseConv2dBackwardFilterKernel);
+ }
+}
+} // namespace depthwise_conv
+
+template<typename DType>
+void DepthwiseConvolutionOp<DType>::Forward(const OpContext &ctx,
+ const std::vector<TBlob> &in_data,
+ const std::vector<OpReqType> &req,
+ const std::vector<TBlob> &out_data,
+ const std::vector<TBlob> &aux_states) {
+ using namespace mshadow;
+ using namespace mshadow::expr;
+ auto stream = ctx.get_stream<gpu>();
+ CHECK_EQ(req[conv::kOut], kWriteTo);
+ // output forward
+ depthwise_conv::DepthwiseConv2dForwardGpu<DType>(stream, args_, in_data, out_data);
+
+ // bias forward
+ if (bias_term_) {
+ Tensor<gpu, 1, DType> bias = in_data[conv::kBias].get<gpu, 1, DType>(stream);
+ Tensor<gpu, 3, DType> output_3d = out_data[conv::kOut].get_with_shape<gpu, 3, DType>(
+ Shape3(args_.batch, args_.out_channel, args_.out_height * args_.out_width), stream);
+ // has bias term, broadcast it to the same shape of output_3d in channel dim
+ output_3d += mshadow::expr::broadcast<1>(bias, output_3d.shape_);
+ }
+}
+
+template<typename DType>
+void DepthwiseConvolutionOp<DType>::Backward(const OpContext &ctx,
+ const std::vector<TBlob> &out_grad,
+ const std::vector<TBlob> &in_data,
+ const std::vector<TBlob> &out_data,
+ const std::vector<OpReqType> &req,
+ const std::vector<TBlob> &in_grad,
+ const std::vector<TBlob> &aux_states) {
+ using namespace mshadow;
+ using namespace mshadow::expr;
+ auto stream = ctx.get_stream<gpu>();
+ // backward data
+ if (req[conv::kData] != kNullOp) {
+ if (req[conv::kData] != kAddTo) {
+ mshadow::Tensor<gpu, 4, DType> igrad = in_grad[conv::kData].get<gpu, 4, DType>(stream);
+ igrad = 0.0f;
+ }
+ depthwise_conv::DepthwiseConv2dBackwardDataGpu<DType>(stream,
+ args_,
+ out_grad,
+ in_data,
+ in_grad);
+ }
+
+ // backward filter
+ if (req[conv::kWeight] != kNullOp) {
+ if (req[conv::kWeight] != kAddTo) {
+ mshadow::Tensor<gpu, 4, DType> wgrad = in_grad[conv::kWeight].get<gpu, 4, DType>(stream);
+ wgrad = 0.0f;
+ }
+ depthwise_conv::DepthwiseConv2dBackwardFilterGpu<DType>(stream,
+ args_,
+ out_grad,
+ in_data,
+ in_grad);
+ }
+
+ // backward bias
+ if (bias_term_) {
+ Tensor<gpu, 1, DType> dbias = in_grad[conv::kBias].get<gpu, 1, DType>(stream);
+ Tensor<gpu, 3, DType> dout = out_grad[conv::kOut].get_with_shape<gpu, 3, DType>(
+ Shape3(args_.batch, args_.out_channel, args_.out_height * args_.out_width), stream);
+ ASSIGN_DISPATCH(dbias, req[conv::kBias], sumall_except_dim<1>(dout));
+ }
+}
+} // namespace op
+} // namespace mxnet
+#endif
+
+#endif // MXNET_OPERATOR_DEPTHWISE_CONVOLUTION_INL_H_
diff --git a/src/operator/depthwise_convolution_tf.cuh b/src/operator/depthwise_convolution_tf.cuh
new file mode 100644
index 0000000..a1538b6
--- /dev/null
+++ b/src/operator/depthwise_convolution_tf.cuh
@@ -0,0 +1,703 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file depthwise_convolution_tf.cuh
+ * \brief some depthwise convolution CUDA kernel code. The main logic comes
+ * from tensorflow, but the filter's layerout and many argument names
+ * are different with origin version.
+ * \author shuqian.qu@hobot.cc
+*/
+#ifndef MXNET_OPERATOR_DEPTHWISE_CONVOLUTION_TF_CUH_
+#define MXNET_OPERATOR_DEPTHWISE_CONVOLUTION_TF_CUH_
+#include "../common/cuda_utils.h"
+#include "./mxnet_op.h"
+
+namespace tf {
+namespace depthwise_conv {
+struct DepthwiseArgs {
+ // Input layer dimensions
+ int batch;
+ int in_height;
+ int in_width;
+ int in_channel;
+ int filter_height;
+ int filter_width;
+ int stride_height;
+ int stride_width;
+ int pad_height;
+ int pad_width;
+
+ // Output layer dimensions
+ int out_height;
+ int out_width;
+ int out_channel;
+};
+
+namespace cuda {
+template<typename DType, int kFilterHeight, int kFilterWidth>
+__global__ void __launch_bounds__(1024, 2)
+DepthwiseConv2dForwardKernel(const DType* input,
+ const DType* filter,
+ const DepthwiseArgs args,
+ int num_outputs,
+ DType* output) {
+ const int in_channel = args.in_channel;
+ const int in_height = args.in_height;
+ const int in_width = args.in_width;
+ const int filter_height = kFilterHeight > 0 ? kFilterHeight : args.filter_height;
+ const int filter_width = kFilterWidth > 0 ? kFilterWidth : args.filter_width;
+ const int stride_height = args.stride_height;
+ const int stride_width = args.stride_width;
+ const int pad_height = args.pad_height;
+ const int pad_width = args.pad_width;
+ const int out_channel = args.out_channel;
+ const int out_height = args.out_height;
+ const int out_width = args.out_width;
+
+ CUDA_KERNEL_LOOP(thread_id, num_outputs) {
+ // Compute the indexes of this thread in the output.
+ //
+ // We want coalesced reads so we make sure that each warp reads
+ // a contiguous chunk of memory.
+ //
+ // THIS IS PROBABLY WRONG, we are not doing coalesced reads
+ // into the input, because of the depth multiplier division...
+ const int out_w = thread_id % out_width;
+ const int out_h = (thread_id / out_width) % out_height;
+ const int out_c = (thread_id / out_width / out_height) % out_channel;
+ const int out_b = thread_id / out_width / out_height / out_channel;
+ const int in_c = out_c;
+
+ // Data is stored in the following format (let's assume we
+ // flatten the height and width into one contiguous dimension
+ // called "P".
+ //
+ // B1C1P1 B1C1P2 ..... B1C2P1 B1C2P2 ....
+ // B2C1P1 B2C1P2 ..... B2C2P1 B2C2P2 ....
+ //
+ // Each row contains in_channel * in_height * in_width values
+ // for each sample in the batch.
+ //
+ // We can further flatten it into:
+ //
+ // B1C1P1 B1C1P2 .....
+ // B1C2P1 B1C2P2 ....
+ // B2C1P1 B2C1P2 .....
+ // B2C2P1 B2C2P2 ....
+ //
+ // where each row is a contiguous array of all of the spatial
+ // pixels for a given batch and input depth. The following
+ // loop unrolls across the filter dimensions for a given thread,
+ // indexing into the filter value and the corresponding input
+ // patch.
+ //
+ // We can compute the index into the patch once right here.
+ const int input_offset_temp = (out_b * in_channel + in_c) * (in_height * in_width);
+ const int filter_offset_temp = in_c * filter_height * filter_width;
+
+ // Finally, we can iterate over the spatial dimensions and perform the
+ // convolution, writing into the output at the end.
+ //
+ // We perform an additional optimization, where we can determine
+ // whether the patch fits within the image indices statically, and
+ // avoid boundary checking within the loop.
+ const int input_h_start = out_h * stride_height - pad_height;
+ const int input_w_start = out_w * stride_width - pad_width;
+ const int input_h_end = input_h_start + filter_height;
+ const int input_w_end = input_w_start + filter_width;
+
+ DType sum = 0;
+ if (input_h_start >= 0 && input_w_start >= 0 &&
+ input_h_end < in_height && input_w_end < in_width) {
+ // Loop that doesn't need to check for boundary conditions.
+ CUDA_UNROLL for (int f_h = 0; f_h < filter_height; ++f_h) {
+ const int in_h = input_h_start + f_h;
+ const int filter_offset_h = filter_width * f_h;
+ CUDA_UNROLL for (int f_w = 0; f_w < filter_width; ++f_w) {
+ const int in_w = input_w_start + f_w;
+ const int input_offset = (input_offset_temp) + (in_h * in_width) + in_w;
+ const int filter_offset = filter_offset_temp + filter_offset_h + f_w;
+ sum += ldg(input + input_offset) * ldg(filter + filter_offset);
+ }
+ }
+ } else {
+ // Loop that needs to check for boundary conditions.
+ CUDA_UNROLL for (int f_h = 0; f_h < filter_height; ++f_h) {
+ const int in_h = input_h_start + f_h;
+ const int filter_offset_h = filter_width * f_h;
+ CUDA_UNROLL for (int f_w = 0; f_w < filter_width; ++f_w) {
+ const int in_w = input_w_start + f_w;
+ // TODO(vrv): the in_h check can be done outside of this loop;
+ // benchmark both methods to determine the better decision.
+ if (in_h >= 0 && in_h < in_height && in_w >= 0 && in_w < in_width) {
+ const int in_w = input_w_start + f_w;
+ const int input_offset = input_offset_temp + (in_h * in_width) + in_w;
+ const int filter_offset = filter_offset_temp + filter_offset_h + f_w;
+ sum += ldg(input + input_offset) * ldg(filter + filter_offset);
+ }
+ }
+ }
+ }
+ output[thread_id] = sum;
+ }
+}
+
+// The DepthwiseConv2dKernelSmall perform either forward or backward input
+// convolution depending on a template argument of this enum.
+enum DepthwiseConv2dDirection { DIRECTION_FORWARD, DIRECTION_BACKWARD };
+
+// CUDA kernel to compute the depthwise convolution forward pass in NCHW format,
+// tailored for small images up to 32x32. Only use this kernel if
+// CanLaunchDepthwiseConv2dGPUSmall(args) returns true.
+// Tiles of the input and filter tensors are loaded into shared memory before
+// performing the convolution. Each thread handles two elements per iteration,
+// one each in the lower and upper half of a tile.
+// Backward input direction is the same as forward direction with the filter
+// rotated by 180°.
+template <typename DType, DepthwiseConv2dDirection kDirection,
+ int kBlockSlices, bool kEvenHeight, int kFilterHeight, int kFilterWidth>
+__global__ __launch_bounds__(1024, 2) void DepthwiseConv2dKernelSmall(
+ const DepthwiseArgs args, const DType* input, const DType* filter, DType* output) {
+ extern __shared__ __align__(sizeof(DType)) unsigned char shared_memory[];
+ DType* const shared_data = reinterpret_cast<DType*>(shared_memory);
+
+ const int in_height = args.in_height;
+ const int in_width = args.in_width;
+ const int in_channel = args.in_channel;
+ const int filter_height = kFilterHeight > 0 ? kFilterHeight : args.filter_height;
+ const int filter_width = kFilterWidth > 0 ? kFilterWidth : args.filter_width;
+ const int pad_height = args.pad_height;
+ const int pad_width = args.pad_width;
+
+ // Fixed blockDim.z, tailored for maximum grid size for images of size 16x16.
+ const int block_height = blockDim.y;
+
+ // These values are the same for all threads and could
+ // be precomputed on the CPU.
+ const int block_pixels = in_width * block_height;
+ const int block_size = block_pixels * kBlockSlices;
+ const int in_pixels = in_width * in_height;
+ const int in_increment = in_width - 1;
+ const int filter_pixels = filter_height * filter_width;
+ const int tile_width = in_width + filter_width - 1;
+ const int even_height = kEvenHeight || (1 & ~in_height);
+ const int tile_height = in_height + filter_height - even_height;
+ const int tile_pixels = tile_width * tile_height;
+ const int tile_size = tile_pixels * kBlockSlices;
+ const int tile_offset = block_height * tile_width;
+ const int pad_offset = pad_height * tile_width + pad_width;
+ const int in_slices = in_channel * args.batch;
+ const int in_blocks = (in_slices + kBlockSlices - 1) / kBlockSlices;
+
+ const int thread_width = threadIdx.x;
+ const int thread_height = threadIdx.y;
+ const int thread_channel = threadIdx.z;
+
+ // Position in block.
+ const int thread_pix = thread_height * in_width + thread_width;
+ const int thread_idx = thread_channel * block_pixels + thread_pix;
+
+ // Initialize tile, in particular the padding.
+ for (int i = thread_idx; i < tile_size; i += block_size) {
+ shared_data[i] = DType(0);
+ }
+ __syncthreads();
+
+ // Position in tensors.
+ const int tensor_idx = thread_channel * in_pixels + thread_pix;
+
+ // Position in (padded) shared memory.
+ const int data_pix = thread_height * tile_width + thread_width;
+ const int data_idx = thread_channel * tile_pixels + data_pix;
+
+ // Position in shared memory, offset by pad_height / pad_width.
+ const int tile_idx = data_idx + pad_offset;
+
+ const int filter_pix = thread_pix;
+ const int filter_channel = thread_channel;
+ const int filter_idx = filter_pixels * filter_channel + filter_pix;
+
+ const int max_slice = in_slices - thread_channel;
+ const int filter_write_offset = filter_pix < filter_pixels ? tile_size + filter_idx : 0;
+ const int filter_read_offset = tile_size +
+ (kDirection == DIRECTION_FORWARD ?
+ filter_pixels * filter_channel : filter_pixels * (filter_channel + 1));
+ const bool skip_second = !kEvenHeight && thread_height + (in_height & 1) == block_height;
+
+ for (int b = blockIdx.x; b < in_blocks; b += gridDim.x) {
+ const int slice = b * kBlockSlices;
+
+ const int inout_offset = slice * in_pixels + tensor_idx;
+ const bool slice_in_range = slice < max_slice;
+
+ if (slice_in_range) {
+ const DType* const in_ptr = inout_offset + input;
+ DType* const tile_ptr = tile_idx + shared_data;
+ tile_ptr[0] = ldg(in_ptr);
+ if (!skip_second) {
+ tile_ptr[tile_offset] = ldg(block_pixels + in_ptr);
+ }
+ }
+
+ if (filter_write_offset != 0) {
+ const int filter_offset = ((slice + filter_channel) % in_channel)* filter_pixels + filter_pix;
+ shared_data[filter_write_offset] = ldg(filter_offset + filter);
+ }
+
+ // Note: the condition to reach this is uniform across the entire block.
+ __syncthreads();
+
+ if (slice_in_range) {
+ DType sum1 = 0;
+ DType sum2 = 0;
+ int shared_offset = data_idx;
+ const DType* filter_ptr = filter_read_offset + shared_data;
+ CUDA_UNROLL for (int r = 0; r < filter_height; ++r) {
+ CUDA_UNROLL for (int c = 0; c < filter_width; ++c) {
+ if (kDirection == DIRECTION_BACKWARD) {
+ filter_ptr--;
+ }
+ const DType filter_value = *filter_ptr;
+ const DType* const tile_ptr = shared_offset + shared_data;
+ sum1 += filter_value * tile_ptr[0];
+ sum2 += filter_value * tile_ptr[tile_offset];
+ ++shared_offset;
+ if (kDirection == DIRECTION_FORWARD) {
+ filter_ptr++;
+ }
+ }
+ shared_offset += in_increment;
+ }
+ DType* const out_ptr = inout_offset + output;
+ if (kDirection == DIRECTION_FORWARD) {
+ out_ptr[0] = sum1;
+ if (!skip_second) {
+ out_ptr[block_pixels] = sum2;
+ }
+ } else {
+ out_ptr[0] += sum1;
+ if (!skip_second) {
+ out_ptr[block_pixels] += sum2;
+ }
+ }
+ }
+
+ // Note: the condition to reach this is uniform across the entire block.
+ __syncthreads();
+ }
+}
+
+template<typename DType>
+__global__ void __launch_bounds__(640, 2)
+DepthwiseConv2dBackwardDataKernel(const DepthwiseArgs args,
+ const DType* out_grad,
+ const DType* filter, DType* in_grad,
+ int num_in_grad) {
+ const int channel = args.in_channel;
+ const int in_height = args.in_height;
+ const int in_width = args.in_width;
+ const int filter_height = args.filter_height;
+ const int filter_width = args.filter_width;
+ const int stride_height = args.stride_height;
+ const int stride_width = args.stride_width;
+ const int pad_height = args.pad_height;
+ const int pad_width = args.pad_width;
+ const int out_height = args.out_height;
+ const int out_width = args.out_width;
+
+ const int in_pixels = in_height * in_width;
+ const int out_pixels = out_height * out_width;
+
+ CUDA_KERNEL_LOOP(thread_id, num_in_grad) {
+ // Compute the indexes of this thread in the input.
+ const int in_w = thread_id % in_width;
+ const int in_h = (thread_id / in_width) % in_height;
+ const int channel_idx = (thread_id / in_width / in_height) % channel;
+ const int batch_idx = thread_id / channel / in_width / in_height;
+ DType sum = 0.0f;
+
+ const int out_h_start = mxnet::common::cuda::CudaMax<int>(
+ 0, (in_h - filter_height + pad_height + stride_height) / stride_height);
+ const int out_h_end = mxnet::common::cuda::CudaMin(
+ out_height - 1, (in_h + pad_height) / stride_height);
+ const int out_w_start = mxnet::common::cuda::CudaMax<int>(
+ 0, (in_w - filter_width + pad_width + stride_width) / stride_width);
+ const int out_w_end = mxnet::common::cuda::CudaMin(
+ out_width - 1, (in_w + pad_width) / stride_width);
+
+ const int filter_offset_temp = channel_idx * filter_height * filter_width;
+ const int out_grad_offset_temp = (batch_idx * channel * out_pixels) +
+ (channel_idx * out_pixels);
+
+ for (int out_h = out_h_start; out_h <= out_h_end; ++out_h) {
+ const int f_h = in_h + pad_height - out_h * stride_height;
+ const int filter_offset_h = filter_offset_temp + f_h * filter_width;
+ const int out_grad_offset_h = out_grad_offset_temp + out_h * out_width;
+ for (int out_w = out_w_start; out_w <= out_w_end; ++out_w) {
+ const int f_w = in_w + pad_width - out_w * stride_width;
+ const int filter_offset = filter_offset_h + f_w;
+ const int out_grad_offset = out_grad_offset_h + out_w;
+ sum += ldg(out_grad + out_grad_offset) * ldg(filter + filter_offset);
+ }
+ }
+ const int in_grad_offset = (batch_idx * channel * in_pixels) +
+ (channel_idx * in_pixels) + (in_h * in_width) + (in_w);
+ in_grad[in_grad_offset] += sum;
+ }
+}
+
+// CUDA kernel to compute the depthwise convolution backward w.r.t. filter in
+// NCHW format, tailored for small images up to 32x32. Only use this kernel if
+// CanLaunchDepthwiseConv2dGPUSmall(args) returns true.
+// Tiles of the input tensor are loaded into shared memory before performing the
+// convolution. Per iteration and filter element, each thread first performs
+// a partial convolution for two elements, one each in the lower and upper half
+// of a tile. The intermediate result of all pixels of a warp are then
+// accumulated and written to shared memory. Finally, the values in shared
+// memory are warp-accumulated (in chunks of kAccumPixels elements) and summed
+// up in global memory using atomics.
+// Requirements: threads per block must be multiple of 32 and <= launch_bounds,
+// kAccumPixels * 64 >= args.in_height * args.in_width * kBlockSlices.
+template <typename DType, int kBlockSlices, int kAccumPixels, int kFilterHeight, int kFilterWidth>
+__global__
+__launch_bounds__(1024, 2) void DepthwiseConv2dBackwardFilterKernelSmall(
+ const DepthwiseArgs args, const DType* output, const DType* input, DType* filter) {
+ extern __shared__ __align__(sizeof(DType)) unsigned char shared_memory[];
+ DType* const shared_data = reinterpret_cast<DType*>(shared_memory);
+
+ const int in_height = args.in_height;
+ const int in_width = blockDim.x; // slower (see b/62280718): args.in_width;
+ const int in_channel = args.in_channel;
+ const int filter_height = kFilterHeight > 0 ? kFilterHeight : args.filter_height;
+ const int filter_width = kFilterWidth > 0 ? kFilterWidth : args.filter_width;
+ const int pad_height = args.pad_height;
+ const int pad_width = args.pad_width;
+
+ const int block_height = blockDim.y;
+
+ // These values are the same for all threads and could
+ // be precomputed on the CPU.
+ const int block_pixels = in_width * block_height;
+ const int block_size = block_pixels * kBlockSlices;
+ assert((block_size & 31) == 0);
+ const int in_pixels = in_width * in_height;
+ const int in_increment = in_width - 1;
+ const int filter_pixels = filter_height * filter_width;
+ const int tile_width = in_width + filter_width - 1;
+ const int tile_height = 2 * block_height + filter_height - 1;
+ const int tile_pixels = tile_width * tile_height;
+ const int tile_size = tile_pixels * kBlockSlices;
+ const int tile_offset = block_height * tile_width;
+ const int pad_offset = pad_height * tile_width + pad_width;
+ const int in_slices = in_channel * args.batch;
+ const int in_blocks = (in_slices + kBlockSlices - 1) / kBlockSlices;
+ // The accumulator has a fixed number of pixels that can be reduced by one
+ // warp. Pixels beyond ceil(in_pixels * kBlockSlices / 64) are never written.
+ assert(kAccumPixels * 64 >= in_height * in_width * kBlockSlices);
+ const int accum_increment = kAccumPixels * kBlockSlices;
+ const int accum_size = filter_pixels * accum_increment;
+
+ const int thread_width = threadIdx.x;
+ const int thread_height = threadIdx.y;
+ const int thread_channel = threadIdx.z;
+
+ // Position in block.
+ const int thread_pix = thread_height * in_width + thread_width;
+ const int thread_idx = thread_channel * block_pixels + thread_pix;
+
+ // Initialize tile, in particular the padding and accumulator.
+ for (int i = thread_idx; i < tile_size + accum_size; i += block_size) {
+ shared_data[i] = DType(0);
+ }
+ __syncthreads();
+
+ // Position in tensors.
+ const int tensor_idx = thread_channel * in_pixels + thread_pix;
+
+ // Position in (padded) shared memory.
+ const int data_pix = thread_height * tile_width + thread_width;
+ const int data_idx = thread_channel * tile_pixels + data_pix;
+
+ // Position in shared memory, offset by pad_height / pad_width.
+ const int tile_idx = data_idx + pad_offset;
+
+ // Position in accumulator (kBlockSlices per warp, depth major).
+ const int accum_pix = thread_pix / (32 / kBlockSlices);
+ const int accum_idx = thread_channel * kAccumPixels + accum_pix;
+
+ const int max_slice = in_slices - thread_channel;
+ const int accum_offset = tile_size + accum_idx;
+ const bool skip_second = block_height + thread_height >= in_height;
+
+ for (int b = blockIdx.x; b < in_blocks; b += gridDim.x) {
+ const int slice = b * kBlockSlices;
+
+ const int inout_offset = slice * in_pixels + tensor_idx;
+ const bool slice_in_range = slice < max_slice;
+
+ if (slice_in_range) {
+ const DType* const in_ptr = inout_offset + input;
+ DType* const tile_ptr = tile_idx + shared_data;
+ tile_ptr[0] = ldg(in_ptr);
+ if (!skip_second) {
+ tile_ptr[tile_offset] = ldg(block_pixels + in_ptr);
+ }
+ }
+
+ // Note: the condition to reach this is uniform across the entire block.
+ __syncthreads();
+
+ if (slice_in_range) {
+ const DType* const out_ptr = inout_offset + output;
+ const DType out1 = ldg(out_ptr);
+ const DType out2 = skip_second ? DType(0) : ldg(block_pixels + out_ptr);
+ int shared_offset = data_idx;
+ DType* accum_ptr = accum_offset + shared_data;
+ CUDA_UNROLL for (int r = 0; r < filter_height; ++r) {
+ CUDA_UNROLL for (int c = 0; c < filter_width; ++c) {
+ const DType* const tile_ptr = shared_offset + shared_data;
+ DType val = out1 * tile_ptr[0] + out2 * tile_ptr[tile_offset];
+ // Warp-accumulate pixels of the same depth and write to accumulator.
+ for (int delta = 16 / kBlockSlices; delta > 0; delta /= 2) {
+ val += __shfl_down(val, delta);
+ }
+ if (!(thread_idx & 32 / kBlockSlices - 1)) {
+ *accum_ptr = val;
+ }
+ ++shared_offset;
+ accum_ptr += accum_increment;
+ }
+ shared_offset += in_increment;
+ }
+ }
+
+ // Note: the condition to reach this is uniform across the entire block.
+ __syncthreads();
+
+ const DType* const accum_data = tile_size + shared_data;
+ for (int i = thread_idx; i < accum_size; i += block_size) {
+ const int filter_idx = i / kAccumPixels;
+ const int filter_pix = filter_idx / kBlockSlices;
+ const int filter_channel = (slice + filter_idx % kBlockSlices) % in_channel;
+ // convert to CHW
+ const int filter_offset = filter_channel * filter_pixels +
+ (filter_pix/filter_width) * filter_height + filter_pix % filter_width;
+
+ if (filter_channel < in_channel) {
+ DType val = accum_data[i];
+ // Warp-accumulate pixels of the same depth from the accumulator.
+ for (int delta = kAccumPixels / 2; delta > 0; delta /= 2) {
+ val += __shfl_down(val, delta);
+ }
+ if (!(thread_idx & kAccumPixels - 1)) {
+ atomicAdd(filter_offset + filter, val);
+ }
+ }
+ }
+ }
+}
+
+
+} // namespace cuda
+
+// Returns whether depthwise convolution forward or backward input pass can be
+// performed using the faster ('Small') variant of the kernel.
+bool CanLaunchDepthwiseConv2dGPUSmall(const DepthwiseArgs& args) {
+ return args.stride_height == 1 && args.stride_width == 1 && args.in_height <= 32 &&
+ args.in_width <= 32 && args.in_height == args.out_height &&
+ args.in_width == args.out_width && args.pad_height >= 0 &&
+ args.pad_height < args.filter_height && args.pad_width >= 0 &&
+ args.pad_width < args.filter_width &&
+ args.filter_height * args.filter_width <= (args.in_height + 1) / 2 * args.in_width;
+}
+
+// Returns whether depthwise convolution backward filter pass can be performed
+// using the faster ('Small') variant of the kernel.
+bool CanLaunchDepthwiseConv2dBackwardFilterGPUSmall(const DepthwiseArgs args,
+ const int block_height) {
+ return args.stride_height == 1 && args.stride_width == 1 && args.in_height <= 32 &&
+ args.in_width <= 32 && args.in_height == args.out_height &&
+ args.in_width == args.out_width && args.pad_height >= 0 &&
+ args.pad_height < args.filter_height && args.pad_width >= 0 &&
+ args.pad_width < args.filter_width && block_height <= args.in_height &&
+ args.filter_height * args.filter_width <= block_height * args.in_width;
+}
+
+template <typename DType, cuda::DepthwiseConv2dDirection kDirection,
+ int kBlockSlices, bool kEvenHeight>
+void LaunchDepthwiseConv2dGPUSmall(mshadow::Stream<mxnet::gpu> *stream,
+ const DepthwiseArgs args,
+ const DType* input, const DType* filter, DType* output) {
+ const int block_height = (args.in_height + 1) / 2;
+ dim3 block_dim = dim3(args.in_width, block_height, kBlockSlices);
+
+ const int tile_width = args.in_width + args.filter_width - 1;
+ const int tile_height = block_height * 2 + args.filter_height - 1;
+ const int tile_pixels = tile_height * tile_width;
+ const int filter_pixels = args.filter_height * args.filter_width;
+ const int shared_memory_size =
+ kBlockSlices * (tile_pixels + filter_pixels) * sizeof(DType);
+ const int num_outputs =
+ args.batch * args.out_height * args.out_width * args.out_channel;
+ int block_count = std::min(num_outputs/(block_dim.x * block_dim.y * block_dim.z) + 1,
+ (unsigned)mshadow::cuda::kMaxGridNum);
+ auto s = mshadow::Stream<mxnet::gpu>::GetStream(stream);
+ if (args.filter_height == 3 && args.filter_width == 3) {
+ cuda::DepthwiseConv2dKernelSmall<DType, kDirection, kBlockSlices, kEvenHeight, 3, 3>
+ <<<block_count, block_dim, shared_memory_size, s>>>(args, input, filter, output);
+ } else {
+ cuda::DepthwiseConv2dKernelSmall<DType, kDirection, kBlockSlices, kEvenHeight, -1, -1>
+ <<<block_count, block_dim, shared_memory_size, s>>>(args, input, filter, output);
+ }
+ MSHADOW_CUDA_POST_KERNEL_CHECK(DepthwiseConv2dKernelSmall);
+}
+
+template <typename DType, cuda::DepthwiseConv2dDirection kDirection, int kBlockSlices>
+void LaunchDepthwiseConv2dGPUSmall(mshadow::Stream<mxnet::gpu> *stream,
+ const DepthwiseArgs args,
+ const DType* input, const DType* filter, DType* output) {
+ if (args.in_height & 1) {
+ LaunchDepthwiseConv2dGPUSmall<DType, kDirection, kBlockSlices, false>(
+ stream, args, input, filter, output);
+ } else {
+ LaunchDepthwiseConv2dGPUSmall<DType, kDirection, kBlockSlices, true>(
+ stream, args, input, filter, output);
+ }
+}
+
+template <typename DType, cuda::DepthwiseConv2dDirection kDirection>
+void LaunchDepthwiseConv2dGPUSmall(mshadow::Stream<mxnet::gpu> *stream,
+ const DepthwiseArgs args,
+ const DType* input, const DType* filter, DType* output) {
+ // Maximize (power of two) kBlockSlices while keeping a block within 1024
+ // threads (2 pixels per thread).
+ const int block_pixels = (args.in_height + 1) / 2 * args.in_width;
+ if (block_pixels > 256) {
+ LaunchDepthwiseConv2dGPUSmall<DType, kDirection, 2>(stream, args, input, filter, output);
+ } else if (block_pixels > 128) {
+ LaunchDepthwiseConv2dGPUSmall<DType, kDirection, 4>(stream, args, input, filter, output);
+ } else {
+ LaunchDepthwiseConv2dGPUSmall<DType, kDirection, 8>(stream, args, input, filter, output);
+ }
+}
+
+template <typename DType, int kBlockSlices, int kAccumPixels>
+bool TryLaunchDepthwiseConv2dBackwardFilterGPUSmall(mshadow::Stream<mxnet::gpu> *stream,
+ const DepthwiseArgs args,
+ const int block_height,
+ const DType* out_grad,
+ const DType* input,
+ DType* filter_grad) {
+ const int tile_width = args.in_width + args.filter_width - 1;
+ const int tile_height = block_height * 2 + args.filter_height - 1;
+ const int tile_pixels = tile_height * tile_width;
+ const int filter_pixels = args.filter_height * args.filter_width;
+ const int shared_memory_size =
+ kBlockSlices * (tile_pixels + filter_pixels * kAccumPixels) * sizeof(DType);
+ if (shared_memory_size > 46 * 1024) {
+ return false;
+ }
+
+ dim3 block_dim = dim3(args.in_width, block_height, kBlockSlices);
+ const int num_out_grad =
+ args.batch * args.out_height * args.out_width * args.out_channel;
+ int block_count = num_out_grad/(block_dim.x * block_dim.y * block_dim.z) + 1;
+ auto s = mshadow::Stream<mxnet::gpu>::GetStream(stream);
+ if (args.filter_height == 3 && args.filter_width == 3) {
+ cuda::DepthwiseConv2dBackwardFilterKernelSmall<DType, kBlockSlices, kAccumPixels, 3, 3>
+ <<<block_count, block_dim, shared_memory_size, s>>>(
+ args, out_grad, input, filter_grad);
+ } else {
+ cuda::DepthwiseConv2dBackwardFilterKernelSmall<DType, kBlockSlices, kAccumPixels, -1, -1>
+ <<<block_count, block_dim, shared_memory_size, s>>>(
+ args, out_grad, input, filter_grad);
+ }
+ MSHADOW_CUDA_POST_KERNEL_CHECK(DepthwiseConv2dBackwardFilterKernelSmall);
+ return true;
+}
+
+template <typename DType, int kBlockSlices>
+bool TryLaunchDepthwiseConv2dBackwardFilterGPUSmall(mshadow::Stream<mxnet::gpu> *stream,
+ const DepthwiseArgs args,
+ const int block_height,
+ const DType* out_grad,
+ const DType* input,
+ DType* filter_grad) {
+ // Minimize (power of two) kAccumPixels, while satisfying
+ // kAccumPixels * 32 >= block_height * in_width * kBlockSlices.
+ const int block_pixels = block_height * args.in_width * kBlockSlices;
+ if (block_pixels > 512) {
+ return TryLaunchDepthwiseConv2dBackwardFilterGPUSmall<DType, kBlockSlices, 32>(
+ stream, args, block_height, out_grad, input, filter_grad);
+ } else if (block_pixels > 256) {
+ return TryLaunchDepthwiseConv2dBackwardFilterGPUSmall<DType, kBlockSlices, 16>(
+ stream, args, block_height, out_grad, input, filter_grad);
+ } else {
+ return TryLaunchDepthwiseConv2dBackwardFilterGPUSmall<DType, kBlockSlices, 8>(
+ stream, args, block_height, out_grad, input, filter_grad);
+ }
+}
+
+template <typename DType>
+bool TryLaunchDepthwiseConv2dBackwardFilterGPUSmall(mshadow::Stream<mxnet::gpu> *stream,
+ const DepthwiseArgs args,
+ const DType* out_grad,
+ const DType* input,
+ DType* filter_grad) {
+ // Maximize (power of two) kBlockSlices while keeping a block within 1024
+ // threads (2 pixels per thread).
+ int block_slices = 8;
+ int block_height = (args.in_height + 1) / 2;
+ int round_mask = 1;
+ for (; block_slices > 1; block_slices /= 2) {
+ // args.in_width * block_height * kBlockSlices must be multiple of 32.
+ for (; block_height * args.in_width * block_slices & 31;
+ round_mask = round_mask * 2 + 1) {
+ block_height = block_height + round_mask & ~round_mask;
+ }
+ int block_size = block_height * args.in_width * block_slices;
+ if (block_size <= 1024) {
+ break;
+ }
+ }
+
+ if (!CanLaunchDepthwiseConv2dBackwardFilterGPUSmall(args, block_height)) {
+ return false;
+ }
+
+ switch (block_slices) {
+ case 8:
+ return TryLaunchDepthwiseConv2dBackwardFilterGPUSmall<DType, 8>(
+ stream, args, block_height, out_grad, input, filter_grad);
+ case 4:
+ return TryLaunchDepthwiseConv2dBackwardFilterGPUSmall<DType, 4>(
+ stream, args, block_height, out_grad, input, filter_grad);
+ case 2:
+ return TryLaunchDepthwiseConv2dBackwardFilterGPUSmall<DType, 2>(
+ stream, args, block_height, out_grad, input, filter_grad);
+ default:
+ return false;
+ }
+}
+
+} // namespace depthwise_conv
+} // namespace tf
+
+#endif // MXNET_OPERATOR_DEPTHWISE_CONVOLUTION_TF_CUH_
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 7d56b46..a33cb03 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -955,6 +955,44 @@ def test_convolution_grouping():
for arr1, arr2 in zip(exe1.outputs + exe1.grad_arrays, exe2.outputs + exe2.grad_arrays):
np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), rtol=1e-3, atol=1e-4)
+
+def test_depthwise_convolution():
+ for num_base in [32, 64]:
+ for kernel in [(3,3), (5,5)]:
+ for stride in [(1,1), (2,2)]:
+ for pad in [(0,0), (1,1)]:
+ num_filter = num_base
+ num_group = num_base
+ shape = (2, num_base, 32, 32)
+
+ x = mx.sym.Variable('x')
+ w = mx.sym.Variable('w')
+ b = mx.sym.Variable('b')
+ y1 = mx.sym.Convolution(data=x, weight=w, bias=b, num_filter=num_filter, num_group=num_group,
+ kernel=kernel, stride=stride, pad=pad)
+ xslice = mx.sym.SliceChannel(data=x, num_outputs=num_group, axis=1)
+ wslice = mx.sym.SliceChannel(data=w, num_outputs=num_group, axis=0)
+ bslice = mx.sym.SliceChannel(data=b, num_outputs=num_group, axis=0)
+ y2 = mx.sym.Concat(*[mx.sym.Convolution(data=xslice[i], weight=wslice[i], bias=bslice[i],
+ num_filter=num_filter//num_group, kernel=kernel,
+ stride=stride, pad=pad)
+ for i in range(num_group)])
+
+ dev = default_context()
+ exe1 = y1.simple_bind(dev, x=shape)
+ exe2 = y2.simple_bind(mx.cpu(), x=shape, w=(num_filter, shape[1]//num_group, kernel[0], kernel[1]),
+ b=(num_filter,))
+ for arr1, arr2 in zip(exe1.arg_arrays, exe2.arg_arrays):
+ arr1[:] = np.random.normal(size=arr1.shape)
+ arr2[:] = arr1
+ exe1.forward(is_train=True)
+ exe1.backward(exe1.outputs[0])
+ exe2.forward(is_train=True)
+ exe2.backward(exe2.outputs[0])
+
+ for arr1, arr2 in zip(exe1.outputs + exe1.grad_arrays, exe2.outputs + exe2.grad_arrays):
+ np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), rtol=1e-3, atol=1e-4)
+
def gen_broadcast_data(idx):
# Manually set test cases
binary_op_data_shape = np.array(
--
To stop receiving notification emails like this one, please contact
['"commits@mxnet.apache.org" <co...@mxnet.apache.org>'].