Posted to commits@mxnet.apache.org by sk...@apache.org on 2019/02/11 23:44:49 UTC
[incubator-mxnet] branch master updated: Performance improvement in ToTensor GPU Kernel (#14099)
This is an automated email from the ASF dual-hosted git repository.
skm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new ab5a0cf Performance improvement in ToTensor GPU Kernel (#14099)
ab5a0cf is described below
commit ab5a0cf6cf87f046d98397edbced251fe6173d6c
Author: Sandeep Krishnamurthy <sa...@gmail.com>
AuthorDate: Mon Feb 11 15:44:17 2019 -0800
Performance improvement in ToTensor GPU Kernel (#14099)
* CPU implementation without Kernel launch/map
* Optimal CUDA support for 3D ToTensor operator
* Add CUDA kernel for 4D inputs
* Fix failing CPU tests for totensor
* disable warning on windows
* try fix in instance norm windows build failure
* Guard omp parallel collapse for windows
* Remove warning supression to check if it is ok
* fix lint issues
* Address code review comments
---
src/operator/image/image_random-inl.h | 100 ++++++++++++++++++++++++----------
src/operator/image/image_random.cu | 83 ++++++++++++++++++++++++++++
2 files changed, 153 insertions(+), 30 deletions(-)
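For context, to_tensor converts an (H, W, C) uint8 image with values in [0, 255] into a (C, H, W) float32 array in [0, 1]. The following is a minimal standalone C++ sketch of that transform, including the MSVC guard for collapse(2) that this commit introduces; it is illustrative only and is not the MXNet code in the diff below (the pragmas are ignored and the loop runs serially when OpenMP is not enabled).

#include <cstdint>
#include <vector>

// Sketch: convert one H x W x C uint8 image (0..255) into a
// C x H x W float image normalized to [0, 1].
std::vector<float> to_tensor_sketch(const std::vector<uint8_t> &hwc,
                                    const int H, const int W, const int C) {
  std::vector<float> chw(static_cast<size_t>(C) * H * W);
  const float normalize_factor = 255.0f;
  // MSVC's OpenMP implementation rejects the collapse clause,
  // hence the guard (the same pattern as the header change below).
#ifdef _MSC_VER
  #pragma omp parallel for
#else
  #pragma omp parallel for collapse(2)
#endif
  for (int c = 0; c < C; ++c) {
    for (int i = 0; i < H * W; ++i) {
      chw[c * H * W + i] = hwc[i * C + c] / normalize_factor;
    }
  }
  return chw;
}

The committed operator additionally handles a batched (N, H, W, C) input by running the same loop once per image with an offset, as the header change below shows.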
diff --git a/src/operator/image/image_random-inl.h b/src/operator/image/image_random-inl.h
index 4480163..392fff4 100644
--- a/src/operator/image/image_random-inl.h
+++ b/src/operator/image/image_random-inl.h
@@ -43,8 +43,18 @@ namespace mxnet {
namespace op {
namespace image {
-// There are no parameters for this operator.
-// Hence, no arameter registration.
+using namespace mshadow;
+
+#if MXNET_USE_CUDA
+// NOTE: Kernel launch/map was extremely costly.
+// Hence, we use separate CUDA kernels for these operators.
+template<typename DType, typename T1, typename T2>
+void ToTensorImplCUDA(mshadow::Stream<gpu> *s,
+ const T1 input,
+ const T2 output,
+ const int req,
+ const float normalize_factor);
+#endif // MXNET_USE_CUDA
// Shape and Type inference for image to tensor operator
inline bool ToTensorShape(const nnvm::NodeAttrs& attrs,
@@ -78,37 +88,39 @@ inline bool ToTensorType(const nnvm::NodeAttrs& attrs,
}
// Operator Implementation
-
-template<int req>
-struct totensor_forward {
- template<typename DType>
- MSHADOW_XINLINE static void Map(uint32_t c, float* out_data, const DType* in_data,
- const int length, const int channel, const int step,
- const float normalize_factor = 255.0f) {
- #pragma omp parallel for
+template<typename DType, int req>
+inline void ToTensor(float* out_data, const DType* in_data,
+ const int length,
+ const int channels,
+ const float normalize_factor,
+ const int step) {
+ // The Microsoft Visual C++ compiler does not support the omp collapse clause
+ #ifdef _MSC_VER
+ #pragma omp parallel for
+ #else
+ #pragma omp parallel for collapse(2)
+ #endif // _MSC_VER
+ for (int c = 0; c < channels; ++c) {
for (int i = 0; i < length; ++i) {
KERNEL_ASSIGN(out_data[step + c*length + i], req,
- (in_data[step + i*channel + c]) / normalize_factor);
+ (in_data[step + i*channels + c]) / normalize_factor);
}
}
-};
-
-template<typename xpu>
-void ToTensorImpl(const OpContext &ctx,
- const std::vector<TBlob> &inputs,
- const std::vector<TBlob> &outputs,
- const std::vector<OpReqType> &req,
- const int length,
- const uint32_t channel,
- const int step = 0) {
- mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+}
+inline void ToTensorImpl(const std::vector<TBlob> &inputs,
+ const std::vector<TBlob> &outputs,
+ const std::vector<OpReqType> &req,
+ const int length,
+ const int channel,
+ const float normalize_factor,
+ const int step) {
MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, {
MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
float* output = outputs[0].dptr<float>();
DType* input = inputs[0].dptr<DType>();
- mxnet_op::Kernel<totensor_forward<req_type>, xpu>::Launch(
- s, channel, output, input, length, channel, step);
+ ToTensor<DType, req_type>(output, input, length, channel,
+ normalize_factor, step);
});
});
}
@@ -123,24 +135,52 @@ void ToTensorOpForward(const nnvm::NodeAttrs &attrs,
CHECK_EQ(outputs.size(), 1U);
CHECK_EQ(req.size(), 1U);
+ // We do not use a temporary buffer when performing the operation.
+ // Hence, this check is necessary.
CHECK_EQ(req[0], kWriteTo)
<< "`to_tensor` does not support inplace updates";
- // 3D Input - (h, w, c)
- if (inputs[0].ndim() == 3) {
+ const float normalize_factor = 255.0f;
+
+ if (std::is_same<xpu, gpu>::value) {
+ #if MXNET_USE_CUDA
+ mshadow::Stream<gpu> *s = ctx.get_stream<gpu>();
+ MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+ MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
+ if (inputs[0].ndim() == 3) {
+ Tensor<gpu, 3, DType> input = inputs[0].get<gpu, 3, DType>(s);
+ Tensor<gpu, 3, float> output = outputs[0].get<gpu, 3, float>(s);
+ ToTensorImplCUDA<DType, Tensor<gpu, 3, DType>, Tensor<gpu, 3, float>>
+ (s, input, output, req_type, normalize_factor);
+ } else {
+ Tensor<gpu, 4, DType> input = inputs[0].get<gpu, 4, DType>(s);
+ Tensor<gpu, 4, float> output = outputs[0].get<gpu, 4, float>(s);
+ ToTensorImplCUDA<DType, Tensor<gpu, 4, DType>, Tensor<gpu, 4, float>>
+ (s, input, output, req_type, normalize_factor);
+ }
+ });
+ });
+ #else
+ LOG(FATAL) << "Compile with USE_CUDA=1 to use ToTensor operator on GPU.";
+ #endif // MXNET_USE_CUDA
+ } else if (inputs[0].ndim() == 3) {
+ // 3D Input - (h, w, c)
const int length = inputs[0].shape_[0] * inputs[0].shape_[1];
- const uint32_t channel = inputs[0].shape_[2];
- ToTensorImpl<xpu>(ctx, inputs, outputs, req, length, channel);
+ const int channel = static_cast<int>(inputs[0].shape_[2]);
+ const int step = 0;
+ ToTensorImpl(inputs, outputs, req, length,
+ channel, normalize_factor, step);
} else if (inputs[0].ndim() == 4) {
// 4D input (n, h, w, c)
const int batch_size = inputs[0].shape_[0];
const int length = inputs[0].shape_[1] * inputs[0].shape_[2];
- const uint32_t channel = inputs[0].shape_[3];
+ const int channel = static_cast<int>(inputs[0].shape_[3]);
const int step = channel * length;
#pragma omp parallel for
for (auto n = 0; n < batch_size; ++n) {
- ToTensorImpl<xpu>(ctx, inputs, outputs, req, length, channel, n*step);
+ ToTensorImpl(inputs, outputs, req, length, channel,
+ normalize_factor, n*step);
}
}
}
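The forward function above branches on the xpu template parameter: the gpu instantiation hands the tensors to a dedicated CUDA kernel (or fails loudly when built without CUDA), while the cpu instantiation falls through to the OpenMP loop. A reduced sketch of that dispatch pattern, using stand-in types and function names rather than the mshadow/MXNet ones:

#include <iostream>
#include <type_traits>

struct cpu {};  // stand-ins for mshadow::cpu / mshadow::gpu
struct gpu {};

void ForwardCPU() { std::cout << "plain loops + OpenMP, no Kernel::Launch\n"; }
#if MXNET_USE_CUDA
void ForwardGPU() { std::cout << "dedicated CUDA kernel, one block per image\n"; }
#endif

template <typename xpu>
void Forward() {
  if (std::is_same<xpu, gpu>::value) {
#if MXNET_USE_CUDA
    ForwardGPU();
#else
    std::cerr << "Compile with USE_CUDA=1 to use ToTensor operator on GPU.\n";
#endif
  } else {
    ForwardCPU();
  }
}

int main() {
  Forward<cpu>();  // CPU path
  Forward<gpu>();  // GPU path, or the error message in a CPU-only build
}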
diff --git a/src/operator/image/image_random.cu b/src/operator/image/image_random.cu
index 5f9aff2..6fe5383 100644
--- a/src/operator/image/image_random.cu
+++ b/src/operator/image/image_random.cu
@@ -21,6 +21,7 @@
* \file image_random.cu
* \brief GPU Implementation of image transformation operators
*/
+#include <cuda_runtime_api.h>
#include "./image_random-inl.h"
#include "../elemwise_op_common.h"
@@ -28,6 +29,88 @@ namespace mxnet {
namespace op {
namespace image {
+using namespace mshadow;
+
+// ToTensor Kernel for 3D input
+template<typename xpu, typename Dtype>
+__global__ void ToTensorCudaKernel(const Tensor<xpu, 3, Dtype> input,
+ const Tensor<xpu, 3, float> output,
+ const int req,
+ const int N,
+ const int H,
+ const int W,
+ const int C,
+ const float normalize_factor) {
+ // We process one image per thread block.
+ // In the 3D case there is only one block, so blockIdx.x is not used.
+ for (int c = 0; c < C; ++c) {
+ for (int h = threadIdx.y; h < H; h += blockDim.y) {
+ for (int w = threadIdx.x; w < W; w += blockDim.x) {
+ KERNEL_ASSIGN(output[c][h][w], req,
+ input[h][w][c] / normalize_factor);
+ }
+ }
+ }
+}
+
+// ToTensor Kernel for 4D input
+template<typename xpu, typename Dtype>
+__global__ void ToTensorCudaKernel(const Tensor<xpu, 4, Dtype> input,
+ const Tensor<xpu, 4, float> output,
+ const int req,
+ const int N,
+ const int H,
+ const int W,
+ const int C,
+ const float normalize_factor) {
+ // We process one image per thread block.
+ const int n = blockIdx.x;
+
+ for (int c = 0; c < C; ++c) {
+ for (int h = threadIdx.y; h < H; h += blockDim.y) {
+ for (int w = threadIdx.x; w < W; w += blockDim.x) {
+ KERNEL_ASSIGN(output[n][c][h][w], req,
+ input[n][h][w][c] / normalize_factor);
+ }
+ }
+ }
+}
+
+template<typename DType, typename T1, typename T2>
+void ToTensorImplCUDA(mshadow::Stream<gpu> *s,
+ const T1 input,
+ const T2 output,
+ const int req,
+ const float normalize_factor) {
+ int blocks, H, W, C, N;
+ cudaStream_t stream = mshadow::Stream<gpu>::GetStream(s);
+ if (std::is_same<T1, Tensor<gpu, 3, DType>>::value) {
+ // 3D Input - (H, W, C)
+ N = 0;
+ H = input.size(0);
+ W = input.size(1);
+ C = input.size(2);
+ blocks = 1;
+ } else {
+ // 4D Input - (N, H, W, C)
+ N = input.size(0);
+ H = input.size(1);
+ W = input.size(2);
+ C = input.size(3);
+ blocks = N > 0 ? N : 1;
+ }
+ // One block per image.
+ // A (32, 32) thread block works well here: the per-element computation
+ // is minimal, as is the overhead of CUDA setting up all the threads.
+ ToTensorCudaKernel<gpu, DType>
+ <<<blocks, dim3(32, 32), 0, stream>>>(input, output,
+ req, N, H, W, C, normalize_factor);
+ MSHADOW_CUDA_POST_KERNEL_CHECK(ToTensorCudaKernel);
+}
+
NNVM_REGISTER_OP(_image_to_tensor)
.set_attr<FCompute>("FCompute<gpu>", ToTensorOpForward<gpu>);
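For reference, the launch geometry used above (one thread block per image, a 32x32 block whose threads stride over rows and columns while looping over channels) can be shown in a small self-contained CUDA sketch; raw pointers stand in for the mshadow tensors, so the names and layout are illustrative rather than the MXNet source.

#include <cuda_runtime.h>

// One block per image: threads stride over (h, w), loop over channels,
// and write the HWC uint8 input as CHW float divided by 255.
__global__ void ToTensorSketchKernel(const unsigned char *in, float *out,
                                     const int H, const int W, const int C) {
  const int n = blockIdx.x;  // image index within the batch
  const unsigned char *img_in = in + static_cast<size_t>(n) * H * W * C;
  float *img_out = out + static_cast<size_t>(n) * C * H * W;
  for (int c = 0; c < C; ++c) {
    for (int h = threadIdx.y; h < H; h += blockDim.y) {
      for (int w = threadIdx.x; w < W; w += blockDim.x) {
        img_out[(c * H + h) * W + w] =
            img_in[(h * W + w) * C + c] / 255.0f;
      }
    }
  }
}

// Launch for a batch of N images: one block per image and 32x32 threads,
// mirroring the <<<blocks, dim3(32, 32), 0, stream>>> configuration above.
void LaunchToTensorSketch(const unsigned char *d_in, float *d_out,
                          const int N, const int H, const int W, const int C,
                          cudaStream_t stream) {
  ToTensorSketchKernel<<<N, dim3(32, 32), 0, stream>>>(d_in, d_out, H, W, C);
}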