You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2020/04/29 16:34:12 UTC

[GitHub] [incubator-mxnet] apeforest commented on a change in pull request #18168: Separate GPU kernel for broadcast_axis

apeforest commented on a change in pull request #18168:
URL: https://github.com/apache/incubator-mxnet/pull/18168#discussion_r417452346



##########
File path: src/operator/tensor/broadcast_reduce_op.h
##########
@@ -1077,6 +1077,42 @@ struct broadcast_kernel {
   }
 };
 
+namespace {
+struct shape_and_stride {
+  index_t in_stride[MXNET_SPECIAL_MAX_NDIM];
+  index_t out_stride[MXNET_SPECIAL_MAX_NDIM];
+  index_t input_shape[MXNET_SPECIAL_MAX_NDIM];
+  index_t output_shape[MXNET_SPECIAL_MAX_NDIM];
+};
+}
+
+template<typename OP>
+struct broadcast_kernel_gpu {
+  template<typename IType, typename OType>
+  MSHADOW_XINLINE static void Map(index_t i,
+                                  IType *input,
+                                  OType *output,
+                                  const shape_and_stride& aux_data,
+                                  const OpReqType req,
+                                  const int ndim) {
+    index_t idx = i;
+    index_t in_idx = i;
+#pragma unroll 4
+    for (index_t iter = ndim - 1; iter >= 0; --iter) {
+      index_t out_dim_shape = aux_data.output_shape[iter];
+      index_t out_dim_stride = aux_data.out_stride[iter];
+      index_t dim_idx = idx - (idx / out_dim_shape) * out_dim_shape;

Review comment:
       Does computing the remainder manually as `idx - (idx / out_dim_shape) * out_dim_shape` really improve performance compared to using `idx % out_dim_shape`?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org