You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2020/04/29 03:30:42 UTC

[GitHub] [incubator-mxnet] leezu commented on a change in pull request #18168: [WIP]separate GPU kernel for broadcast_axis

leezu commented on a change in pull request #18168:
URL: https://github.com/apache/incubator-mxnet/pull/18168#discussion_r417047760



##########
File path: src/operator/tensor/broadcast_reduce_op.h
##########
@@ -1077,6 +1077,41 @@ struct broadcast_kernel {
   }
 };
 
+namespace
+{
+  struct shape_and_stride {
+    int32_t in_stride[MXNET_SPECIAL_MAX_NDIM];
+    int32_t out_stride[MXNET_SPECIAL_MAX_NDIM];
+    int32_t input_shape[MXNET_SPECIAL_MAX_NDIM];
+    int32_t output_shape[MXNET_SPECIAL_MAX_NDIM];
+  };
+}
+
+template<typename OP>
+struct broadcast_kernel_gpu {
+  template<typename IType, typename OType>
+  MSHADOW_XINLINE static void Map(int32_t i,
+                                  IType *input,
+                                  OType *output,
+                                  const struct shape_and_stride aux_data,
+                                  const OpReqType req,
+                                  const int32_t ndim) {
+    int32_t idx = i;
+    int32_t in_idx = i;
+#pragma unroll 4

Review comment:
       Did you benchmark whether the `#pragma unroll` makes a measurable difference? Background: libmxnet.so is currently quite large, and we may want to remove both loop unrolling and forced inlining to reduce its size.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org