You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by di...@apache.org on 2020/10/05 20:21:10 UTC

[incubator-mxnet] branch master updated: Limit the number of ElementWiseSum kernels compiled by RTC (#19266)

This is an automated email from the ASF dual-hosted git repository.

dickjc123 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git


The following commit(s) were added to refs/heads/master by this push:
     new 48a1fbf  Limit the number of ElementWiseSum kernels compiled by RTC (#19266)
48a1fbf is described below

commit 48a1fbfbffd0ccb254b1f7dbb9d890d9351ffcdc
Author: Przemyslaw Tredak <pt...@nvidia.com>
AuthorDate: Mon Oct 5 13:19:48 2020 -0700

    Limit the number of ElementWiseSum kernels compiled by RTC (#19266)
    
    * Fix elemwise_sum
    
    * Fixes from review
---
 src/operator/tensor/elemwise_sum.cu | 47 ++++++++++++-------------------------
 1 file changed, 15 insertions(+), 32 deletions(-)

diff --git a/src/operator/tensor/elemwise_sum.cu b/src/operator/tensor/elemwise_sum.cu
index acee34f..267477a 100644
--- a/src/operator/tensor/elemwise_sum.cu
+++ b/src/operator/tensor/elemwise_sum.cu
@@ -118,39 +118,22 @@ void VectorizedElementwiseSum(const nnvm::NodeAttrs &attrs,
                      : 1;
   const index_t size = inputs[0].Size();
   for (size_t i = 0; i < inputs.size(); i += num_inputs_per_kernel) {
-    if (i == 0) {
-      const std::string code = std::string("const OpReqType req = ") +
-                               util::to_string(req[0]) +
-                               ";\n";
-      elementwise_sum_params params{};
-      params.num_inputs = std::min(num_inputs_per_kernel, inputs.size() - i);
-      for (int j = 0; j < params.num_inputs; ++j) {
-        params.inputs[j] = inputs[i + j].dptr_;
-      }
-      params.outputs[0] = outputs[0].dptr_;
-      VectorizedKernelRTCLauncher(code, "elementwise_sum_kernel",
-                                  elementwise_sum_kernel, nvec,
-                                  size, 1, s, params,
-                                  inputs, outputs,
-                                  ctx.run_ctx.get_ctx().dev_id);
-    } else {
-      /* During subsequent launches we need to
-         accumulate into the previous outputs
-      */
-      const std::string code = "const OpReqType req = OpReqType::kAddTo;\n";
-      elementwise_sum_params params{};
-      params.num_inputs = std::min(num_inputs_per_kernel, inputs.size() - i);
-      for (int j = 0; j < params.num_inputs; ++j) {
-        params.inputs[j] = inputs[i + j].dptr_;
-      }
-      params.outputs[0] = outputs[0].dptr_;
-      const std::vector<TBlob> new_inputs(inputs.begin() + i, inputs.end());
-      VectorizedKernelRTCLauncher(code, "elementwise_sum_kernel",
-                                  elementwise_sum_kernel, nvec,
-                                  size, 1, s, params,
-                                  new_inputs, outputs,
-                                  ctx.run_ctx.get_ctx().dev_id);
+    const std::string code = std::string("const OpReqType req = ") +
+                             util::to_string(i == 0 ? req[0] : kAddTo) +
+                             ";\n";
+    elementwise_sum_params params{};
+    params.num_inputs = std::min(num_inputs_per_kernel, inputs.size() - i);
+    for (int j = 0; j < params.num_inputs; ++j) {
+      params.inputs[j] = inputs[i + j].dptr_;
     }
+    params.outputs[0] = outputs[0].dptr_;
+    const std::vector<TBlob> new_inputs(inputs.begin() + i,
+                                        inputs.begin() + i + params.num_inputs);
+    VectorizedKernelRTCLauncher(code, "elementwise_sum_kernel",
+                                elementwise_sum_kernel, nvec,
+                                size, 1, s, params,
+                                new_inputs, outputs,
+                                ctx.run_ctx.get_ctx().dev_id);
   }
 }