You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by di...@apache.org on 2020/10/05 20:21:10 UTC
[incubator-mxnet] branch master updated: Limit the number of ElementWiseSum kernels compiled by RTC (#19266)
This is an automated email from the ASF dual-hosted git repository.
dickjc123 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new 48a1fbf Limit the number of ElementWiseSum kernels compiled by RTC (#19266)
48a1fbf is described below
commit 48a1fbfbffd0ccb254b1f7dbb9d890d9351ffcdc
Author: Przemyslaw Tredak <pt...@nvidia.com>
AuthorDate: Mon Oct 5 13:19:48 2020 -0700
Limit the number of ElementWiseSum kernels compiled by RTC (#19266)
* Fix elemwise_sum
* Fixes from review
---
src/operator/tensor/elemwise_sum.cu | 47 ++++++++++++-------------------------
1 file changed, 15 insertions(+), 32 deletions(-)
diff --git a/src/operator/tensor/elemwise_sum.cu b/src/operator/tensor/elemwise_sum.cu
index acee34f..267477a 100644
--- a/src/operator/tensor/elemwise_sum.cu
+++ b/src/operator/tensor/elemwise_sum.cu
@@ -118,39 +118,22 @@ void VectorizedElementwiseSum(const nnvm::NodeAttrs &attrs,
: 1;
const index_t size = inputs[0].Size();
for (size_t i = 0; i < inputs.size(); i += num_inputs_per_kernel) {
- if (i == 0) {
- const std::string code = std::string("const OpReqType req = ") +
- util::to_string(req[0]) +
- ";\n";
- elementwise_sum_params params{};
- params.num_inputs = std::min(num_inputs_per_kernel, inputs.size() - i);
- for (int j = 0; j < params.num_inputs; ++j) {
- params.inputs[j] = inputs[i + j].dptr_;
- }
- params.outputs[0] = outputs[0].dptr_;
- VectorizedKernelRTCLauncher(code, "elementwise_sum_kernel",
- elementwise_sum_kernel, nvec,
- size, 1, s, params,
- inputs, outputs,
- ctx.run_ctx.get_ctx().dev_id);
- } else {
- /* During subsequent launches we need to
- accumulate into the previous outputs
- */
- const std::string code = "const OpReqType req = OpReqType::kAddTo;\n";
- elementwise_sum_params params{};
- params.num_inputs = std::min(num_inputs_per_kernel, inputs.size() - i);
- for (int j = 0; j < params.num_inputs; ++j) {
- params.inputs[j] = inputs[i + j].dptr_;
- }
- params.outputs[0] = outputs[0].dptr_;
- const std::vector<TBlob> new_inputs(inputs.begin() + i, inputs.end());
- VectorizedKernelRTCLauncher(code, "elementwise_sum_kernel",
- elementwise_sum_kernel, nvec,
- size, 1, s, params,
- new_inputs, outputs,
- ctx.run_ctx.get_ctx().dev_id);
+ const std::string code = std::string("const OpReqType req = ") +
+ util::to_string(i == 0 ? req[0] : kAddTo) +
+ ";\n";
+ elementwise_sum_params params{};
+ params.num_inputs = std::min(num_inputs_per_kernel, inputs.size() - i);
+ for (int j = 0; j < params.num_inputs; ++j) {
+ params.inputs[j] = inputs[i + j].dptr_;
}
+ params.outputs[0] = outputs[0].dptr_;
+ const std::vector<TBlob> new_inputs(inputs.begin() + i,
+ inputs.begin() + i + params.num_inputs);
+ VectorizedKernelRTCLauncher(code, "elementwise_sum_kernel",
+ elementwise_sum_kernel, nvec,
+ size, 1, s, params,
+ new_inputs, outputs,
+ ctx.run_ctx.get_ctx().dev_id);
}
}