You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by pa...@apache.org on 2019/05/22 23:17:58 UTC
[incubator-mxnet] branch master updated: Integrating the MKL VML
functions to MXNET to speed-up the (element-wised) mathematic computation
(#14893)
This is an automated email from the ASF dual-hosted git repository.
patriczhao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new b0be6c5 Integrating the MKL VML functions to MXNET to speed-up the (element-wised) mathematic computation (#14893)
b0be6c5 is described below
commit b0be6c50ae686a59c340c56b9c11493a5c84ff72
Author: Shufan <33...@users.noreply.github.com>
AuthorDate: Thu May 23 07:17:38 2019 +0800
Integrating the MKL VML functions to MXNET to speed-up the (element-wised) mathematic computation (#14893)
* mkl_func test with erf&log op, build success~
* fix lint and build issues
* Try to add support to sparse array
* fix build
* add functions
* Fix review comments
* remove unecessary code
* Update test case
* minor fix
* move the position of MKL_Compute
* mkl_func test with erf&log op, build success~
* fix lint and build issues
* Try to add support to sparse array
* fix build
* Fix review comments
* remove unecessary code
* Update test case
* minor fix
* add functions
* move the position of MKL_Compute
* fix cpplint
* cpp lint
* trigger ci
* address comments
* coding style
* enable layernorm
* fix windows build
* revert changes to FComputeEx
* int -> index_t
* remove workspace
* fix lint
* clean code
---
src/operator/mkl_functions-inl.h | 165 +++++++++++++++++++++++++
src/operator/nn/layer_norm-inl.h | 9 +-
src/operator/nn/layer_norm.cc | 58 ++++++++-
src/operator/tensor/elemwise_unary_op.h | 118 ++++++++++++------
src/operator/tensor/elemwise_unary_op_basic.cc | 50 +++++++-
tests/python/gpu/test_operator_gpu.py | 48 +++++++
6 files changed, 398 insertions(+), 50 deletions(-)
diff --git a/src/operator/mkl_functions-inl.h b/src/operator/mkl_functions-inl.h
new file mode 100644
index 0000000..6080347
--- /dev/null
+++ b/src/operator/mkl_functions-inl.h
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2019 by Contributors
+ * \file mkl_functions-inl.h
+ * \brief Wrapper for MKL VML functions
+ * \author Tao Lv, Shufan Wu
+*/
+#ifndef MXNET_OPERATOR_MKL_FUNCTIONS_INL_H_
+#define MXNET_OPERATOR_MKL_FUNCTIONS_INL_H_
+
+#if MSHADOW_USE_MKL == 1
+#include "mkl_vml.h"
+
+namespace mxnet {
+namespace op {
+namespace mkl_func {
+
+MSHADOW_XINLINE
+static bool check_size(const size_t n) {
+ const size_t MKL_INT_MAX = (sizeof(MKL_INT) == sizeof(int)) ? INT_MAX : LLONG_MAX;
+ return (n <= MKL_INT_MAX);
+}
+
+MSHADOW_XINLINE
+static bool check_type(const int t) {
+ return (t == mshadow::kFloat32 || t == mshadow::kFloat64);
+}
+
+#define MXNET_MKL_UNARY_MATH_FUNC(name, func) \
+struct name { \
+ MSHADOW_XINLINE static void Vectorize(const index_t n, const float *src, float *dst) { \
+ vs##func(static_cast<MKL_INT>(n), src, dst); \
+ } \
+ MSHADOW_XINLINE static void Vectorize(const index_t n, const double *src, double *dst) { \
+ vd##func(static_cast<MKL_INT>(n), src, dst); \
+ } \
+};
+
+#define MXNET_MKL_BINARY_MATH_FUNC(name, func) \
+struct name { \
+ MSHADOW_XINLINE static void Vectorize(const index_t n, \
+ const float *a, \
+ const float *b, \
+ float *c) { \
+ vs##func(static_cast<MKL_INT>(n), a, b, c); \
+ } \
+ MSHADOW_XINLINE static void Vectorize(const index_t n, \
+ const double *a, \
+ const double *b, \
+ double *c) { \
+ vd##func(static_cast<MKL_INT>(n), a, b, c); \
+ } \
+};
+
+MXNET_MKL_UNARY_MATH_FUNC(erf, Erf);
+MXNET_MKL_UNARY_MATH_FUNC(exp, Exp);
+MXNET_MKL_UNARY_MATH_FUNC(exp2, Exp2);
+MXNET_MKL_UNARY_MATH_FUNC(exp10, Exp10);
+MXNET_MKL_UNARY_MATH_FUNC(expm1, Expm1);
+MXNET_MKL_UNARY_MATH_FUNC(log, Ln);
+MXNET_MKL_UNARY_MATH_FUNC(log2, Log2);
+MXNET_MKL_UNARY_MATH_FUNC(log10, Log10);
+MXNET_MKL_UNARY_MATH_FUNC(log1p, Log1p);
+
+MXNET_MKL_UNARY_MATH_FUNC(sin, Sin);
+MXNET_MKL_UNARY_MATH_FUNC(cos, Cos);
+MXNET_MKL_UNARY_MATH_FUNC(tan, Tan);
+MXNET_MKL_UNARY_MATH_FUNC(asin, Asin);
+MXNET_MKL_UNARY_MATH_FUNC(acos, Acos);
+MXNET_MKL_UNARY_MATH_FUNC(atan, Atan);
+
+MXNET_MKL_UNARY_MATH_FUNC(sinh, Sinh);
+MXNET_MKL_UNARY_MATH_FUNC(cosh, Cosh);
+MXNET_MKL_UNARY_MATH_FUNC(tanh, Tanh);
+MXNET_MKL_UNARY_MATH_FUNC(asinh, Asinh);
+MXNET_MKL_UNARY_MATH_FUNC(acosh, Acosh);
+MXNET_MKL_UNARY_MATH_FUNC(atanh, Atanh);
+
+MXNET_MKL_UNARY_MATH_FUNC(sqrt, Sqrt);
+MXNET_MKL_UNARY_MATH_FUNC(abs, Abs);
+MXNET_MKL_UNARY_MATH_FUNC(cbrt, Cbrt);
+MXNET_MKL_UNARY_MATH_FUNC(round, Round);
+MXNET_MKL_UNARY_MATH_FUNC(ceil, Ceil);
+MXNET_MKL_UNARY_MATH_FUNC(floor, Floor);
+MXNET_MKL_UNARY_MATH_FUNC(trunc, Trunc);
+
+MXNET_MKL_UNARY_MATH_FUNC(lgamma, LGamma);
+MXNET_MKL_UNARY_MATH_FUNC(tgamma, TGamma);
+MXNET_MKL_UNARY_MATH_FUNC(square, Sqr);
+
+MXNET_MKL_BINARY_MATH_FUNC(add, Add);
+MXNET_MKL_BINARY_MATH_FUNC(sub, Sub);
+MXNET_MKL_BINARY_MATH_FUNC(mul, Mul);
+MXNET_MKL_BINARY_MATH_FUNC(pow, Pow);
+MXNET_MKL_BINARY_MATH_FUNC(hypot, Hypot);
+
+template <typename DType>
+MSHADOW_XINLINE static void sum_(index_t n, DType *in, DType *dst) {
+ DType sum = 0.0f;
+ for (index_t i = 0; i < n; i++)
+ sum += in[i];
+
+ dst[0] = sum;
+}
+
+// LayerNorm on the last dimension
+template <typename DType>
+MSHADOW_XINLINE static void LayerNormLastDim(index_t m,
+ index_t n,
+ DType *a,
+ DType *b,
+ DType *gamma,
+ DType *beta,
+ DType *mean,
+ DType *var,
+ DType eps) {
+ auto nthreads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+#pragma omp parallel for num_threads(nthreads)
+ for (index_t i = 0; i < m; i++) {
+ DType* in_offset = a + i * n;
+ DType* out_offset = b + i * n;
+
+ sum_(n, in_offset, &(mean[i]));
+ mean[i] /= n;
+ var[i] = 0.0f;
+#if !defined(_MSC_VER)
+#pragma omp simd
+#endif
+ for (index_t j = 0; j < n; j++) {
+ out_offset[j] = in_offset[j] - mean[i];
+ var[i] += out_offset[j] * out_offset[j];
+ }
+ var[i] = math::sqrt(var[i] / n + eps);
+#if !defined(_MSC_VER)
+#pragma omp simd
+#endif
+ for (index_t j = 0; j < n; j++) {
+ out_offset[j] = out_offset[j] * gamma[j] / var[i] + beta[j];
+ }
+ }
+}
+
+} // namespace mkl_func
+} // namespace op
+} // namespace mxnet
+#endif // MSHADOW_USE_MKL == 1
+#endif // MXNET_OPERATOR_MKL_FUNCTIONS_INL_H_
diff --git a/src/operator/nn/layer_norm-inl.h b/src/operator/nn/layer_norm-inl.h
index 456a5cb..7636c9b 100644
--- a/src/operator/nn/layer_norm-inl.h
+++ b/src/operator/nn/layer_norm-inl.h
@@ -63,6 +63,10 @@ struct LayerNormParam : public dmlc::Parameter<LayerNormParam> {
}
};
+static int GetRealAxis(int axis, int ndim) {
+ return axis < 0 ? (axis + ndim) : axis;
+}
+
template<typename xpu>
void LayerNormCompute(const nnvm::NodeAttrs& attrs,
const OpContext& ctx, const std::vector<TBlob>& inputs,
@@ -79,10 +83,7 @@ void LayerNormComputeGeneral(const nnvm::NodeAttrs& attrs,
const LayerNormParam& param = nnvm::get<LayerNormParam>(attrs.parsed);
if (req[0] == kNullOp) return;
CHECK_NE(req[0], kAddTo);
- int axis = param.axis;
- if (axis < 0) {
- axis += static_cast<int>(inputs[0].ndim());
- }
+ int axis = GetRealAxis(param.axis, inputs[0].ndim());
CHECK(axis >= 0 && axis < inputs[0].ndim()) << "Channel axis out of range: " << param.axis;
CHECK_EQ(inputs.size(), 3U);
Stream<xpu> *s = ctx.get_stream<xpu>();
diff --git a/src/operator/nn/layer_norm.cc b/src/operator/nn/layer_norm.cc
index 5b0aca6..1581f1a 100644
--- a/src/operator/nn/layer_norm.cc
+++ b/src/operator/nn/layer_norm.cc
@@ -27,6 +27,10 @@
#include <nnvm/op_attr_types.h>
#include "../elemwise_op_common.h"
+#if MSHADOW_USE_MKL == 1
+#include "../mkl_functions-inl.h"
+#endif
+
namespace mxnet {
namespace op {
@@ -39,10 +43,7 @@ static bool LayerNormShape(const nnvm::NodeAttrs& attrs,
using namespace mshadow;
CHECK_EQ(in_shape->size(), 3U) << "Input:[data, gamma, beta]";
const mxnet::TShape &dshape = in_shape->at(layernorm::kData);
- int axis = param.axis;
- if (axis < 0) {
- axis += dshape.ndim();
- }
+ int axis = GetRealAxis(param.axis, dshape.ndim());
CHECK(axis >= 0 && axis < dshape.ndim())
<< "Channel axis out of range: axis=" << param.axis;
@@ -64,7 +65,6 @@ static bool LayerNormShape(const nnvm::NodeAttrs& attrs,
return true;
}
-
template<>
void LayerNormCompute<cpu>(const nnvm::NodeAttrs& attrs,
const OpContext& ctx, const std::vector<TBlob>& inputs,
@@ -73,6 +73,50 @@ void LayerNormCompute<cpu>(const nnvm::NodeAttrs& attrs,
return LayerNormComputeGeneral<cpu>(attrs, ctx, inputs, req, outputs);
}
+#if MSHADOW_USE_MKL == 1
+void LayerNormComputeMKL(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<TBlob>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<TBlob>& outputs) {
+ using namespace mshadow;
+ const LayerNormParam& param = nnvm::get<LayerNormParam>(attrs.parsed);
+ if (req[0] == kNullOp) return;
+ CHECK_NE(req[0], kAddTo);
+ CHECK_EQ(inputs.size(), 3U);
+ int axis = GetRealAxis(param.axis, inputs[0].ndim());
+
+ if (axis == (inputs[layernorm::kData].ndim() - 1) ||
+ (inputs[0].type_flag_ != kFloat32 && inputs[0].type_flag_ != kFloat64)) {
+ // Compute necessary data for the reduce operation.
+ mxnet::TShape red_src_shape, red_dst_shape;
+ BroadcastReduceShapeCompact(inputs[layernorm::kData].shape_, outputs[layernorm::kMean].shape_,
+ &red_src_shape, &red_dst_shape);
+ const TBlob in_data = inputs[layernorm::kData].reshape(red_src_shape);
+ const TBlob mean_data = outputs[layernorm::kMean].reshape(red_dst_shape);
+ const TBlob std_data = outputs[layernorm::kStd].reshape(red_dst_shape);
+ const int outter_size = red_dst_shape.Size();
+ const int channel_size = red_src_shape.Size() / red_dst_shape.Size();
+
+ // call
+ MSHADOW_SGL_DBL_TYPE_SWITCH(in_data.type_flag_, DType, {
+ mkl_func::LayerNormLastDim(outter_size, channel_size,
+ in_data.dptr<DType>(),
+ outputs[layernorm::kOut].dptr<DType>(),
+ inputs[layernorm::kGamma].dptr<DType>(),
+ inputs[layernorm::kBeta].dptr<DType>(),
+ outputs[layernorm::kMean].dptr<DType>(),
+ outputs[layernorm::kStd].dptr<DType>(),
+ static_cast<DType>(param.eps));
+ });
+ } else {
+ // fallback
+ LayerNormCompute<cpu>(attrs, ctx, inputs, req, outputs);
+ }
+}
+#endif
+
+
template<>
void LayerNormGradCompute<cpu>(const nnvm::NodeAttrs& attrs,
const OpContext& ctx, const std::vector<TBlob>& inputs,
@@ -126,7 +170,11 @@ axis to be the last item in the input shape.
})
.set_attr<mxnet::FInferShape>("FInferShape", LayerNormShape)
.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<3, 3>)
+#if MSHADOW_USE_MKL == 1
+.set_attr<FCompute>("FCompute<cpu>", LayerNormComputeMKL)
+#else
.set_attr<FCompute>("FCompute<cpu>", LayerNormCompute<cpu>)
+#endif
.set_attr<nnvm::FGradient>("FGradient", [](const nnvm::NodePtr& n,
const std::vector<nnvm::NodeEntry>& ograds) {
std::vector<nnvm::NodeEntry> heads;
diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h
index 86e8b01..458106e 100644
--- a/src/operator/tensor/elemwise_unary_op.h
+++ b/src/operator/tensor/elemwise_unary_op.h
@@ -35,9 +35,10 @@
#include "../mxnet_op.h"
#include "../elemwise_op_common.h"
#include "../../ndarray/ndarray_function.h"
+
#if MSHADOW_USE_MKL == 1
-#include "mkl.h"
-#endif
+#include "../mkl_functions-inl.h"
+#endif // MSHADOW_USE_MKL == 1
namespace mxnet {
namespace op {
@@ -264,6 +265,48 @@ class UnaryOp : public OpBase {
}
}
+#if MSHADOW_USE_MKL == 1
+ template<typename OP, typename MKL_OP>
+ static void MKL_Compute(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<TBlob>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<TBlob>& outputs) {
+ if (req[0] == kNullOp) return;
+ auto type_flag = inputs[0].type_flag_;
+ size_t input_size = inputs[0].Size();
+ if ((req[0] == kWriteTo || req[0] == kWriteInplace) &&
+ mkl_func::check_size(input_size) &&
+ mkl_func::check_type(type_flag)) {
+ // set DType as float or double according to type_flag
+ MSHADOW_SGL_DBL_TYPE_SWITCH(type_flag, DType, {
+ MKL_OP::Vectorize(input_size, inputs[0].dptr<DType>(), outputs[0].dptr<DType>());
+ });
+ } else {
+ Compute<cpu, OP>(attrs, ctx, inputs, req, outputs);
+ }
+ }
+
+ template<typename OP, typename MKL_OP>
+ static void MKL_ComputeEx(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs) {
+ CHECK_EQ(inputs.size(), 1U)
+ << "Invalid input, only one input is allowed";
+ CHECK_EQ(outputs.size(), 1U)
+ << "Invalid output, only one output is allowed";
+ CHECK_NE(inputs[0].storage_type(), kDefaultStorage)
+ << "Operation requires a sparse output storage type";
+ CHECK_NE(outputs[0].storage_type(), kDefaultStorage)
+ << "Operation requires a sparse output storage type";
+ if (inputs[0].storage_shape().Size()) {
+ MapToFCompute<cpu>(attrs, ctx, inputs, req, outputs, MKL_Compute<OP, MKL_OP>);
+ }
+ }
+#endif
+
template<typename xpu, typename op>
static void ComputeWithHalf2(const nnvm::NodeAttrs &attrs,
const OpContext &ctx,
@@ -352,43 +395,6 @@ class UnaryOp : public OpBase {
LogUnimplementedOp(attrs, ctx, inputs, req, outputs);
}
}
-
-#if MSHADOW_USE_MKL == 1
- static inline void MKLLog(MKL_INT size, const float* pIn, float* pOut) {
- vsLn(size, pIn, pOut);
- }
-
- static inline void MKLLog(MKL_INT size, const double* pIn, double* pOut) {
- vdLn(size, pIn, pOut);
- }
-#endif
-
- template<typename xpu, typename OP>
- static void LogCompute(const nnvm::NodeAttrs& attrs,
- const OpContext& ctx,
- const std::vector<TBlob>& inputs,
- const std::vector<OpReqType>& req,
- const std::vector<TBlob>& outputs) {
- if (req[0] == kNullOp) return;
- // if defined MSHADOW_USE_MKL then call mkl log when req is KWriteTo, type_flag
- // is mshadow::kFloat32 or mshadow::kFloat64 and data size less than or equal MKL_INT_MAX
-#if MSHADOW_USE_MKL == 1
- auto type_flag = inputs[0].type_flag_;
- const size_t MKL_INT_MAX = (sizeof(MKL_INT) == sizeof(int)) ? INT_MAX : LLONG_MAX;
- size_t input_size = inputs[0].Size();
- if (req[0] == kWriteTo &&
- input_size <= MKL_INT_MAX &&
- (type_flag == mshadow::kFloat32 || type_flag == mshadow::kFloat64)) {
- MSHADOW_SGL_DBL_TYPE_SWITCH(type_flag, DType, {
- MKLLog(input_size, inputs[0].dptr<DType>(), outputs[0].dptr<DType>());
- });
- } else {
- Compute<xpu, OP>(attrs, ctx, inputs, req, outputs);
- }
-#else
- Compute<xpu, OP>(attrs, ctx, inputs, req, outputs);
-#endif
- }
};
/*! \brief Map legacy unary_bwd to backward_grad */
@@ -557,7 +563,7 @@ struct ReshapeLikeParam : public dmlc::Parameter<ReshapeLikeParam> {
NNVM_REGISTER_OP(__name$) \
.set_num_inputs(1) \
.set_num_outputs(1) \
- .set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<1, 1>) \
+ .set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<1, 1>) \
.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>) \
.set_attr<nnvm::FInplaceOption>("FInplaceOption", \
[](const NodeAttrs& attrs){ \
@@ -565,6 +571,38 @@ struct ReshapeLikeParam : public dmlc::Parameter<ReshapeLikeParam> {
}) \
.add_argument("data", "NDArray-or-Symbol", "The input array.")
+#if MSHADOW_USE_MKL == 1
+ /*! \bried MKL Unary compute.
+ * With this macro means mxnet compile with MKL to accelerate math function with mkl.
+ * Will Register FCompute with UnaryOp::MKL_Compute() to compelet the math function.
+ */
+ #define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(__name$, __xpu$, \
+ __kernel$, __mkl_kernel$) \
+ MXNET_OPERATOR_REGISTER_UNARY(__name$) \
+ MXNET_ADD_SPARSE_OP_ALIAS(__name$) \
+ .set_attr<FInferStorageType>("FInferStorageType", ElemwiseStorageType<1, 1, \
+ false, true, true>) \
+ .set_attr<FCompute>("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>) \
+ .set_attr<FComputeEx>("FComputeEx<" #__xpu$ ">", UnaryOp::ComputeEx<__xpu$, __kernel$>)
+
+ /*! \bried MKL Unary compute.
+ * With this macro means mxnet compile with MKL to accelerate math function with mkl.
+ * Will Register FCompute with UnaryOp::MKL_Compute() to compelet the math function.
+ */
+ #define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP(__name$, __xpu$, __kernel$, __mkl_kernel$) \
+ MXNET_OPERATOR_REGISTER_UNARY(__name$) \
+ MXNET_ADD_SPARSE_OP_ALIAS(__name$) \
+ .set_attr<FInferStorageType>("FInferStorageType", ElemwiseStorageType<1, 1, \
+ false, true, false>) \
+ .set_attr<FCompute>("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>) \
+ .set_attr<FComputeEx>("FComputeEx<" #__xpu$ ">", UnaryOp::MKL_ComputeEx<__xpu$, __kernel$>)
+
+ #define MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(__name$, __xpu$, __kernel$, \
+ __mkl_kernel$) \
+ MXNET_OPERATOR_REGISTER_UNARY(__name$) \
+ .set_attr<FCompute>("FCompute<" #__xpu$ ">", UnaryOp::MKL_Compute<__kernel$, __mkl_kernel$>)
+#endif
+
/*! \brief Unary compute, with FComputeEx for csr and rsp available */
#define MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(__name$, __xpu$, __kernel$) \
MXNET_OPERATOR_REGISTER_UNARY(__name$) \
diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc
index 1634606..7722506 100644
--- a/src/operator/tensor/elemwise_unary_op_basic.cc
+++ b/src/operator/tensor/elemwise_unary_op_basic.cc
@@ -829,6 +829,26 @@ The storage type of ``fix`` output depends upon the input storage type:
.set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes);
// square
+#if MSHADOW_USE_MKL == 1
+MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(square, cpu, mshadow_op::square, mkl_func::square)
+.describe(R"code(Returns element-wise squared value of the input.
+
+.. math::
+ square(x) = x^2
+
+Example::
+
+ square([2, 3, 4]) = [4, 9, 16]
+
+The storage type of ``square`` output depends upon the input storage type:
+
+ - square(default) = default
+ - square(row_sparse) = row_sparse
+ - square(csr) = csr
+
+)code" ADD_FILELINE)
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_square"});
+#else
MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(square, cpu, mshadow_op::square)
.describe(R"code(Returns element-wise squared value of the input.
@@ -847,6 +867,7 @@ The storage type of ``square`` output depends upon the input storage type:
)code" ADD_FILELINE)
.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_square"});
+#endif
MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(_backward_square,
unary_bwd<mshadow_op::square_grad>);
@@ -926,9 +947,14 @@ Example::
erf([0, -1., 10.]) = [0., -0.8427, 1.]
)code" ADD_FILELINE)
+#if MSHADOW_USE_MKL == 1
+.set_attr<FCompute>("FCompute<cpu>", UnaryOp::MKL_Compute<mshadow_op::erf, mkl_func::erf>)
+#else
.set_attr<FCompute>("FCompute<cpu>", UnaryOp::Compute<cpu, mshadow_op::erf>)
+#endif // MSHADOW_USE_MKL == 1
.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_erf"});
+
MXNET_OPERATOR_REGISTER_BINARY(_backward_erf)
.set_attr<FCompute>("FCompute<cpu>",
ElemwiseBinaryOp::Compute<cpu, unary_bwd<mshadow_op::erf_grad>>);
@@ -970,6 +996,23 @@ MXNET_OPERATOR_REGISTER_BINARY(_backward_rcbrt)
unary_bwd<mshadow_op::reciprocal_cube_root_grad>>);
// exp
+#if MSHADOW_USE_MKL == 1
+MXNET_MKL_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(exp, cpu, mshadow_op::exp, mkl_func::exp)
+MXNET_ADD_SPARSE_OP_ALIAS(exp)
+.describe(R"code(Returns element-wise exponential value of the input.
+
+.. math::
+ exp(x) = e^x \approx 2.718^x
+
+Example::
+
+ exp([0, 1, 2]) = [1., 2.71828175, 7.38905621]
+
+The storage type of ``exp`` output is always dense
+
+)code" ADD_FILELINE)
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseOut{"_mul"});
+#else
MXNET_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(exp, cpu, mshadow_op::exp)
MXNET_ADD_SPARSE_OP_ALIAS(exp)
.describe(R"code(Returns element-wise exponential value of the input.
@@ -985,6 +1028,7 @@ The storage type of ``exp`` output is always dense
)code" ADD_FILELINE)
.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseOut{"_mul"});
+#endif
// log
MXNET_OPERATOR_REGISTER_UNARY(log)
@@ -996,7 +1040,11 @@ The natural logarithm is logarithm in base *e*, so that ``log(exp(x)) = x``
The storage type of ``log`` output is always dense
)code" ADD_FILELINE)
-.set_attr<FCompute>("FCompute<cpu>", UnaryOp::LogCompute<cpu, mshadow_op::log>)
+#if MSHADOW_USE_MKL == 1
+.set_attr<FCompute>("FCompute<cpu>", UnaryOp::MKL_Compute<mshadow_op::log, mkl_func::log>)
+#else
+.set_attr<FCompute>("FCompute<cpu>", UnaryOp::Compute<cpu, mshadow_op::log>)
+#endif // MSHADOW_USE_MKL == 1
.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_log"});
// log10
diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index 2a1583e..710686d 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -2201,6 +2201,54 @@ def test_context_num_gpus():
# Test that num_gpus reports at least one GPU, as the test is run on a GPU host.
assert mx.context.num_gpus() > 0
+def math_log(shape, dtype, check_value):
+ np_x = np.random.rand(*tuple(shape))
+ x = mx.nd.array(np_x, dtype=dtype)
+ y = mx.nd.log(data=x)
+ if check_value:
+ x_ = x.as_in_context(mx.cpu())
+ y_ = mx.nd.log(data=x_)
+ assert_almost_equal(y.asnumpy(), y_.asnumpy())
+
+def math_erf(shape, dtype, check_value):
+ np_x = np.random.rand(*tuple(shape))
+ x = mx.nd.array(np_x, dtype=dtype)
+ y = mx.nd.erf(data=x)
+ if check_value:
+ x_ = x.as_in_context(mx.cpu())
+ y_ = mx.nd.erf(data=x_)
+ assert_almost_equal(y.asnumpy(), y_.asnumpy())
+
+def math_square(shape, dtype, check_value):
+ np_x = np.random.rand(*tuple(shape))
+ x = mx.nd.array(np_x, dtype=dtype)
+ y = mx.nd.square(data=x)
+ if check_value:
+ x_ = x.as_in_context(mx.cpu())
+ y_ = mx.nd.square(data=x_)
+ assert_almost_equal(y.asnumpy(), y_.asnumpy())
+
+def run_math(op, shape, dtype="float32", check_value=True):
+ run_num = 10
+ for i in range(run_num):
+ if op == 'log':
+ math_log(shape=shape, dtype=dtype, check_value=check_value)
+ elif op == 'erf':
+ math_erf(shape=shape, dtype=dtype, check_value=check_value)
+ elif op == 'square':
+ math_square(shape=shape, dtype=dtype, check_value=check_value)
+
+@with_seed()
+def test_math():
+ ops = ['log', 'erf', 'square']
+ check_value= True
+ shape_lst = [[1000], [100,1000], [10,100,100], [10,100,100,100]]
+ dtypes = ["float32", "float64"]
+ for shape in shape_lst:
+ for dtype in dtypes:
+ for op in ops:
+ run_math(op, shape, dtype, check_value=check_value)
+
if __name__ == '__main__':
import nose
nose.runmodule()