Posted to commits@mxnet.apache.org by ta...@apache.org on 2019/10/14 15:40:30 UTC

[incubator-mxnet] branch mkldnn-v1.0 updated (9f77575 -> 43e35a9)

This is an automated email from the ASF dual-hosted git repository.

taolv pushed a change to branch mkldnn-v1.0
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git.


    from 9f77575  [mkldnn-v1.0] Update enabling flag for MKL dropout (#16433)
     add d8193c6  Update add_op_in_backend.md (#16403)
     add 7f5e687  numpy-compatible histogram (#16266)
     add ca30ba8  Pseudo 2D transpose kernel (#16229)
     add d2d76dc  increase docker cache timeout (#16430)
     add 4dee4ee  Fix mkldnn reshape (#16455)
     add 1e8cc90  [BUGFIX] Minor type issues in Squeeze (#16448)
     add 858a52e  Fix large array tests (#16328)
     add 6d6e46b  Comparison ops implemented using mshadow (#16414)
     new 43e35a9  Merge remote-tracking branch 'origin/master' into mkldnn-v1.0

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 CMakeLists.txt                                     |   6 +-
 ci/docker_cache.py                                 |   2 +-
 .../src/pages/api/faq/add_op_in_backend.md         |   2 +-
 python/mxnet/_numpy_op_doc.py                      |  51 +++
 python/mxnet/ndarray/numpy/_op.py                  |  53 +++-
 python/mxnet/numpy/multiarray.py                   |  53 +++-
 python/mxnet/numpy_extension/__init__.py           |   2 +-
 python/mxnet/symbol/numpy/_symbol.py               |  85 ++++-
 python/mxnet/test_utils.py                         |  33 +-
 python/mxnet/util.py                               |  61 ++++
 src/common/utils.h                                 |  15 +
 src/ndarray/ndarray_function.cc                    |  13 +-
 src/ndarray/ndarray_function.cu                    |   4 -
 src/operator/contrib/index_copy-inl.h              |   2 +-
 src/operator/contrib/index_copy.cc                 |   4 +-
 src/operator/leaky_relu-inl.h                      |   2 +-
 src/operator/mshadow_op.h                          |  30 ++
 src/operator/mxnet_op.h                            |  20 ++
 src/operator/nn/dropout-inl.h                      |   4 +-
 src/operator/nn/mkldnn/mkldnn_base-inl.h           |   1 -
 src/operator/nn/mkldnn/mkldnn_base.cc              |   6 +-
 src/operator/nn/mkldnn/mkldnn_expand_dims.cc       |  70 -----
 src/operator/nn/mkldnn/mkldnn_flatten-inl.h        |  48 ---
 src/operator/nn/mkldnn/mkldnn_flatten.cc           |  79 -----
 src/operator/nn/mkldnn/mkldnn_ops-inl.h            |  29 +-
 src/operator/nn/mkldnn/mkldnn_reshape-inl.h        | 152 ++++-----
 src/operator/nn/mkldnn/mkldnn_reshape.cc           |  95 +++---
 .../numpy/np_elemwise_broadcast_logic_op.cc        | 301 ++++++++++++++++++
 .../numpy/np_elemwise_broadcast_logic_op.cu        |  60 ++++
 src/operator/numpy/np_elemwise_broadcast_op.cc     | 223 -------------
 src/operator/numpy/np_elemwise_unary_op_basic.cc   |  65 ++--
 src/operator/numpy/np_elemwise_unary_op_basic.cu   |   9 +-
 src/operator/numpy/np_matrix_op-inl.h              |   4 +-
 src/operator/numpy/np_matrix_op.cc                 |  57 +++-
 src/operator/operator_tune.cc                      |  10 +
 .../mkldnn/mkldnn_quantized_flatten.cc             |   4 +-
 src/operator/tensor/elemwise_binary_broadcast_op.h |  42 ++-
 src/operator/tensor/elemwise_binary_op.h           |  26 ++
 src/operator/tensor/elemwise_binary_scalar_op.h    |  20 ++
 src/operator/tensor/elemwise_unary_op.h            |  21 +-
 src/operator/tensor/elemwise_unary_op_basic.cc     |   1 +
 src/operator/tensor/histogram.cc                   |   1 +
 src/operator/tensor/matrix_op-inl.h                |  17 +-
 src/operator/tensor/matrix_op.cc                   | 207 ++----------
 src/operator/tensor/pseudo2DTranspose_op-inl.cuh   | 348 +++++++++++++++++++++
 tests/nightly/test_large_array.py                  | 147 ++++-----
 tests/nightly/test_large_vector.py                 |  20 +-
 tests/python/unittest/common.py                    |  21 ++
 tests/python/unittest/test_numpy_ndarray.py        |  33 +-
 tests/python/unittest/test_numpy_op.py             |  45 ++-
 tests/python/unittest/test_operator.py             |  39 +++
 51 files changed, 1667 insertions(+), 976 deletions(-)
 delete mode 100644 src/operator/nn/mkldnn/mkldnn_expand_dims.cc
 delete mode 100644 src/operator/nn/mkldnn/mkldnn_flatten-inl.h
 delete mode 100644 src/operator/nn/mkldnn/mkldnn_flatten.cc
 create mode 100644 src/operator/numpy/np_elemwise_broadcast_logic_op.cc
 create mode 100644 src/operator/numpy/np_elemwise_broadcast_logic_op.cu
 create mode 100644 src/operator/tensor/pseudo2DTranspose_op-inl.cuh


[incubator-mxnet] 01/01: Merge remote-tracking branch 'origin/master' into mkldnn-v1.0

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

taolv pushed a commit to branch mkldnn-v1.0
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git

commit 43e35a9d3a17d5fd6c6757d9862489006d03f817
Merge: 9f77575 6d6e46b
Author: Tao Lv <ta...@intel.com>
AuthorDate: Tue Oct 15 08:07:33 2019 +0800

    Merge remote-tracking branch 'origin/master' into mkldnn-v1.0
    
    Conflicts:
    	src/operator/nn/mkldnn/mkldnn_base-inl.h
    	src/operator/nn/mkldnn/mkldnn_flatten-inl.h
    	src/operator/nn/mkldnn/mkldnn_flatten.cc
    	src/operator/nn/mkldnn/mkldnn_ops-inl.h
    	src/operator/nn/mkldnn/mkldnn_reshape-inl.h
    	src/operator/nn/mkldnn/mkldnn_reshape.cc
    	src/operator/quantization/mkldnn/mkldnn_quantized_flatten.cc
    	src/operator/tensor/matrix_op.cc


diff --cc src/operator/nn/mkldnn/mkldnn_ops-inl.h
index bba76a3,c0218f4..23f059b
--- a/src/operator/nn/mkldnn/mkldnn_ops-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_ops-inl.h
@@@ -35,10 -36,7 +35,9 @@@
  #include <dmlc/logging.h>
  #include <dmlc/optional.h>
  #include <vector>
 +
 +#if MXNET_USE_MKLDNN == 100
  #include <mkldnn.hpp>
- #endif
  
  namespace mxnet {
  namespace op {
@@@ -148,19 -130,9 +132,8 @@@ void MKLDNNReshapeForward(const nnvm::N
                            const NDArray &input,
                            const OpReqType &req,
                            const NDArray &output);
- void MKLDNNFlattenForward(const nnvm::NodeAttrs &attrs,
-                           const OpContext &ctx,
-                           const NDArray &input,
-                           const OpReqType &req,
-                           const NDArray &output);
- void MKLDNNExpandDimsForward(const nnvm::NodeAttrs &attrs,
-                              const OpContext &ctx,
-                              const NDArray &input,
-                              const OpReqType &req,
-                              const NDArray &output);
- #endif
--
  }  // namespace op
  }  // namespace mxnet
 -#endif  // MXNET_USE_MKLDNN == 1
  
++#endif  // MXNET_USE_MKLDNN == 100
  #endif  // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_
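
A note on the guard above: across this branch the compile-time sentinel is bumped from MXNET_USE_MKLDNN == 1 to == 100 so that the MKL-DNN 1.0 code path is selected at build time. Schematically (an illustrative pattern, not a file from the repository):

  // Guard convention on the mkldnn-v1.0 branch: 100 denotes a build against
  // the MKL-DNN 1.0.x series, 1 the legacy 0.x integration.
  #if MXNET_USE_MKLDNN == 100
  #include <mkldnn.hpp>   // 1.0 API: memory::desc + format_tag, explicit arg maps
  // ... declarations that need the 1.0 API ...
  #endif  // MXNET_USE_MKLDNN == 100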
diff --cc src/operator/nn/mkldnn/mkldnn_reshape-inl.h
index aa0f11c,726d721..8c6d38e
--- a/src/operator/nn/mkldnn/mkldnn_reshape-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_reshape-inl.h
@@@ -1,91 -1,59 +1,61 @@@
--/*
-- * Licensed to the Apache Software Foundation (ASF) under one
-- * or more contributor license agreements.  See the NOTICE file
-- * distributed with this work for additional information
-- * regarding copyright ownership.  The ASF licenses this file
-- * to you under the Apache License, Version 2.0 (the
-- * "License"); you may not use this file except in compliance
-- * with the License.  You may obtain a copy of the License at
-- *
-- *   http://www.apache.org/licenses/LICENSE-2.0
-- *
-- * Unless required by applicable law or agreed to in writing,
-- * software distributed under the License is distributed on an
-- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-- * KIND, either express or implied.  See the License for the
-- * specific language governing permissions and limitations
-- * under the License.
-- */
--
--/*!
-- *  Copyright (c) 2019 by Contributors
-- * \file mkldnn_reshape-inl.h
-- * \brief Function definition of mkldnn reshape operator
-- */
--
--#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_RESHAPE_INL_H_
--#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_RESHAPE_INL_H_
--
- #if MXNET_USE_MKLDNN == 100
 -#if MXNET_USE_MKLDNN == 1
--#include <vector>
--#include "mkldnn_base-inl.h"
--#include "../../tensor/matrix_op-inl.h"
--
--namespace mxnet {
--namespace op {
--
--class MKLDNNReshapeFwd {
-  protected:
 - public:
 -  MKLDNNReshapeFwd(const OpReqType &req, const NDArray &input, const NDArray &output);
 -  int GetWorkspaceSize();
 -  void SetNewMem(const NDArray &input, const NDArray &output, void *workspace = nullptr);
 -  void Execute(const NDArray &input, const NDArray &output, void *workspace = nullptr);
 -
 - private:
 -  std::shared_ptr<mkldnn::memory> data_;
--  std::shared_ptr<mkldnn::memory> out_;
--  std::shared_ptr<mkldnn::memory> temp_;
--  std::vector<mkldnn::primitive> prims_;
-   bool needInvalidateInput = false;
- 
-  public:
-   MKLDNNReshapeFwd(const OpReqType &req,
-                    const NDArray &input,
-                    const NDArray &output);
-   int GetWorkspaceSize();
-   void Execute(const NDArray &input,
-                const NDArray &output,
-                const OpReqType &req,
-                void* workspace = nullptr);
--};
- 
- typedef ParamOpSign<ReshapeParam> MKLDNNReshapeSignature;
- 
- template<typename MKLDNNOpFwdType, typename ParamType, typename MKLDNNSigatureType>
- MKLDNNOpFwdType &GetCachedForward(const ParamType& param,
-                                   const OpReqType &req,
-                                   const NDArray &input,
-                                   const NDArray &output) {
- #if DMLC_CXX11_THREAD_LOCAL
-   static thread_local std::unordered_map<MKLDNNSigatureType,
-                                          MKLDNNOpFwdType, OpHash> fwds;
- #else
-   static MX_THREAD_LOCAL std::unordered_map<MKLDNNSigatureType,
-                                             MKLDNNOpFwdType, OpHash> fwds;
- #endif
-   MKLDNNSigatureType key(param);
-   key.AddSign(req);
-   key.AddSign(input);
-   key.AddSign(output);
- 
-   auto it = fwds.find(key);
-   if (it == fwds.end()) {
-     MKLDNNOpFwdType fwd(req, input, output);
-     it = AddToCache(&fwds, key, fwd);
-   }
-   return it->second;
- }
--
- MKLDNNReshapeFwd &GetReshapeForward(const ReshapeParam& param,
-                                     const OpReqType &req,
-                                     const NDArray &input,
 -typedef OpSignature MKLDNNReshapeSignature;
 -MKLDNNReshapeFwd &GetReshapeForward(const OpReqType &req, const NDArray &input,
--                                    const NDArray &output);
--
--}  // namespace op
--}  // namespace mxnet
--
--#endif  // MXNET_USE_MKLDNN == 1
--#endif  // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_RESHAPE_INL_H_
++/*
++ * Licensed to the Apache Software Foundation (ASF) under one
++ * or more contributor license agreements.  See the NOTICE file
++ * distributed with this work for additional information
++ * regarding copyright ownership.  The ASF licenses this file
++ * to you under the Apache License, Version 2.0 (the
++ * "License"); you may not use this file except in compliance
++ * with the License.  You may obtain a copy of the License at
++ *
++ *   http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing,
++ * software distributed under the License is distributed on an
++ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
++ * KIND, either express or implied.  See the License for the
++ * specific language governing permissions and limitations
++ * under the License.
++ */
++
++/*!
++ *  Copyright (c) 2019 by Contributors
++ * \file mkldnn_reshape-inl.h
++ * \brief Function definition of mkldnn reshape operator
++ */
++
++#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_RESHAPE_INL_H_
++#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_RESHAPE_INL_H_
++
++#if MXNET_USE_MKLDNN == 100
++#include <vector>
++#include "mkldnn_base-inl.h"
++#include "../../tensor/matrix_op-inl.h"
++
++namespace mxnet {
++namespace op {
++
++class MKLDNNReshapeFwd {
++ protected:
++  std::shared_ptr<mkldnn::memory> out_;
++  std::shared_ptr<mkldnn::memory> temp_;
++  std::vector<mkldnn::primitive> prims_;
++
++ public:
++  MKLDNNReshapeFwd(const OpReqType &req,
++                   const NDArray &input,
++                   const NDArray &output);
++  int GetWorkspaceSize();
++  void Execute(const NDArray &input,
++               const NDArray &output,
++               const OpReqType &req,
++               void* workspace = nullptr);
++};
++
++typedef OpSignature MKLDNNReshapeSignature;
++MKLDNNReshapeFwd &GetReshapeForward(const OpReqType &req, const NDArray &input,
++                                    const NDArray &output);
++}  // namespace op
++}  // namespace mxnet
++
++#endif  // MXNET_USE_MKLDNN == 100
++#endif  // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_RESHAPE_INL_H_
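
The rewritten header keeps only the forward class plus a cached getter keyed by an OpSignature (req + input); the cache itself appears in the mkldnn_reshape.cc hunk below. Stripped of the MXNet-specific helpers (OpSignature, OpHash, AddToCache), the per-thread memoization it relies on is roughly the following self-contained sketch (illustrative names, not code from the repository):

  #include <cstddef>
  #include <functional>
  #include <unordered_map>

  // Stand-ins for MXNet's signature/forward types, for illustration only.
  struct FwdKey {
    int req;
    std::size_t input_id;
    bool operator==(const FwdKey &o) const {
      return req == o.req && input_id == o.input_id;
    }
  };
  struct FwdKeyHash {
    std::size_t operator()(const FwdKey &k) const {
      return std::hash<std::size_t>()(k.input_id) ^ (std::hash<int>()(k.req) << 1);
    }
  };
  struct ReshapeFwd { /* primitives built once per unique (req, input) pair */ };

  ReshapeFwd &GetCachedReshapeFwd(int req, std::size_t input_id) {
    // One cache per thread: no locking, and primitives are reused across calls.
    static thread_local std::unordered_map<FwdKey, ReshapeFwd, FwdKeyHash> fwds;
    FwdKey key{req, input_id};
    auto it = fwds.find(key);
    if (it == fwds.end())
      it = fwds.emplace(key, ReshapeFwd{}).first;  // construct only on first use
    return it->second;
  }

Caching per thread reuses MKL-DNN primitives across forward calls without taking a lock around the map, which is the same trade-off the DMLC_CXX11_THREAD_LOCAL / MX_THREAD_LOCAL branches in the .cc file encode.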
diff --cc src/operator/nn/mkldnn/mkldnn_reshape.cc
index d180125,9c226a0..1c1e72f
--- a/src/operator/nn/mkldnn/mkldnn_reshape.cc
+++ b/src/operator/nn/mkldnn/mkldnn_reshape.cc
@@@ -23,62 -23,38 +23,39 @@@
   * \author Tao Lv
  */
  
 -#if MXNET_USE_MKLDNN == 1
 +#if MXNET_USE_MKLDNN == 100
- 
- #include <mkldnn.hpp>
- #include "mkldnn_reshape-inl.h"
+ #include "../../tensor/elemwise_unary_op.h"
+ #include "./mkldnn_ops-inl.h"
+ #include "./mkldnn_base-inl.h"
+ #include "./mkldnn_reshape-inl.h"
  
  namespace mxnet {
  namespace op {
  
- bool SupportMKLDNNReshape(const NDArray &in_data,
-                           const NDArray &out_data) {
-   auto in_ndim = in_data.shape().ndim();
-   auto out_ndim = out_data.shape().ndim();
- 
-   if (in_ndim > 4 ||
-       in_data.dtype() != mshadow::kFloat32 ||
-       out_ndim > 4)
-     return false;
- 
-   return true;
- }
- 
- MKLDNNReshapeFwd::MKLDNNReshapeFwd(const OpReqType &req,
-                                    const NDArray &input,
+ MKLDNNReshapeFwd::MKLDNNReshapeFwd(const OpReqType &req, const NDArray &input,
                                     const NDArray &output) {
-   auto engine = CpuEngine::Get()->get_engine();
- 
-   // source
+   const auto engine = CpuEngine::Get()->get_engine();
 -  data_ = std::make_shared<mkldnn::memory>(input.GetMKLDNNData()->get_primitive_desc(), nullptr);
 +  auto in_mem = input.GetMKLDNNData();
-   auto in_md = in_mem->get_desc();
- 
-   // temp_
-   auto temp_md = GetDesc(in_md, GetDefaultFormat(in_md));
-   temp_ = std::make_shared<mkldnn::memory>(temp_md, engine, nullptr);
 +
-   // destination
-   out_ = std::make_shared<mkldnn::memory>(temp_md, engine, nullptr);
+   // Create temp memory
+   auto temp_dims = mkldnn::memory::dims(input.shape().begin(), input.shape().end());
+   auto temp_type = static_cast<mkldnn::memory::data_type>(get_mkldnn_type(input.dtype()));
 -  auto temp_fmt = static_cast<mkldnn::memory::format>(GetDefaultFormat(input.shape().ndim()));
++  auto temp_fmt = static_cast<mkldnn::memory::format_tag>(GetDefaultFormat(input.shape().ndim()));
+   auto temp_desc = mkldnn::memory::desc(temp_dims, temp_type, temp_fmt);
 -  auto temp_pd = mkldnn::memory::primitive_desc(temp_desc, engine);
 -  out_ = std::make_shared<mkldnn::memory>(temp_pd, nullptr);
 +
++  out_ = std::make_shared<mkldnn::memory>(temp_desc, engine, nullptr);
    if (req == kWriteInplace) {
      // If the input has MKL-DNN internal layout, we need reorder it to a temporal buffer with
      // default layout and copy from the temporal buffer back to output buffer which has the same
      // address with input buffer.
      // If the input has default layout, then nothing need to do.
      if (input.IsMKLDNNData()) {
 -      temp_ = std::make_shared<mkldnn::memory>(temp_pd, nullptr);
 -      prims_.push_back(mkldnn::reorder(*data_, *temp_));  // reorder to default
++      temp_ = std::make_shared<mkldnn::memory>(temp_desc, engine, nullptr);
 +      prims_.push_back(mkldnn::reorder(*in_mem, *temp_));  // reorder to default
        prims_.push_back(mkldnn::reorder(*temp_, *out_));   // copy back
-       needInvalidateInput = true;
      }
    } else if (req == kWriteTo) {
-     if (input.IsMKLDNNData()) {
-       prims_.push_back(mkldnn::reorder(*in_mem, *temp_));   // reorder to default
-       prims_.push_back(mkldnn::reorder(*temp_, *out_));     // copy to the output buffer
-       needInvalidateInput = false;
-     } else {
-       prims_.push_back(mkldnn::reorder(*in_mem, *out_));    // copy directly from input to output
-       needInvalidateInput = false;
-     }
 -    prims_.push_back(mkldnn::reorder(*data_, *out_));
++    prims_.push_back(mkldnn::reorder(*in_mem, *out_));
    } else {
      LOG(FATAL) << "not supported req type: " << req;
    }
@@@ -90,37 -76,41 +67,57 @@@ int MKLDNNReshapeFwd::GetWorkspaceSize(
  
  void MKLDNNReshapeFwd::Execute(const NDArray &input,
                                 const NDArray &output,
 +                               const OpReqType &req,
                                 void* workspace) {
 -  if (this->prims_.size()) {
 -    // set memory handles
 -    SetNewMem(input, output, workspace);
 -    // register primitives
 -    auto stream = MKLDNNStream::Get();
 -    for (auto &v : this->prims_) {
 -      stream->RegisterPrim(v);
 +  auto stream = MKLDNNStream::Get();
 +  auto in_mem = input.GetMKLDNNData();
 +  // register primitives and arguments
 +  std::vector<mkldnn_args_map_t> args_map;
 +  size_t prims_size = prims_.size();
 +  if (prims_size == 1) {
 +    args_map.push_back({{MKLDNN_ARG_FROM, *in_mem},
 +                        {MKLDNN_ARG_TO, *output.GetMKLDNNData()}});
 +  } else if (prims_size == 2) {
 +    if (workspace) {
 +      temp_->set_data_handle(workspace);
      }
 -    stream->Submit();
 +    args_map.push_back({{MKLDNN_ARG_FROM, *in_mem},
 +                        {MKLDNN_ARG_TO, *temp_}});
 +    args_map.push_back({{MKLDNN_ARG_FROM, *temp_},
 +                        {MKLDNN_ARG_TO, *output.GetMKLDNNData()}});
 +  } else {
 +    CHECK(prims_size == 0 && req != kWriteTo)
 +          << "kWriteTo should never reach here.";
 +  }
 +
 +  for (size_t i = 0; i < prims_size; i++) {
 +    stream->RegisterPrimArgs(prims_[i], args_map[i]);
    }
 +  stream->Submit();
-   // invalidate mkldnn memory in input
-   if (needInvalidateInput) {
-     const_cast<NDArray &>(input).InvalidateMKLDNNData();
+   // invalidate mkldnn memory in output
+   const_cast<NDArray &>(output).InvalidateMKLDNNData();
+ }
+ 
+ MKLDNNReshapeFwd &GetReshapeForward(const OpReqType &req,
+                                     const NDArray &input,
+                                     const NDArray &output) {
+ #if DMLC_CXX11_THREAD_LOCAL
+   static thread_local std::unordered_map<MKLDNNReshapeSignature,
+                                          MKLDNNReshapeFwd, OpHash> fwds;
+ #else
+   static MX_THREAD_LOCAL std::unordered_map<MKLDNNReshapeSignature,
+                                             MKLDNNReshapeFwd, OpHash> fwds;
+ #endif
+   MKLDNNReshapeSignature key;
+   key.AddSign(req);
+   key.AddSign(input);
+ 
+   auto it = fwds.find(key);
+   if (it == fwds.end()) {
+     MKLDNNReshapeFwd fwd(req, input, output);
+     it = AddToCache(&fwds, key, fwd);
    }
+   return it->second;
  }
  
  void MKLDNNReshapeForward(const nnvm::NodeAttrs& attrs,
@@@ -141,11 -135,11 +142,11 @@@
      mshadow::Stream<cpu> *s = ctx.get_stream<cpu>();
      mshadow::Tensor<cpu, 1, char> ws = ctx.requested[0]
        .get_space_typed<cpu, 1, char>(mshadow::Shape1(ws_size), s);
-     ws_ptr = reinterpret_cast<void*>(ws.dptr_);
+     ws_ptr = static_cast<void*>(ws.dptr_);
    }
- 
 -  fwd.Execute(input, output, ws_ptr);
 +  fwd.Execute(input, output, req, ws_ptr);
  }
+ 
  }  // namespace op
  }  // namespace mxnet
  #endif
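
The most visible API change in this file is how reorders are built and launched under MKL-DNN 1.0: memory descriptors are created from dims, data type and format_tag (primitive_desc is gone), memories are bound to an engine, and primitives are executed with an explicit argument map. Outside of MXNet's CpuEngine and MKLDNNStream wrappers, the same pattern reduces to roughly this standalone sketch (illustrative only, not code from the repository):

  #include <mkldnn.hpp>

  int main() {
    mkldnn::engine eng(mkldnn::engine::kind::cpu, 0);
    mkldnn::stream strm(eng);

    // Plain (default-layout) f32 buffer of shape 2x3x4.
    mkldnn::memory::desc md({2, 3, 4}, mkldnn::memory::data_type::f32,
                            mkldnn::memory::format_tag::abc);
    mkldnn::memory src(md, eng);  // let the library allocate the handles here
    mkldnn::memory dst(md, eng);

    // A 1.0 reorder is built directly from the two memories and executed
    // with an explicit {FROM, TO} argument map on a stream.
    mkldnn::reorder r(src, dst);
    r.execute(strm, {{MKLDNN_ARG_FROM, src}, {MKLDNN_ARG_TO, dst}});
    strm.wait();
    return 0;
  }

MXNet queues the same kind of (primitive, argument-map) pairs through MKLDNNStream::RegisterPrimArgs and runs them together on Submit(), which is what the Execute() changes above do.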
diff --cc src/operator/quantization/mkldnn/mkldnn_quantized_flatten.cc
index 2416c12,c059f98..d50f968
--- a/src/operator/quantization/mkldnn/mkldnn_quantized_flatten.cc
+++ b/src/operator/quantization/mkldnn/mkldnn_quantized_flatten.cc
@@@ -23,8 -23,8 +23,8 @@@
   * \brief
   */
  
 -#if MXNET_USE_MKLDNN == 1
 +#if MXNET_USE_MKLDNN == 100
- #include "../../nn/mkldnn/mkldnn_flatten-inl.h"
+ #include "../../nn/mkldnn/mkldnn_ops-inl.h"
  #include "../quantization_utils.h"
  
  namespace mxnet {
diff --cc src/operator/tensor/matrix_op.cc
index 6bf1ec0,99fba15..bd683c9
--- a/src/operator/tensor/matrix_op.cc
+++ b/src/operator/tensor/matrix_op.cc
@@@ -116,13 -114,9 +116,9 @@@ static void ReshapeComputeExCPU(const n
    CHECK_EQ(inputs.size(), 1U);
    CHECK_EQ(outputs.size(), 1U);
    // If inputs are supposed to be in MKLDNN format and
--  // MKLDNNsupport the data type or the shape. Then convert
++  // MKLDNN support the data type or the shape. Then convert
    // it to the output format and shape
-   if (SupportMKLDNNReshape(inputs[0], outputs[0])) {
-     MKLDNNReshapeForward(attrs, ctx, inputs[0], req[0], outputs[0]);
-     return;
-   }
-   FallBackCompute(UnaryOp::IdentityCompute<cpu>, attrs, ctx, inputs, req, outputs);
+   MKLDNNReshapeForward(attrs, ctx, inputs[0], req[0], outputs[0]);
  }
  
  inline static bool ReshapeStorageType(const nnvm::NodeAttrs& attrs,
@@@ -140,66 -134,66 +136,42 @@@
  NNVM_REGISTER_OP(Reshape)
  .add_alias("reshape")
  .describe(R"code(Reshapes the input array.
--
  .. note:: ``Reshape`` is deprecated, use ``reshape``
--
  Given an array and a shape, this function returns a copy of the array in the new shape.
  The shape is a tuple of integers such as (2,3,4). The size of the new shape should be same as the size of the input array.
--
  Example::
--
    reshape([1,2,3,4], shape=(2,2)) = [[1,2], [3,4]]
--
  Some dimensions of the shape can take special values from the set {0, -1, -2, -3, -4}. The significance of each is explained below:
--
  - ``0``  copy this dimension from the input to the output shape.
--
    Example::
--
    - input shape = (2,3,4), shape = (4,0,2), output shape = (4,3,2)
    - input shape = (2,3,4), shape = (2,0,0), output shape = (2,3,4)
--
  - ``-1`` infers the dimension of the output shape by using the remainder of the input dimensions
    keeping the size of the new array same as that of the input array.
    At most one dimension of shape can be -1.
--
    Example::
--
    - input shape = (2,3,4), shape = (6,1,-1), output shape = (6,1,4)
    - input shape = (2,3,4), shape = (3,-1,8), output shape = (3,1,8)
    - input shape = (2,3,4), shape=(-1,), output shape = (24,)
--
  - ``-2`` copy all/remainder of the input dimensions to the output shape.
--
    Example::
--
    - input shape = (2,3,4), shape = (-2,), output shape = (2,3,4)
    - input shape = (2,3,4), shape = (2,-2), output shape = (2,3,4)
    - input shape = (2,3,4), shape = (-2,1,1), output shape = (2,3,4,1,1)
--
  - ``-3`` use the product of two consecutive dimensions of the input shape as the output dimension.
--
    Example::
--
    - input shape = (2,3,4), shape = (-3,4), output shape = (6,4)
    - input shape = (2,3,4,5), shape = (-3,-3), output shape = (6,20)
    - input shape = (2,3,4), shape = (0,-3), output shape = (2,12)
    - input shape = (2,3,4), shape = (-3,-2), output shape = (6,4)
--
  - ``-4`` split one dimension of the input into two dimensions passed subsequent to -4 in shape (can contain -1).
--
    Example::
--
    - input shape = (2,3,4), shape = (-4,1,2,-2), output shape =(1,2,3,4)
    - input shape = (2,3,4), shape = (2,-4,-1,3,-2), output shape = (2,1,3,4)
--
  If the argument `reverse` is set to 1, then the special values are inferred from right to left.
--
    Example::
--
    - without reverse=1, for input shape = (10,5,4), shape = (-1,0), output shape would be (40,5)
    - with reverse=1, output shape will be (50,4).
--
  )code" ADD_FILELINE)
  .set_num_inputs(1)
  .set_num_outputs(1)
@@@ -227,6 -221,7 +199,7 @@@
  .add_argument("data", "NDArray-or-Symbol", "Input data to reshape.")
  .add_arguments(ReshapeParam::__FIELDS__());
  
 -#if MXNET_USE_MKLDNN == 1
++#if MXNET_USE_MKLDNN == 100
  static void FlattenEx(const nnvm::NodeAttrs& attrs,
                        const OpContext& ctx,
                        const std::vector<NDArray>& inputs,
@@@ -234,22 -229,12 +207,12 @@@
                        const std::vector<NDArray>& outputs) {
    CHECK_EQ(inputs.size(), 1U);
    CHECK_EQ(outputs.size(), 1U);
- #if MXNET_USE_MKLDNN == 100
-   auto data_ndim = inputs[0].shape().ndim();
-   if (data_ndim <= 4 && inputs[0].dtype() == mshadow::kFloat32) {
-     MKLDNNFlattenForward(attrs, ctx, inputs[0], req[0], outputs[0]);
-     return;
-   } else {
-     // This happens if inputs are supposed to be in MKLDNN format
-     // but MKLDNN doesn't support the data type or the shape. We're
-     // forced to convert it to the default format.
-     FallBackCompute(UnaryOp::IdentityCompute<cpu>, attrs, ctx, inputs, req, outputs);
-     return;
-   }
- #endif
+   // If inputs are supposed to be in MKLDNN format and
 -  // MKLDNNsupport the data type or the shape. Then convert
++  // MKLDNN support the data type or the shape. Then convert
+   // it to the output format and shape
+   MKLDNNReshapeForward(attrs, ctx, inputs[0], req[0], outputs[0]);
  }
  
- #if MXNET_USE_MKLDNN == 100
  static inline bool FlattenStorageType(const nnvm::NodeAttrs& attrs,
                                        const int dev_mask,
                                        DispatchMode* dispatch_mode,
@@@ -266,17 -251,17 +229,12 @@@ NNVM_REGISTER_OP(Flatten
  .add_alias("flatten")
  .add_alias("_npx_batch_flatten")
  .describe(R"code(Flattens the input array into a 2-D array by collapsing the higher dimensions.
--
  .. note:: `Flatten` is deprecated. Use `flatten` instead.
--
  For an input array with shape ``(d1, d2, ..., dk)``, `flatten` operation reshapes
  the input array into an output array of shape ``(d1, d2*...*dk)``.
--
  Note that the behavior of this function is different from numpy.ndarray.flatten,
  which behaves similar to mxnet.ndarray.reshape((-1,)).
--
  Example::
--
      x = [[
          [1,2,3],
          [4,5,6],
@@@ -286,23 -271,21 +244,19 @@@
          [4,5,6],
          [7,8,9]
      ]],
--
      flatten(x) = [[ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9.],
         [ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9.]]
--
  )code" ADD_FILELINE)
  .set_num_inputs(1)
  .set_num_outputs(1)
  .set_attr<mxnet::FInferShape>("FInferShape", FlattenShape)
  .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>)
- #if MXNET_USE_MKLDNN == 100
- .set_attr<FInferStorageType>("FInferStorageType", FlattenStorageType)
- #endif
  .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{ "_backward_copy" })
  .set_attr<FCompute>("FCompute<cpu>", UnaryOp::IdentityCompute<cpu>)
- .set_attr<FComputeEx>("FComputeEx<cpu>", FlattenEx)
 -#if MXNET_USE_MKLDNN == 1
 +#if MXNET_USE_MKLDNN == 100
  .set_attr<bool>("TIsMKLDNN", true)
+ .set_attr<FComputeEx>("FComputeEx<cpu>", FlattenEx)
+ .set_attr<FInferStorageType>("FInferStorageType", FlattenStorageType)
  .set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
  })
@@@ -351,30 -334,30 +305,21 @@@ inline static bool TransposeStorageType
  
  NNVM_REGISTER_OP(transpose)
  .describe(R"code(Permutes the dimensions of an array.
--
  Examples::
--
    x = [[ 1, 2],
         [ 3, 4]]
--
    transpose(x) = [[ 1.,  3.],
                    [ 2.,  4.]]
--
    x = [[[ 1.,  2.],
          [ 3.,  4.]],
--
         [[ 5.,  6.],
          [ 7.,  8.]]]
--
    transpose(x) = [[[ 1.,  5.],
                     [ 3.,  7.]],
--
                    [[ 2.,  6.],
                     [ 4.,  8.]]]
--
    transpose(x, axes=(1,0,2)) = [[[ 1.,  2.],
                                   [ 5.,  6.]],
--
                                  [[ 3.,  4.],
                                   [ 7.,  8.]]]
  )code" ADD_FILELINE)
@@@ -412,40 -395,13 +357,36 @@@
  .add_arguments(TransposeParam::__FIELDS__());
  
  
 +#if MXNET_USE_MKLDNN == 100
 +static void ExpandDimEx(const nnvm::NodeAttrs& attrs,
 +                        const OpContext& ctx,
 +                        const std::vector<NDArray>& inputs,
 +                        const std::vector<OpReqType>& req,
 +                        const std::vector<NDArray>& outputs) {
 +  CHECK_EQ(inputs.size(), 1U);
 +  CHECK_EQ(outputs.size(), 1U);
-   auto data_ndim = inputs[0].shape().ndim();
-   if (data_ndim <= 3 && inputs[0].dtype() == mshadow::kFloat32) {
-     MKLDNNExpandDimsForward(attrs, ctx, inputs[0], req[0], outputs[0]);
-     return;
-   }
-   FallBackCompute(UnaryOp::IdentityCompute<cpu>, attrs, ctx, inputs, req, outputs);
++  // If inputs are supposed to be in MKLDNN format and
++  // MKLDNN support the data type or the shape. Then convert
++  // it to the output format and shape
++  MKLDNNReshapeForward(attrs, ctx, inputs[0], req[0], outputs[0]);
 +}
 +
 +inline static bool ExpandDimStorageType(const nnvm::NodeAttrs& attrs,
 +                                        const int dev_mask,
 +                                        DispatchMode* dispatch_mode,
 +                                        std::vector<int>* in_attrs,
 +                                        std::vector<int>* out_attrs) {
 +  CHECK_EQ(in_attrs->size(), 1U);
 +  CHECK_EQ(out_attrs->size(), 1U);
 +  return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs);
 +}
 +#endif
 +
  NNVM_REGISTER_OP(expand_dims)
  .add_alias("_npi_expand_dims")
  .describe(R"code(Inserts a new axis of size 1 into the array shape
--
  For example, given ``x`` with shape ``(2,3,4)``, then ``expand_dims(x, axis=1)``
  will return a new array with shape ``(2,1,3,4)``.
--
  )code" ADD_FILELINE)
  .set_num_inputs(1)
  .set_num_outputs(1)
@@@ -465,13 -418,6 +403,14 @@@
    })
  .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_backward_reshape"})
  .set_attr<FCompute>("FCompute<cpu>", UnaryOp::IdentityCompute<cpu>)
 +#if MXNET_USE_MKLDNN == 100
- .set_attr<FComputeEx>("FComputeEx<cpu>", ExpandDimEx)
 +.set_attr<bool>("TIsMKLDNN", true)
++.set_attr<FComputeEx>("FComputeEx<cpu>", ExpandDimEx)
++.set_attr<FInferStorageType>("FInferStorageType", ExpandDimStorageType)
 +.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
 +  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
 +})
 +#endif
  .add_argument("data", "NDArray-or-Symbol", "Source input")
  .add_arguments(ExpandDimParam::__FIELDS__());
  
@@@ -503,44 -449,44 +442,33 @@@ NNVM_REGISTER_OP(slice
  MXNET_ADD_SPARSE_OP_ALIAS(slice)
  .add_alias("crop")
  .describe(R"code(Slices a region of the array.
--
  .. note:: ``crop`` is deprecated. Use ``slice`` instead.
--
  This function returns a sliced array between the indices given
  by `begin` and `end` with the corresponding `step`.
--
  For an input array of ``shape=(d_0, d_1, ..., d_n-1)``,
  slice operation with ``begin=(b_0, b_1...b_m-1)``,
  ``end=(e_0, e_1, ..., e_m-1)``, and ``step=(s_0, s_1, ..., s_m-1)``,
  where m <= n, results in an array with the shape
  ``(|e_0-b_0|/|s_0|, ..., |e_m-1-b_m-1|/|s_m-1|, d_m, ..., d_n-1)``.
--
  The resulting array's *k*-th dimension contains elements
  from the *k*-th dimension of the input array starting
  from index ``b_k`` (inclusive) with step ``s_k``
  until reaching ``e_k`` (exclusive).
--
  If the *k*-th elements are `None` in the sequence of `begin`, `end`,
  and `step`, the following rule will be used to set default values.
  If `s_k` is `None`, set `s_k=1`. If `s_k > 0`, set `b_k=0`, `e_k=d_k`;
  else, set `b_k=d_k-1`, `e_k=-1`.
--
  The storage type of ``slice`` output depends on storage types of inputs
--
  - slice(csr) = csr
  - otherwise, ``slice`` generates output with default storage
--
  .. note:: When input data storage type is csr, it only supports
     step=(), or step=(None,), or step=(1,) to generate a csr output.
     For other step parameter values, it falls back to slicing
     a dense tensor.
--
  Example::
--
    x = [[  1.,   2.,   3.,   4.],
         [  5.,   6.,   7.,   8.],
         [  9.,  10.,  11.,  12.]]
--
    slice(x, begin=(0,1), end=(2,4)) = [[ 2.,  3.,  4.],
                                       [ 6.,  7.,  8.]]
    slice(x, begin=(None, 0), end=(None, 3), step=(-1, 2)) = [[9., 11.],
@@@ -620,23 -566,23 +548,17 @@@ NNVM_REGISTER_OP(_slice_assign_scalar
  
  NNVM_REGISTER_OP(slice_axis)
  .describe(R"code(Slices along a given axis.
--
  Returns an array slice along a given `axis` starting from the `begin` index
  to the `end` index.
--
  Examples::
--
    x = [[  1.,   2.,   3.,   4.],
         [  5.,   6.,   7.,   8.],
         [  9.,  10.,  11.,  12.]]
--
    slice_axis(x, axis=0, begin=1, end=3) = [[  5.,   6.,   7.,   8.],
                                             [  9.,  10.,  11.,  12.]]
--
    slice_axis(x, axis=1, begin=0, end=2) = [[  1.,   2.],
                                             [  5.,   6.],
                                             [  9.,  10.]]
--
    slice_axis(x, axis=1, begin=-3, end=-1) = [[  2.,   3.],
                                               [  6.,   7.],
                                               [ 10.,  11.]]
@@@ -660,46 -606,46 +582,31 @@@ NNVM_REGISTER_OP(_backward_slice_axis
  
  NNVM_REGISTER_OP(slice_like)
  .describe(R"code(Slices a region of the array like the shape of another array.
--
  This function is similar to ``slice``, however, the `begin` are always `0`s
  and `end` of specific axes are inferred from the second input `shape_like`.
--
  Given the second `shape_like` input of ``shape=(d_0, d_1, ..., d_n-1)``,
  a ``slice_like`` operator with default empty `axes`, it performs the
  following operation:
--
  `` out = slice(input, begin=(0, 0, ..., 0), end=(d_0, d_1, ..., d_n-1))``.
--
  When `axes` is not empty, it is used to speficy which axes are being sliced.
--
  Given a 4-d input data, ``slice_like`` operator with ``axes=(0, 2, -1)``
  will perform the following operation:
--
  `` out = slice(input, begin=(0, 0, 0, 0), end=(d_0, None, d_2, d_3))``.
--
  Note that it is allowed to have first and second input with different dimensions,
  however, you have to make sure the `axes` are specified and not exceeding the
  dimension limits.
--
  For example, given `input_1` with ``shape=(2,3,4,5)`` and `input_2` with
  ``shape=(1,2,3)``, it is not allowed to use:
--
  `` out = slice_like(a, b)`` because ndim of `input_1` is 4, and ndim of `input_2`
  is 3.
--
  The following is allowed in this situation:
--
  `` out = slice_like(a, b, axes=(0, 2))``
--
  Example::
--
    x = [[  1.,   2.,   3.,   4.],
         [  5.,   6.,   7.,   8.],
         [  9.,  10.,  11.,  12.]]
--
    y = [[  0.,   0.,   0.],
         [  0.,   0.,   0.]]
--
    slice_like(x, y) = [[ 1.,  2.,  3.]
                        [ 5.,  6.,  7.]]
    slice_like(x, y, axes=(0, 1)) = [[ 1.,  2.,  3.]
@@@ -745,23 -691,23 +652,15 @@@ NNVM_REGISTER_OP(clip
  MXNET_ADD_SPARSE_OP_ALIAS(clip)
  .add_alias("_npi_clip")
  .describe(R"code(Clips (limits) the values in an array.
--
  Given an interval, values outside the interval are clipped to the interval edges.
  Clipping ``x`` between `a_min` and `a_max` would be::
--
  .. math::
--
     clip(x, a_min, a_max) = \max(\min(x, a_max), a_min))
--
  Example::
--
      x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
--
      clip(x,1,8) = [ 1.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  8.]
--
  The storage type of ``clip`` output depends on storage types of inputs and the a_min, a_max \
  parameter values:
--
     - clip(default) = default
     - clip(row_sparse, a_min <= 0, a_max >= 0) = row_sparse
     - clip(csr, a_min <= 0, a_max >= 0) = csr
@@@ -769,7 -715,7 +668,6 @@@
     - clip(row_sparse, a_min > 0, a_max > 0) = default
     - clip(csr, a_min < 0, a_max < 0) = csr
     - clip(csr, a_min > 0, a_max > 0) = csr
--
  )code" ADD_FILELINE)
  .set_num_inputs(1)
  .set_num_outputs(1)
@@@ -823,28 -769,28 +721,20 @@@ NNVM_REGISTER_OP(_backward_clip
  NNVM_REGISTER_OP(repeat)
  .add_alias("_np_repeat")
  .describe(R"code(Repeats elements of an array.
--
  By default, ``repeat`` flattens the input array into 1-D and then repeats the
  elements::
--
    x = [[ 1, 2],
         [ 3, 4]]
--
    repeat(x, repeats=2) = [ 1.,  1.,  2.,  2.,  3.,  3.,  4.,  4.]
--
  The parameter ``axis`` specifies the axis along which to perform repeat::
--
    repeat(x, repeats=2, axis=1) = [[ 1.,  1.,  2.,  2.],
                                    [ 3.,  3.,  4.,  4.]]
--
    repeat(x, repeats=2, axis=0) = [[ 1.,  2.],
                                    [ 1.,  2.],
                                    [ 3.,  4.],
                                    [ 3.,  4.]]
--
    repeat(x, repeats=2, axis=-1) = [[ 1.,  1.,  2.,  2.],
                                     [ 3.,  3.,  4.,  4.]]
--
  )code" ADD_FILELINE)
  .set_num_outputs(1)
  .set_num_inputs(1)
@@@ -874,35 -820,35 +764,25 @@@ NNVM_REGISTER_OP(_backward_repeat
  NNVM_REGISTER_OP(tile)
  .add_alias("_npi_tile")
  .describe(R"code(Repeats the whole array multiple times.
--
  If ``reps`` has length *d*, and input array has dimension of *n*. There are
  three cases:
--
  - **n=d**. Repeat *i*-th dimension of the input by ``reps[i]`` times::
--
      x = [[1, 2],
           [3, 4]]
--
      tile(x, reps=(2,3)) = [[ 1.,  2.,  1.,  2.,  1.,  2.],
                             [ 3.,  4.,  3.,  4.,  3.,  4.],
                             [ 1.,  2.,  1.,  2.,  1.,  2.],
                             [ 3.,  4.,  3.,  4.,  3.,  4.]]
--
  - **n>d**. ``reps`` is promoted to length *n* by pre-pending 1's to it. Thus for
    an input shape ``(2,3)``, ``repos=(2,)`` is treated as ``(1,2)``::
--
--
      tile(x, reps=(2,)) = [[ 1.,  2.,  1.,  2.],
                            [ 3.,  4.,  3.,  4.]]
--
  - **n<d**. The input is promoted to be d-dimensional by prepending new axes. So a
    shape ``(2,2)`` array is promoted to ``(1,2,2)`` for 3-D replication::
--
      tile(x, reps=(2,2,3)) = [[[ 1.,  2.,  1.,  2.,  1.,  2.],
                                [ 3.,  4.,  3.,  4.,  3.,  4.],
                                [ 1.,  2.,  1.,  2.,  1.,  2.],
                                [ 3.,  4.,  3.,  4.,  3.,  4.]],
--
                               [[ 1.,  2.,  1.,  2.,  1.,  2.],
                                [ 3.,  4.,  3.,  4.,  3.,  4.],
                                [ 1.,  2.,  1.,  2.,  1.,  2.],
@@@ -935,17 -881,17 +815,12 @@@ NNVM_REGISTER_OP(_backward_tile
  
  NNVM_REGISTER_OP(reverse)
  .describe(R"code(Reverses the order of elements along given axis while preserving array shape.
--
  Note: reverse and flip are equivalent. We use reverse in the following examples.
--
  Examples::
--
    x = [[ 0.,  1.,  2.,  3.,  4.],
         [ 5.,  6.,  7.,  8.,  9.]]
--
    reverse(x, axis=0) = [[ 5.,  6.,  7.,  8.,  9.],
                          [ 0.,  1.,  2.,  3.,  4.]]
--
    reverse(x, axis=1) = [[ 4.,  3.,  2.,  1.,  0.],
                          [ 9.,  8.,  7.,  6.,  5.]]
  )code" ADD_FILELINE)
@@@ -981,16 -927,16 +856,12 @@@ NNVM_REGISTER_OP(_backward_reverse
  
  NNVM_REGISTER_OP(stack)
  .describe(R"code(Join a sequence of arrays along a new axis.
--
  The axis parameter specifies the index of the new axis in the dimensions of the
  result. For example, if axis=0 it will be the first dimension and if axis=-1 it
  will be the last dimension.
--
  Examples::
--
    x = [1, 2]
    y = [3, 4]
--
    stack(x, y) = [[1, 2],
                   [3, 4]]
    stack(x, y, axis=1) = [[1, 3],
@@@ -1033,15 -979,15 +904,12 @@@ NNVM_REGISTER_OP(squeeze
  .describe(R"code(Remove single-dimensional entries from the shape of an array.
  Same behavior of defining the output tensor shape as numpy.squeeze for the most of cases.
  See the following note for exception.
--
  Examples::
--
    data = [[[0], [1], [2]]]
    squeeze(data) = [0, 1, 2]
    squeeze(data, axis=0) = [[0], [1], [2]]
    squeeze(data, axis=2) = [[0, 1, 2]]
    squeeze(data, axis=(0, 2)) = [0, 1, 2]
--
  .. Note::
    The output of this operator will keep at least one dimension not removed. For example,
    squeeze([[[4]]]) = [4], while in numpy.squeeze, the output will become a scalar.
@@@ -1071,22 -1017,22 +939,17 @@@ NNVM_REGISTER_OP(depth_to_space
  .describe(R"code(Rearranges(permutes) data from depth into blocks of spatial data.
  Similar to ONNX DepthToSpace operator:
  https://github.com/onnx/onnx/blob/master/docs/Operators.md#DepthToSpace.
- The output is a new tensor where the values from depth dimension are moved in spatial blocks 
+ The output is a new tensor where the values from depth dimension are moved in spatial blocks
  to height and width dimension. The reverse of this operation is ``space_to_depth``.
--
  .. math::
--
      \begin{gather*}
      x \prime = reshape(x, [N, block\_size, block\_size, C / (block\_size ^ 2), H * block\_size, W * block\_size]) \\
      x \prime \prime = transpose(x \prime, [0, 3, 4, 1, 5, 2]) \\
      y = reshape(x \prime \prime, [N, C / (block\_size ^ 2), H * block\_size, W * block\_size])
      \end{gather*}
--
- where :math:`x` is an input tensor with default layout as :math:`[N, C, H, W]`: [batch, channels, height, width] 
+ where :math:`x` is an input tensor with default layout as :math:`[N, C, H, W]`: [batch, channels, height, width]
  and :math:`y` is the output tensor of layout :math:`[N, C / (block\_size ^ 2), H * block\_size, W * block\_size]`
--
  Example::
--
    x = [[[[0, 1, 2],
           [3, 4, 5]],
          [[6, 7, 8],
@@@ -1095,7 -1041,7 +958,6 @@@
           [15, 16, 17]],
          [[18, 19, 20],
           [21, 22, 23]]]]
--
    depth_to_space(x, 2) = [[[[0, 6, 1, 7, 2, 8],
                              [12, 18, 13, 19, 14, 20],
                              [3, 9, 4, 10, 5, 11],
@@@ -1122,30 -1068,30 +984,22 @@@
  NNVM_REGISTER_OP(space_to_depth)
  .describe(R"code(Rearranges(permutes) blocks of spatial data into depth.
  Similar to ONNX SpaceToDepth operator:
- https://github.com/onnx/onnx/blob/master/docs/Operators.md#SpaceToDepth 
- 
- The output is a new tensor where the values from height and width dimension are 
+ https://github.com/onnx/onnx/blob/master/docs/Operators.md#SpaceToDepth
 -
+ The output is a new tensor where the values from height and width dimension are
  moved to the depth dimension. The reverse of this operation is ``depth_to_space``.
--
  .. math::
--
      \begin{gather*}
      x \prime = reshape(x, [N, C, H / block\_size, block\_size, W / block\_size, block\_size]) \\
      x \prime \prime = transpose(x \prime, [0, 3, 5, 1, 2, 4]) \\
      y = reshape(x \prime \prime, [N, C * (block\_size ^ 2), H / block\_size, W / block\_size])
      \end{gather*}
--
- where :math:`x` is an input tensor with default layout as :math:`[N, C, H, W]`: [batch, channels, height, width] 
+ where :math:`x` is an input tensor with default layout as :math:`[N, C, H, W]`: [batch, channels, height, width]
  and :math:`y` is the output tensor of layout :math:`[N, C * (block\_size ^ 2), H / block\_size, W / block\_size]`
--
  Example::
--
    x = [[[[0, 6, 1, 7, 2, 8],
           [12, 18, 13, 19, 14, 20],
           [3, 9, 4, 10, 5, 11],
           [15, 21, 16, 22, 17, 23]]]]
--
--
    space_to_depth(x, 2) = [[[[0, 1, 2],
                              [3, 4, 5]],
                             [[6, 7, 8],
@@@ -1176,9 -1122,9 +1030,7 @@@
  NNVM_REGISTER_OP(_split_v2)
  .add_alias("_npi_split")
  .describe(R"code(Splits an array along a particular axis into multiple sub-arrays.
--
  Example::
--
     x  = [[[ 1.]
            [ 2.]]
           [[ 3.]
@@@ -1186,61 -1132,61 +1038,44 @@@
           [[ 5.]
            [ 6.]]]
     x.shape = (3, 2, 1)
--
     y = split_v2(x, axis=1, indices_or_sections=2) // a list of 2 arrays with shape (3, 1, 1)
     y = [[[ 1.]]
          [[ 3.]]
          [[ 5.]]]
--
         [[[ 2.]]
          [[ 4.]]
          [[ 6.]]]
--
     y[0].shape = (3, 1, 1)
--
     z = split_v2(x, axis=0, indices_or_sections=3) // a list of 3 arrays with shape (1, 2, 1)
     z = [[[ 1.]
           [ 2.]]]
--
         [[[ 3.]
           [ 4.]]]
--
         [[[ 5.]
           [ 6.]]]
--
     z[0].shape = (1, 2, 1)
--
     w = split_v2(x, axis=0, indices_or_sections=(1,)) // a list of 2 arrays with shape [(1, 2, 1), (2, 2, 1)]
     w = [[[ 1.]
           [ 2.]]]
--
         [[[3.]
           [4.]]
--
          [[5.]
           [6.]]]
--
    w[0].shape = (1, 2, 1)
    w[1].shape = (2, 2, 1)
--
  `squeeze_axis=True` removes the axis with length 1 from the shapes of the output arrays.
  **Note** that setting `squeeze_axis` to ``1`` removes axis with length 1 only
  along the `axis` which it is split.
  Also `squeeze_axis` can be set to true only if ``input.shape[axis] == indices_or_sections``.
--
  Example::
--
     z = split_v2(x, axis=0, indices_or_sections=3, squeeze_axis=1) // a list of 3 arrays with shape (2, 1)
     z = [[ 1.]
          [ 2.]]
--
         [[ 3.]
          [ 4.]]
--
         [[ 5.]
          [ 6.]]
     z[0].shape = (2, 1)
--
  )code" ADD_FILELINE)
  .set_attr_parser(ParamParser<SplitParam>)
  .set_num_inputs(1)