Posted to commits@mxnet.apache.org by zh...@apache.org on 2021/07/16 14:29:12 UTC
[incubator-mxnet] branch v1.x updated: Auto-formatter to keep the same coding style (#20356)
This is an automated email from the ASF dual-hosted git repository.
zhasheng pushed a commit to branch v1.x
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/v1.x by this push:
new 0ae1f0c Auto-formatter to keep the same coding style (#20356)
0ae1f0c is described below
commit 0ae1f0cc6788841dad85aad48699909353f90100
Author: mozga <ma...@intel.com>
AuthorDate: Fri Jul 16 16:27:12 2021 +0200
Auto-formatter to keep the same coding style (#20356)
* This pull request contains coding-style changes
* Sanity changes
* Sanity changes: NDArray file
* Remove duplicated #defines
* CUDA batch_norm: removed a duplication
* BinPackParameters was added
* Clang-formatter: constructor param in one line (see the sketch after this list)
* Conflict: fix
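
For readers who want a concrete picture of what these clang-format settings do, below is a minimal, self-contained sketch. It is not part of the commit; the Widget class is hypothetical, and the exact option values in the repository's .clang-format are not shown here. It illustrates the layout the diff applies throughout (compare the NDArray constructors further down): with parameter bin-packing disabled (the commit mentions BinPackParameters), each constructor parameter and each member initializer gets its own line, and references are written as "Type& x" rather than "Type &x".

// Hypothetical example, BEFORE this change: parameters bin-packed across
// lines, several member initializers per line, "Type &x" reference style.
//
// class Widget {
//  public:
//   Widget(const std::string &name, int capacity,
//          bool lazy = false, int flags = 0)
//       : name_(name), capacity_(capacity),
//         lazy_(lazy), flags_(flags) {
//   }
// };

#include <string>

// AFTER this change, the layout clang-format produces with the new settings:
// one parameter per line, one member initializer per line, "Type& x".
class Widget {
 public:
  Widget(const std::string& name,
         int capacity,
         bool lazy = false,
         int flags = 0)
      : name_(name),
        capacity_(capacity),
        lazy_(lazy),
        flags_(flags) {}

 private:
  std::string name_;
  int capacity_;
  bool lazy_;
  int flags_;
};

int main() {
  Widget w("example", 8);  // compiles; the formatting is the point here
  (void)w;
  return 0;
}

The same pattern can be seen in the real diff below, for example in the NDArray(const mxnet::TShape& shape, Context ctx, ...) constructor in include/mxnet/ndarray.h.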
---
include/mxnet/ndarray.h | 668 ++++----
src/ndarray/ndarray.cc | 1715 +++++++++++---------
src/operator/nn/batch_norm-inl.h | 278 ++--
src/operator/nn/batch_norm.cc | 537 +++---
src/operator/nn/batch_norm.cu | 4 -
src/operator/nn/mkldnn/mkldnn_act-inl.h | 57 +-
src/operator/nn/mkldnn/mkldnn_act.cc | 192 ++-
src/operator/nn/mkldnn/mkldnn_base-inl.h | 425 ++---
src/operator/nn/mkldnn/mkldnn_base.cc | 406 ++---
src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h | 310 ++--
src/operator/nn/mkldnn/mkldnn_concat-inl.h | 20 +-
src/operator/nn/mkldnn/mkldnn_concat.cc | 74 +-
src/operator/nn/mkldnn/mkldnn_convolution-inl.h | 124 +-
src/operator/nn/mkldnn/mkldnn_convolution.cc | 415 +++--
src/operator/nn/mkldnn/mkldnn_copy.cc | 23 +-
src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h | 325 ++--
src/operator/nn/mkldnn/mkldnn_deconvolution.cc | 207 +--
.../nn/mkldnn/mkldnn_fully_connected-inl.h | 121 +-
src/operator/nn/mkldnn/mkldnn_fully_connected.cc | 257 +--
src/operator/nn/mkldnn/mkldnn_log_softmax.cc | 157 +-
src/operator/nn/mkldnn/mkldnn_lrn-inl.h | 206 ++-
src/operator/nn/mkldnn/mkldnn_ops-inl.h | 187 ++-
src/operator/nn/mkldnn/mkldnn_pooling-inl.h | 127 +-
src/operator/nn/mkldnn/mkldnn_pooling.cc | 220 +--
src/operator/nn/mkldnn/mkldnn_reshape-inl.h | 18 +-
src/operator/nn/mkldnn/mkldnn_reshape.cc | 78 +-
src/operator/nn/mkldnn/mkldnn_rnn-inl.h | 327 ++--
src/operator/nn/mkldnn/mkldnn_rnn.cc | 844 +++++-----
src/operator/nn/mkldnn/mkldnn_slice-inl.h | 23 +-
src/operator/nn/mkldnn/mkldnn_slice.cc | 50 +-
src/operator/nn/mkldnn/mkldnn_softmax.cc | 133 +-
src/operator/nn/mkldnn/mkldnn_softmax_output.cc | 64 +-
src/operator/nn/mkldnn/mkldnn_sum.cc | 63 +-
src/operator/nn/mkldnn/mkldnn_transpose.cc | 54 +-
src/operator/operator_common.h | 346 ++--
.../quantization/mkldnn/mkldnn_dequantize-inl.h | 57 +-
.../quantization/mkldnn/mkldnn_quantize-inl.h | 35 +-
.../quantization/mkldnn/mkldnn_quantize_v2-inl.h | 74 +-
.../quantization/mkldnn/mkldnn_quantized_act.cc | 9 +-
.../mkldnn/mkldnn_quantized_batch_norm.cc | 100 +-
.../quantization/mkldnn/mkldnn_quantized_concat.cc | 38 +-
.../quantization/mkldnn/mkldnn_quantized_conv.cc | 54 +-
.../mkldnn/mkldnn_quantized_elemwise_add.cc | 153 +-
.../mkldnn/mkldnn_quantized_flatten.cc | 24 +-
.../mkldnn/mkldnn_quantized_fully_connected.cc | 62 +-
.../quantization/mkldnn/mkldnn_quantized_ops-inl.h | 11 +-
.../mkldnn/mkldnn_quantized_pooling.cc | 21 +-
.../quantization/mkldnn/mkldnn_requantize-inl.h | 62 +-
src/operator/subgraph/mkldnn/mkldnn_common.h | 57 +-
src/operator/subgraph/mkldnn/mkldnn_conv-inl.h | 5 +-
src/operator/subgraph/mkldnn/mkldnn_conv.cc | 534 +++---
.../subgraph/mkldnn/mkldnn_conv_property.h | 101 +-
.../mkldnn_elemwisemul_post_quantize_property.h | 80 +-
src/operator/subgraph/mkldnn/mkldnn_fc-inl.h | 21 +-
src/operator/subgraph/mkldnn/mkldnn_fc.cc | 594 +++----
.../mkldnn/mkldnn_fc_post_quantize_property.h | 78 +-
src/operator/subgraph/mkldnn/mkldnn_fc_property.h | 80 +-
src/operator/subgraph/mkldnn/mkldnn_fc_sum_fuse.h | 140 +-
.../mkldnn_post_quantize_align_scale_property.h | 97 +-
.../mkldnn/mkldnn_post_quantize_property.h | 48 +-
.../subgraph/mkldnn/mkldnn_subgraph_property.cc | 23 +-
.../subgraph/mkldnn/mkldnn_transformer-inl.h | 33 +-
src/operator/subgraph/mkldnn/mkldnn_transformer.cc | 715 ++++----
.../mkldnn_transformer_post_quantize_property.h | 81 +-
.../subgraph/mkldnn/mkldnn_transformer_property.h | 59 +-
src/operator/tensor/amp_cast.cc | 278 ++--
src/operator/tensor/cast_storage-inl.h | 222 +--
tests/cpp/include/test_mkldnn.h | 262 +--
tests/cpp/operator/mkldnn_operator_test.cc | 829 +++++-----
69 files changed, 7476 insertions(+), 6616 deletions(-)
diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h
index c55e49e..0febc65 100644
--- a/include/mxnet/ndarray.h
+++ b/include/mxnet/ndarray.h
@@ -26,23 +26,23 @@
#define MXNET_NDARRAY_H_
#include <dmlc/base.h>
-#include <dmlc/logging.h>
#include <dmlc/io.h>
-#include <dmlc/type_traits.h>
+#include <dmlc/logging.h>
#include <dmlc/registry.h>
+#include <dmlc/type_traits.h>
#include <nnvm/node.h>
-#include <vector>
-#include <map>
-#include <string>
+
#include <algorithm>
+#include <map>
#include <memory>
-#include <algorithm>
+#include <string>
+#include <vector>
#if MXNET_USE_MKLDNN == 1
#include <mkldnn.hpp>
#endif
#include "./base.h"
-#include "./storage.h"
#include "./engine.h"
+#include "./storage.h"
// check c++11
#if DMLC_USE_CXX11 == 0
#error "cxx11 was required for ndarray module"
@@ -51,11 +51,11 @@
namespace mxnet {
// enum for storage types
namespace csr {
-enum CSRAuxType {kIndPtr, kIdx};
+enum CSRAuxType { kIndPtr, kIdx };
}
namespace rowsparse {
-enum RowSparseAuxType {kIdx};
+enum RowSparseAuxType { kIdx };
}
enum NDArrayStorageType {
@@ -82,9 +82,7 @@ class MKLDNNMemory;
class NDArray {
public:
/*! \brief default constructor */
- NDArray()
- : entry_(nullptr) {
- }
+ NDArray() : entry_(nullptr) {}
/*!
* \brief constructs a new dynamic NDArray
* \param shape the shape of array
@@ -92,20 +90,25 @@ class NDArray {
* \param delay_alloc whether delay the allocation
* \param dtype data type of this ndarray
*/
- NDArray(const mxnet::TShape &shape, Context ctx,
- bool delay_alloc = false, int dtype = mshadow::default_type_flag)
+ NDArray(const mxnet::TShape& shape,
+ Context ctx,
+ bool delay_alloc = false,
+ int dtype = mshadow::default_type_flag)
: ptr_(std::make_shared<Chunk>(shape, ctx, delay_alloc, dtype)),
shape_(shape),
dtype_(dtype),
storage_type_(kDefaultStorage),
- entry_(nullptr) {
- }
+ entry_(nullptr) {}
/*! \brief constructor for NDArray with storage type
*/
- NDArray(const NDArrayStorageType stype, const mxnet::TShape &shape, Context ctx,
- bool delay_alloc = true, int dtype = mshadow::default_type_flag,
- std::vector<int> aux_types = {}, mxnet::ShapeVector aux_shapes = {},
- mxnet::TShape storage_shape = mxnet::TShape(mshadow::Shape1(0)));
+ NDArray(const NDArrayStorageType stype,
+ const mxnet::TShape& shape,
+ Context ctx,
+ bool delay_alloc = true,
+ int dtype = mshadow::default_type_flag,
+ std::vector<int> aux_types = {},
+ mxnet::ShapeVector aux_shapes = {},
+ mxnet::TShape storage_shape = mxnet::TShape(mshadow::Shape1(0)));
/*!
* \brief constructs a new dynamic NDArray whose shape is unknown,
* hence the NDArray is inherently lazily created
@@ -117,8 +120,7 @@ class NDArray {
shape_(),
dtype_(dtype),
storage_type_(kDefaultStorage),
- entry_(nullptr) {
- }
+ entry_(nullptr) {}
/*!
* \brief constructing a static NDArray that shares data with TBlob
* Use with caution: allocate ONLY ONE NDArray for each TBlob,
@@ -126,31 +128,31 @@ class NDArray {
* \param data the memory content of static data
* \param dev_id the device id this tensor sits at
*/
- NDArray(const TBlob &data, int dev_id)
+ NDArray(const TBlob& data, int dev_id)
: ptr_(std::make_shared<Chunk>(data, dev_id)),
shape_(data.shape_),
dtype_(data.type_flag_),
storage_type_(kDefaultStorage),
- entry_(nullptr) {
- }
+ entry_(nullptr) {}
/*!
- * \brief constructing a static NDArray that shares data with TBlob which is with deleter
- * Use with caution: allocate ONLY ONE NDArray for each TBlob,
+ * \brief constructing a static NDArray that shares data with TBlob which is
+ * with deleter Use with caution: allocate ONLY ONE NDArray for each TBlob,
* make sure the memory region is available through out the life of NDArray
* \param data the memory content of static data
* \param dev_id the device id this tensor sits at
* \param deleter the function pointer of custom deleter
*/
- NDArray(const TBlob &data, int dev_id, const std::function<void()>& deleter)
- : ptr_(new Chunk(data, dev_id), [deleter](Chunk *p) {
- deleter(); // call custom deleter
- delete p; // delete Chunk object
- }),
+ NDArray(const TBlob& data, int dev_id, const std::function<void()>& deleter)
+ : ptr_(new Chunk(data, dev_id),
+ [deleter](Chunk* p) {
+ deleter(); // call custom deleter
+ delete p; // delete Chunk object
+ }),
shape_(data.shape_),
- dtype_(data.type_flag_), storage_type_(kDefaultStorage),
- entry_(nullptr) {
- }
+ dtype_(data.type_flag_),
+ storage_type_(kDefaultStorage),
+ entry_(nullptr) {}
/*! \brief create ndarray from shared memory */
NDArray(int shared_pid, int shared_id, const mxnet::TShape& shape, int dtype)
@@ -158,12 +160,11 @@ class NDArray {
shape_(shape),
dtype_(dtype),
storage_type_(kDefaultStorage),
- entry_(nullptr) {
- }
+ entry_(nullptr) {}
/*!
- * \brief constructing a static NDArray of non-default storage that shares data with TBlob
- * Use with caution: allocate ONLY ONE NDArray for each TBlob,
+ * \brief constructing a static NDArray of non-default storage that shares
+ * data with TBlob Use with caution: allocate ONLY ONE NDArray for each TBlob,
* make sure the memory region is available through out the life of NDArray
* \param stype the storage type of NDArray
* \param shape the shape of NDArray
@@ -171,24 +172,27 @@ class NDArray {
* \param aux_data the memory content of static aux data
* \param dev_id the device id this tensor sits at
*/
- NDArray(const NDArrayStorageType stype, const mxnet::TShape &shape,
- const TBlob &data, const std::vector<TBlob> &aux_data, int dev_id)
+ NDArray(const NDArrayStorageType stype,
+ const mxnet::TShape& shape,
+ const TBlob& data,
+ const std::vector<TBlob>& aux_data,
+ int dev_id)
: ptr_(std::make_shared<Chunk>(stype, data, aux_data, dev_id)),
shape_(shape),
dtype_(data.type_flag_),
storage_type_(stype),
- entry_(nullptr) {
- }
+ entry_(nullptr) {}
/*!
- * \brief initialize the NDArray, assuming it is not assigned a meaningful shape before
- * \param shape the shape of the NDArray
+ * \brief initialize the NDArray, assuming it is not assigned a meaningful
+ * shape before \param shape the shape of the NDArray
*/
- void Init(const mxnet::TShape &shape) {
+ void Init(const mxnet::TShape& shape) {
ptr_->Init(shape, this->dtype_);
this->shape_ = shape;
}
/*!
- * \brief set the correct shape of NDArray directly from the storage_shape of its own chunk.
+ * \brief set the correct shape of NDArray directly from the storage_shape of
+ * its own chunk.
*/
void SetShapeFromChunk();
/*
@@ -210,10 +214,8 @@ class NDArray {
/* \brief Check whether the two arrays are the same array */
inline bool IsSame(const NDArray& other) const {
- return ptr_ == other.ptr_ &&
- shape_ == other.shape_ &&
- byte_offset_ == other.byte_offset_ &&
- dtype_ == other.dtype_;
+ return ptr_ == other.ptr_ && shape_ == other.shape_ && byte_offset_ == other.byte_offset_ &&
+ dtype_ == other.dtype_;
}
/*!
@@ -224,13 +226,13 @@ class NDArray {
}
/*!
* \return the shape of underlying chunk which stores the NDArray data/value.
- * It is only intended for non-default storage. For row-sparse storage, it is the shape of
- * the tensor which stores the non-zero values.
+ * It is only intended for non-default storage. For row-sparse storage, it is
+ * the shape of the tensor which stores the non-zero values.
*/
- inline const mxnet::TShape &storage_shape() const {
+ inline const mxnet::TShape& storage_shape() const {
CHECK(ptr_ != nullptr);
CHECK_NE(storage_type(), kDefaultStorage)
- << "storage_shape() is not intended for kDefaultStorage.";
+ << "storage_shape() is not intended for kDefaultStorage.";
return ptr_->storage_shape;
}
@@ -240,22 +242,20 @@ class NDArray {
* \return the shape of aux data at given index
*/
inline const mxnet::TShape& aux_shape(size_t index) const {
- CHECK_NE(storage_type(), kDefaultStorage)
- << "aux_shape() is not intended for kDefaultStorage.";
+ CHECK_NE(storage_type(), kDefaultStorage) << "aux_shape() is not intended for kDefaultStorage.";
return ptr_->aux_shapes[index];
}
/* \return the shapes of all aux data */
const mxnet::ShapeVector& aux_shapes() const {
CHECK_NE(storage_type(), kDefaultStorage)
- << "aux_shapes() is not intended for kDefaultStorage.";
+ << "aux_shapes() is not intended for kDefaultStorage.";
return ptr_->aux_shapes;
}
/*! returns the dtypes of all aux data */
const std::vector<int>& aux_types() const {
- CHECK_NE(storage_type(), kDefaultStorage)
- << "aux_types() is not intended for kDefaultStorage.";
+ CHECK_NE(storage_type(), kDefaultStorage) << "aux_types() is not intended for kDefaultStorage.";
return ptr_->aux_types;
}
@@ -268,7 +268,7 @@ class NDArray {
*/
inline void set_aux_shape(size_t index, const mxnet::TShape& shape) const {
CHECK_NE(storage_type(), kDefaultStorage)
- << "set_aux_shape() is not intended for kDefaultStorage.";
+ << "set_aux_shape() is not intended for kDefaultStorage.";
ptr_->set_aux_shape(index, shape);
}
@@ -276,7 +276,8 @@ class NDArray {
* \return the data TBlob
*/
inline const TBlob& data() const {
- if (storage_type() == kDefaultStorage) CheckAndAlloc();
+ if (storage_type() == kDefaultStorage)
+ CheckAndAlloc();
SetTBlob();
return tblob_;
}
@@ -292,24 +293,26 @@ class NDArray {
auto stype = storage_type();
TBlob res;
auto shape = aux_shape(i);
- auto type = aux_type(i);
+ auto type = aux_type(i);
MSHADOW_TYPE_SWITCH(type, DType, {
auto dptr = static_cast<DType*>(ptr_->aux_handles[i].dptr);
CHECK(stype == kRowSparseStorage || stype == kCSRStorage)
- << "Unexpected storage type: " << stype;
+ << "Unexpected storage type: " << stype;
res = TBlob(dptr, shape, ptr_->aux_handles[i].ctx.dev_mask(), type);
});
return res;
}
/*!
- * \return the context of NDArray, this function is only valid when the NDArray is not empty
+ * \return the context of NDArray, this function is only valid when the
+ * NDArray is not empty
*/
inline Context ctx() const {
CHECK(!is_none());
return ptr_->shandle.ctx;
}
/*!
- * \return the data type of NDArray, this function is only valid when the NDArray is not empty
+ * \return the data type of NDArray, this function is only valid when the
+ * NDArray is not empty
*/
inline int dtype() const {
return dtype_;
@@ -330,24 +333,25 @@ class NDArray {
bool fresh_out_grad() const;
/*! \return updated grad state in entry_ */
void set_fresh_out_grad(bool state) const;
- /*! \brief Returns true if a sparse ndarray's aux_data and storage are initialized
- * Throws an exception if the indices array shape is inconsistent
+ /*! \brief Returns true if a sparse ndarray's aux_data and storage are
+ * initialized Throws an exception if the indices array shape is inconsistent
* Returns false if the indices array is empty(nnz = 0) for csr/row_sparse
*/
inline bool storage_initialized() const {
- if (is_none()) return false;
+ if (is_none())
+ return false;
auto stype = storage_type();
CHECK_NE(stype, kDefaultStorage)
- << "storage_initialized() is not intended for kDefaultStorage.";
+ << "storage_initialized() is not intended for kDefaultStorage.";
if (stype == kRowSparseStorage) {
CHECK_EQ(aux_shape(rowsparse::kIdx)[0], storage_shape()[0])
- << "inconsistent storage shape " << storage_shape()
- << " vs. aux shape " << aux_shape(rowsparse::kIdx);
+ << "inconsistent storage shape " << storage_shape() << " vs. aux shape "
+ << aux_shape(rowsparse::kIdx);
return aux_shape(rowsparse::kIdx).Size() != 0;
} else if (stype == kCSRStorage) {
CHECK_EQ(aux_shape(csr::kIdx)[0], storage_shape()[0])
- << "inconsistent storage shape " << storage_shape()
- << " vs. aux shape " << aux_shape(csr::kIdx);
+ << "inconsistent storage shape " << storage_shape() << " vs. aux shape "
+ << aux_shape(csr::kIdx);
return aux_shape(csr::kIdx).Size() != 0;
} else {
LOG(FATAL) << "Unknown storage type";
@@ -366,7 +370,8 @@ class NDArray {
* to current NDArray are finished, and read can be performed.
*/
inline void WaitToRead() const {
- if (is_none()) return;
+ if (is_none())
+ return;
Engine::Get()->WaitForVar(ptr_->var);
}
/*!
@@ -374,15 +379,17 @@ class NDArray {
* to current NDArray are finished, and write can be performed.
*/
inline void WaitToWrite() const {
- if (is_none()) return;
+ if (is_none())
+ return;
/*!
* Push an empty mutable function to flush all preceding reads to the
* variable.
*/
Engine::Get()->PushAsync(
- [](RunContext, Engine::CallbackOnComplete on_complete) {
- on_complete();
- }, Context{}, {}, {ptr_->var});
+ [](RunContext, Engine::CallbackOnComplete on_complete) { on_complete(); },
+ Context{},
+ {},
+ {ptr_->var});
Engine::Get()->WaitForVar(ptr_->var);
}
/*! \return the associated variable of the ndarray.*/
@@ -401,81 +408,81 @@ class NDArray {
* \brief save the content into binary stream
* \param strm the output stream
*/
- void Save(dmlc::Stream *strm) const;
+ void Save(dmlc::Stream* strm) const;
/*!
* \brief load ndarrays before supporting sparse ndarrays
* \param strm the output stream
* \param magic the magic number used for version control
*/
- bool LegacyLoad(dmlc::Stream *strm, const uint32_t magic);
+ bool LegacyLoad(dmlc::Stream* strm, const uint32_t magic);
/*!
* \brief load the content from binary stream
* \param strm the output stream
* \return whether the load is successful
*/
- bool Load(dmlc::Stream *strm);
+ bool Load(dmlc::Stream* strm);
/*!
* \brief set all the elements in ndarray to be scalar
* \param scalar the scalar to set
* \return reference of self
*/
- NDArray &operator=(real_t scalar);
+ NDArray& operator=(real_t scalar);
/*!
* \brief elementwise add to current space
* this mutate the current NDArray
* \param src the data to add
* \return reference of self
*/
- NDArray &operator+=(const NDArray &src);
+ NDArray& operator+=(const NDArray& src);
/*!
* \brief elementwise add to current space
* this mutate the current NDArray
* \param src the data to add
* \return reference of self
*/
- NDArray &operator+=(const real_t &src);
+ NDArray& operator+=(const real_t& src);
/*!
* \brief elementwise subtract from current ndarray
* this mutate the current NDArray
* \param src the data to subtract
* \return reference of self
*/
- NDArray &operator-=(const NDArray &src);
+ NDArray& operator-=(const NDArray& src);
/*!
* \brief elementwise subtract from current ndarray
* this mutate the current NDArray
* \param src the data to subtract
* \return reference of self
*/
- NDArray &operator-=(const real_t &src);
+ NDArray& operator-=(const real_t& src);
/*!
* \brief elementwise multiplication to current ndarray
* this mutate the current NDArray
* \param src the data to subtract
* \return reference of self
*/
- NDArray &operator*=(const NDArray &src);
+ NDArray& operator*=(const NDArray& src);
/*!
* \brief elementwise multiplication to current ndarray
* this mutate the current NDArray
* \param src the data to subtract
* \return reference of self
*/
- NDArray &operator*=(const real_t &src);
+ NDArray& operator*=(const real_t& src);
/*!
* \brief elementwise division from current ndarray
* this mutate the current NDArray
* \param src the data to subtract
* \return reference of self
*/
- NDArray &operator/=(const NDArray &src);
+ NDArray& operator/=(const NDArray& src);
/*!
* \brief elementwise division from current ndarray
* this mutate the current NDArray
* \param src the data to subtract
* \return reference of self
*/
- NDArray &operator/=(const real_t &src);
+ NDArray& operator/=(const real_t& src);
/*!
* \brief return a new copy this NDArray
* \param ctx the new context of this NDArray
@@ -492,12 +499,12 @@ class NDArray {
* \param data the data source to copy from.
* \param size the size of the source array, in sizeof(DType) not raw btyes.
*/
- void SyncCopyFromCPU(const void *data, size_t size) const;
+ void SyncCopyFromCPU(const void* data, size_t size) const;
/*!
* \brief Copy from src.data()/aux_data(i) to this->data()/aux_data(j)
*/
- void SyncCopyFromNDArray(const NDArray &src, int i = -1, int j = -1);
+ void SyncCopyFromNDArray(const NDArray& src, int i = -1, int j = -1);
/*!
* \brief Do a synchronize copy to a contiguous CPU memory region.
@@ -507,14 +514,15 @@ class NDArray {
* not wrapped by NDArray(thus dependency not being tracked).
*
* \param data the data source to copyinto.
- * \param size the memory size we want to copy into, in sizeof(DType) not raw btyes.
+ * \param size the memory size we want to copy into, in sizeof(DType) not raw
+ * btyes.
*/
- void SyncCopyToCPU(void *data, size_t size) const;
+ void SyncCopyToCPU(void* data, size_t size) const;
/*!
- * \brief check whether the NDArray format is valid
- * \param full_check if `True`, rigorous check, O(N) operations
- * Otherwise basic check, O(1) operations
- */
+ * \brief check whether the NDArray format is valid
+ * \param full_check if `True`, rigorous check, O(N) operations
+ * Otherwise basic check, O(1) operations
+ */
void SyncCheckFormat(const bool full_check) const;
/*!
* \brief Slice a NDArray
@@ -561,18 +569,16 @@ class NDArray {
* \param dtype The data type.
* \return NDArray in new shape and type.
*/
- inline NDArray AsArray(const mxnet::TShape &shape, int dtype) const {
- CHECK_EQ(storage_type(), kDefaultStorage)
- << "AsArray is intended only for kDefaultStorage.";
- CHECK_GE(ptr_->shandle.size,
- shape.Size() * mshadow::mshadow_sizeof(dtype))
+ inline NDArray AsArray(const mxnet::TShape& shape, int dtype) const {
+ CHECK_EQ(storage_type(), kDefaultStorage) << "AsArray is intended only for kDefaultStorage.";
+ CHECK_GE(ptr_->shandle.size, shape.Size() * mshadow::mshadow_sizeof(dtype))
<< "NDArray.AsArray: target memory size is bigger";
// We can't reuse memory in a view.
CHECK(!IsView());
NDArray ret = *this;
- ret.shape_ = shape;
- ret.dtype_ = dtype;
- ret.reuse_ = true;
+ ret.shape_ = shape;
+ ret.dtype_ = dtype;
+ ret.reuse_ = true;
return ret;
}
@@ -597,13 +603,13 @@ class NDArray {
static NDArray FromDLPack(const DLManagedTensor* tensor, bool transient_handle);
/*!
- * \brief Update ndarray chunk storage handles using existing ndarray storage handles
- * Also update the aux_handle, aux_shapes and aux_types.
- * This is specifically used for custom op to update the inputs and outputs from
- * the temporary ndarray which stores intermediate custom op results.
- * Should be used with caution elsewhere. Supports only CSR and RSP formats.
+ * \brief Update ndarray chunk storage handles using existing ndarray storage
+ * handles Also update the aux_handle, aux_shapes and aux_types. This is
+ * specifically used for custom op to update the inputs and outputs from the
+ * temporary ndarray which stores intermediate custom op results. Should be
+ * used with caution elsewhere. Supports only CSR and RSP formats.
*/
- inline void SparseUpdateChunk(const NDArray &arr) const {
+ inline void SparseUpdateChunk(const NDArray& arr) const {
CHECK(shape_ == arr.shape_) << "ndarray shape is different from the target";
CHECK(dtype_ == arr.dtype_) << "ndarray dtype is different from the target";
auto stype = arr.storage_type();
@@ -611,24 +617,24 @@ class NDArray {
<< "Only to be used with CSR and RSP storage types";
// swap shandles between src and dst
Storage::Handle shandle_dst = arr.ptr_->shandle;
- arr.ptr_->shandle = ptr_->shandle;
- ptr_->shandle = shandle_dst;
+ arr.ptr_->shandle = ptr_->shandle;
+ ptr_->shandle = shandle_dst;
ptr_->storage_shape = arr.ptr_->storage_shape;
- ptr_->storage_type = arr.ptr_->storage_type;
- ptr_->ctx = arr.ptr_->ctx;
+ ptr_->storage_type = arr.ptr_->storage_type;
+ ptr_->ctx = arr.ptr_->ctx;
// swap aux_handles between src and dst
size_t aux_idx = 0;
CHECK(ptr_->aux_handles.size() == arr.ptr_->aux_handles.size())
<< "ndarray number of aux_handles is different from target";
- for (auto &aux_handle : arr.ptr_->aux_handles) {
- Storage::Handle aux_dst = ptr_->aux_handles[aux_idx];
+ for (auto& aux_handle : arr.ptr_->aux_handles) {
+ Storage::Handle aux_dst = ptr_->aux_handles[aux_idx];
ptr_->aux_handles[aux_idx] = aux_handle;
- aux_handle = aux_dst;
+ aux_handle = aux_dst;
aux_idx++;
}
- ptr_->aux_types = arr.ptr_->aux_types;
+ ptr_->aux_types = arr.ptr_->aux_types;
ptr_->aux_shapes = arr.ptr_->aux_shapes;
}
@@ -637,13 +643,13 @@ class NDArray {
* \param shape new shape
* \return NDArray in new shape
*/
- NDArray Reshape(const mxnet::TShape &shape) const;
+ NDArray Reshape(const mxnet::TShape& shape) const;
/*!
* \brief Get an reshaped NDArray. Supports autograd recording
* \param shape new shape
* \return NDArray in new shape
*/
- NDArray ReshapeWithRecord(const mxnet::TShape &shape);
+ NDArray ReshapeWithRecord(const mxnet::TShape& shape);
/*!
* \brief Return a copy of this NDArray without autograd history
*/
@@ -670,7 +676,8 @@ class NDArray {
* storage type and effectively changes the ndarray's shape_.
* Note: This function is named as this to avoid overload conflict
* with CheckAndAlloc(const mxnet::ShapeVector &aux_shapes), since
- * mxnet::TShape tmp = some_shape is equivalent to mxnet::TShape tmp = {some_shape}.
+ * mxnet::TShape tmp = some_shape is equivalent to mxnet::TShape tmp =
+ * {some_shape}.
*/
void ReshapeAndAlloc(const mxnet::TShape& shape) {
CHECK_EQ(storage_type(), kDefaultStorage);
@@ -683,19 +690,19 @@ class NDArray {
* \brief Alloc memory for non-default storage
* aux_shape is only known at run time
*/
- inline void CheckAndAlloc(const mxnet::ShapeVector &aux_shapes) const {
+ inline void CheckAndAlloc(const mxnet::ShapeVector& aux_shapes) const {
CHECK_NE(storage_type(), kDefaultStorage)
- << "CheckAndAlloc(aux_shapes) is not intended for kDefaultStorage";
+ << "CheckAndAlloc(aux_shapes) is not intended for kDefaultStorage";
ptr_->CheckAndAlloc(shape_, aux_shapes, dtype_);
}
- inline void CheckAndAllocData(const mxnet::TShape &storage_shape) const {
+ inline void CheckAndAllocData(const mxnet::TShape& storage_shape) const {
CHECK_NE(storage_type(), kDefaultStorage)
- << "CheckAndAllocData is not intended for kDefaultStorage";
+ << "CheckAndAllocData is not intended for kDefaultStorage";
ptr_->CheckAndAllocData(storage_shape, dtype_);
}
- inline void CheckAndAllocAuxData(size_t i, const mxnet::TShape &aux_shape) const {
+ inline void CheckAndAllocAuxData(size_t i, const mxnet::TShape& aux_shape) const {
CHECK_NE(storage_type(), kDefaultStorage)
- << "CheckAndAllocAuxData is not intended for kDefaultStorage";
+ << "CheckAndAllocAuxData is not intended for kDefaultStorage";
ptr_->CheckAndAllocAuxData(i, aux_shape);
}
@@ -704,12 +711,12 @@ class NDArray {
* Create NDArray from mkldnn memory.
* mkldnn_mem The mkldnn memory to be managed.
*/
- explicit NDArray(const std::shared_ptr<mkldnn::memory> &mkldnn_mem);
+ explicit NDArray(const std::shared_ptr<mkldnn::memory>& mkldnn_mem);
/*
* Create NDArray from mkldnn memory descriptor.
* mem_pd The mkldnn memory descriptor to be created.
*/
- explicit NDArray(const mkldnn::memory::desc &md);
+ explicit NDArray(const mkldnn::memory::desc& md);
/*
* Test if the data is stored in one of special MKLDNN format.
*/
@@ -732,29 +739,29 @@ class NDArray {
/*
* This function returns mkldnn::memory with the default primitive_desc.
*/
- const mkldnn::memory *GetMKLDNNData() const;
+ const mkldnn::memory* GetMKLDNNData() const;
/*
* This function returns mkldnn::memory with the given primitive_desc
- * as long as the array size meets the required size in the given primitive_desc.
+ * as long as the array size meets the required size in the given
+ * primitive_desc.
*/
- const mkldnn::memory *GetMKLDNNData(const mkldnn::memory::desc &md) const;
+ const mkldnn::memory* GetMKLDNNData(const mkldnn::memory::desc& md) const;
/*
* This function returns mkldnn::memory with the given primitive_desc.
* The returned mkldnn::memory will have the same physical layout as
* the given primitive_desc.
*/
- const mkldnn::memory *GetMKLDNNDataReorder(
- const mkldnn::memory::desc &md) const;
+ const mkldnn::memory* GetMKLDNNDataReorder(const mkldnn::memory::desc& md) const;
/*
* This function copies data from mkldnn memory.
*/
- void CopyFrom(const mkldnn::memory &mem);
+ void CopyFrom(const mkldnn::memory& mem);
/*
* This function allocates memory for array and creates mkldnn memory
* with the specified format.
*/
- mkldnn::memory *CreateMKLDNNData(const mkldnn::memory::desc &md);
+ mkldnn::memory* CreateMKLDNNData(const mkldnn::memory::desc& md);
/*
* These are the async version of the methods above.
@@ -762,7 +769,7 @@ class NDArray {
* the array are complete.
*/
void Reorder2DefaultAsync() const;
- void MKLDNNDataReorderAsync(const mkldnn::memory::desc &md) const;
+ void MKLDNNDataReorderAsync(const mkldnn::memory::desc& md) const;
/*
* This creates a new NDArray with the reordered data.
@@ -770,7 +777,7 @@ class NDArray {
*/
NDArray Reorder2Default() const;
- /*
+ /*
* This creates a new NDArray using f32 with the reordered data.
* It doesn't affect the data of the original NDArray.
*/
@@ -788,12 +795,12 @@ class NDArray {
* which can be expensive.
* It's used by FullyConnected right now.
*/
- NDArray MKLDNNDataReshape(const mxnet::TShape &shape) const;
+ NDArray MKLDNNDataReshape(const mxnet::TShape& shape) const;
- /*!
+ /*!
* \ Fix mkldnn memory descriptor mismatch from NDArray.
*/
- void UpdateMKLDNNMemDesc(const mkldnn::memory::desc &desc);
+ void UpdateMKLDNNMemDesc(const mkldnn::memory::desc& desc);
#endif
/*!
@@ -811,15 +818,14 @@ class NDArray {
* \param data the NDArrays to be loaded
* \param keys the name of the NDArray, if saved in the file.
*/
- static void Load(dmlc::Stream* fi,
- std::vector<NDArray>* data,
- std::vector<std::string>* keys);
+ static void Load(dmlc::Stream* fi, std::vector<NDArray>* data, std::vector<std::string>* keys);
private:
friend class Imperative;
/*! \brief the real data chunk that backs NDArray */
// shandle is used to store the actual values in the NDArray
- // aux_handles store the aux data(such as indices) if it's needed by non-default storage.
+ // aux_handles store the aux data(such as indices) if it's needed by
+ // non-default storage.
struct Chunk {
/*! \brief storage handle from storage engine.
for non-default storage, shandle stores the data(value) array.
@@ -844,52 +850,58 @@ class NDArray {
*/
/*! \brief construct from static data */
bool static_data;
- /*! \brief whether data allocation is delayed. This doesn't indicate whether aux data
- allocation is delayed. */
+ /*! \brief whether data allocation is delayed. This doesn't indicate whether
+ aux data allocation is delayed. */
bool delay_alloc;
- // the type of the storage. The storage_type is never kUndefinedStorage once the chunk
- // is constructed.
+ // the type of the storage. The storage_type is never kUndefinedStorage once
+ // the chunk is constructed.
NDArrayStorageType storage_type = kDefaultStorage;
/*! \brief type of aux */
std::vector<int> aux_types;
// context of data
Context ctx;
// The shape of the chunk data.
- // This might not be the same shape as the NDArray, since the storage may be sparse.
- // The default value for storage_shape is {0} when an empty non-default NDArray is created.
+ // This might not be the same shape as the NDArray, since the storage may be
+ // sparse. The default value for storage_shape is {0} when an empty
+ // non-default NDArray is created.
mxnet::TShape storage_shape;
- // The shape of aux data. The default value for the shape depends on the type of storage.
- // If aux_shapes[i].Size() is zero, aux data i is empty.
+ // The shape of aux data. The default value for the shape depends on the
+ // type of storage. If aux_shapes[i].Size() is zero, aux data i is empty.
mxnet::ShapeVector aux_shapes;
/*! \brief Reference to the storage to ensure proper destruct order */
std::shared_ptr<Storage> storage_ref_;
- /*! \brief Reference to the engine to ensure we cleanup without calling a destructed engine */
+ /*! \brief Reference to the engine to ensure we cleanup without calling a
+ * destructed engine */
std::weak_ptr<Engine> engine_ref_;
-
/*! \brief default constructor */
- Chunk() : static_data(true), delay_alloc(false),
- storage_ref_(Storage::_GetSharedRef()),
- engine_ref_(Engine::_GetSharedRef()) {}
+ Chunk()
+ : static_data(true),
+ delay_alloc(false),
+ storage_ref_(Storage::_GetSharedRef()),
+ engine_ref_(Engine::_GetSharedRef()) {}
/*! \brief construct a new chunk */
Chunk(mxnet::TShape shape, Context ctx_, bool delay_alloc_, int dtype)
- : static_data(false), delay_alloc(true), ctx(ctx_),
+ : static_data(false),
+ delay_alloc(true),
+ ctx(ctx_),
storage_ref_(Storage::_GetSharedRef()),
engine_ref_(Engine::_GetSharedRef()) {
storage_shape = shape;
if (shape_is_known(storage_shape)) {
shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype);
}
- var = Engine::Get()->NewVariable();
+ var = Engine::Get()->NewVariable();
shandle.ctx = ctx_;
if (!delay_alloc_) {
this->CheckAndAlloc();
}
}
- Chunk(const TBlob &data, int dev_id)
- : static_data(true), delay_alloc(false),
+ Chunk(const TBlob& data, int dev_id)
+ : static_data(true),
+ delay_alloc(false),
storage_ref_(Storage::_GetSharedRef()),
engine_ref_(Engine::_GetSharedRef()) {
CHECK(storage_type == kDefaultStorage);
@@ -901,35 +913,45 @@ class NDArray {
ctx = Context::GPU(dev_id);
}
// init shandle
- shandle.ctx = ctx;
- shandle.dptr = data.dptr_;
- shandle.size = data.shape_.Size() * mshadow::mshadow_sizeof(data.type_flag_);
+ shandle.ctx = ctx;
+ shandle.dptr = data.dptr_;
+ shandle.size = data.shape_.Size() * mshadow::mshadow_sizeof(data.type_flag_);
storage_shape = data.shape_;
}
Chunk(int shared_pid, int shared_id, const mxnet::TShape& shape, int dtype)
- : static_data(false), delay_alloc(false),
+ : static_data(false),
+ delay_alloc(false),
storage_ref_(Storage::_GetSharedRef()),
engine_ref_(Engine::_GetSharedRef()) {
- var = Engine::Get()->NewVariable();
- ctx = Context::CPUShared(0);
- shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype);
- shandle.ctx = ctx;
+ var = Engine::Get()->NewVariable();
+ ctx = Context::CPUShared(0);
+ shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype);
+ shandle.ctx = ctx;
shandle.shared_pid = shared_pid;
- shandle.shared_id = shared_id;
+ shandle.shared_id = shared_id;
Storage::Get()->Alloc(&shandle);
storage_shape = shape;
}
// Constructor for a non-default storage chunk
- Chunk(NDArrayStorageType storage_type_, const mxnet::TShape &storage_shape_, Context ctx_,
- bool delay_alloc_, int dtype, const std::vector<int> &aux_types_,
- const mxnet::ShapeVector &aux_shapes_)
- : static_data(false), delay_alloc(delay_alloc_), storage_type(storage_type_),
- aux_types(aux_types_), ctx(ctx_), storage_shape(storage_shape_),
- aux_shapes(aux_shapes_), storage_ref_(Storage::_GetSharedRef()),
+ Chunk(NDArrayStorageType storage_type_,
+ const mxnet::TShape& storage_shape_,
+ Context ctx_,
+ bool delay_alloc_,
+ int dtype,
+ const std::vector<int>& aux_types_,
+ const mxnet::ShapeVector& aux_shapes_)
+ : static_data(false),
+ delay_alloc(delay_alloc_),
+ storage_type(storage_type_),
+ aux_types(aux_types_),
+ ctx(ctx_),
+ storage_shape(storage_shape_),
+ aux_shapes(aux_shapes_),
+ storage_ref_(Storage::_GetSharedRef()),
engine_ref_(Engine::_GetSharedRef()) {
shandle.ctx = ctx;
- var = Engine::Get()->NewVariable();
+ var = Engine::Get()->NewVariable();
// aux_handles always reflect the correct number of aux data
for (size_t i = 0; i < aux_shapes.size(); i++) {
CheckAndAllocAuxData(i, aux_shapes[i]);
@@ -942,10 +964,15 @@ class NDArray {
}
}
- Chunk(const NDArrayStorageType storage_type_, const TBlob &data,
- const std::vector<TBlob> &aux_data, int dev_id)
- : static_data(true), delay_alloc(false), storage_type(storage_type_),
- storage_ref_(Storage::_GetSharedRef()), engine_ref_(Engine::_GetSharedRef()) {
+ Chunk(const NDArrayStorageType storage_type_,
+ const TBlob& data,
+ const std::vector<TBlob>& aux_data,
+ int dev_id)
+ : static_data(true),
+ delay_alloc(false),
+ storage_type(storage_type_),
+ storage_ref_(Storage::_GetSharedRef()),
+ engine_ref_(Engine::_GetSharedRef()) {
using namespace mshadow;
CHECK_NE(storage_type, kDefaultStorage);
// init var
@@ -958,14 +985,14 @@ class NDArray {
ctx = Context::GPU(dev_id);
}
// init shandle
- shandle.ctx = ctx;
- shandle.dptr = data.dptr_;
- shandle.size = data.shape_.Size() * mshadow_sizeof(data.type_flag_);
+ shandle.ctx = ctx;
+ shandle.dptr = data.dptr_;
+ shandle.size = data.shape_.Size() * mshadow_sizeof(data.type_flag_);
storage_shape = data.shape_;
// init aux handles
- for (const auto &aux : aux_data) {
+ for (const auto& aux : aux_data) {
Storage::Handle aux_handle;
- aux_handle.ctx = ctx;
+ aux_handle.ctx = ctx;
aux_handle.dptr = aux.dptr_;
aux_handle.size = aux.shape_.Size() * mshadow_sizeof(aux.type_flag_);
aux_handles.push_back(aux_handle);
@@ -974,7 +1001,8 @@ class NDArray {
}
}
- /*! \brief set the shape for ith aux data, and update storage shape if necessary */
+ /*! \brief set the shape for ith aux data, and update storage shape if
+ * necessary */
inline void set_aux_shape(const size_t i, const mxnet::TShape& shape) {
aux_shapes[i] = shape;
if (storage_shape.ndim() >= 0) {
@@ -1019,14 +1047,16 @@ class NDArray {
#endif
}
}
- /*! \brief initialize the shape and dtype, assuming it is not initialized before. */
- void Init(const mxnet::TShape &shape, int dtype) {
- auto size = shape.Size();
+ /*! \brief initialize the shape and dtype, assuming it is not initialized
+ * before. */
+ void Init(const mxnet::TShape& shape, int dtype) {
+ auto size = shape.Size();
storage_shape = shape;
- shandle.size = size * mshadow::mshadow_sizeof(dtype);
+ shandle.size = size * mshadow::mshadow_sizeof(dtype);
this->CheckAndAlloc();
}
- inline void CheckAndAlloc(const mxnet::TShape &shape, const mxnet::ShapeVector &aux_shapes,
+ inline void CheckAndAlloc(const mxnet::TShape& shape,
+ const mxnet::ShapeVector& aux_shapes,
int dtype) {
// calculate size, perform allocation
if (kRowSparseStorage == storage_type) {
@@ -1044,21 +1074,20 @@ class NDArray {
LOG(FATAL) << "Storage type " << storage_type << " not implemented for CheckAndAlloc";
}
}
- // create storage handle for data based on shape and dtype, assuming ctx is set
- // storage shape is also updated
- // if data is already allocated, try reuse the storage. Otherwise, free the current one
- // and allocate new storage
- void CheckAndAllocData(const mxnet::TShape &shape, int dtype);
+ // create storage handle for data based on shape and dtype, assuming ctx is
+ // set storage shape is also updated if data is already allocated, try reuse
+ // the storage. Otherwise, free the current one and allocate new storage
+ void CheckAndAllocData(const mxnet::TShape& shape, int dtype);
#if MXNET_USE_MKLDNN == 1
// Have MKL memory reference to the data in the default storage
// or create memory for MKLDNN.
- void SetMKLMem(const mxnet::TShape &shape, int dtype);
+ void SetMKLMem(const mxnet::TShape& shape, int dtype);
// If the data is stored in MKLDNN layout, we reorder data in mkl_mem_ and
// save the result in shandle.
void Reorder2Default();
// Reroder data to a specified layout.
- void MKLDNNDataReorder(const mkldnn::memory::desc &md);
+ void MKLDNNDataReorder(const mkldnn::memory::desc& md);
bool IsMKLDNN() const;
bool IsDefault() const;
#endif
@@ -1066,14 +1095,14 @@ class NDArray {
// create storage handle for aux data based on shape
// this function assumes ctx, aux shapes and aux types are set
// aux shape is also updated
- // if aux data is already allocated, try reuse the storage. Otherwise, free the current one
- // and allocate new storage
- inline void CheckAndAllocAuxData(size_t i, const mxnet::TShape &shape) {
+ // if aux data is already allocated, try reuse the storage. Otherwise, free
+ // the current one and allocate new storage
+ inline void CheckAndAllocAuxData(size_t i, const mxnet::TShape& shape) {
CHECK_EQ(shape.ndim(), 1) << "shape must be 1D in CheckAndAllocAuxData";
CHECK_NE(storage_type, kUndefinedStorage)
- << "storage type cannot be kUndefinedStorage in CheckAndAllocAuxData";
+ << "storage type cannot be kUndefinedStorage in CheckAndAllocAuxData";
CHECK_NE(storage_type, kDefaultStorage)
- << "storage type cannot be kDefaultStorage in CheckAndAllocAuxData";
+ << "storage type cannot be kDefaultStorage in CheckAndAllocAuxData";
if (aux_handles.size() <= i) {
aux_handles.resize(i + 1);
}
@@ -1133,7 +1162,7 @@ size_t num_aux_data(NDArrayStorageType stype);
* \note The function name explicitly marks the order of from and to
* due to different possible convention carried by copy function.
*/
-void CopyFromTo(const NDArray &from, const NDArray *to, int priority = 0);
+void CopyFromTo(const NDArray& from, const NDArray* to, int priority = 0);
/*!
* \brief issue an copy operation from one NDArray to another
@@ -1143,20 +1172,19 @@ void CopyFromTo(const NDArray &from, const NDArray *to, int priority = 0);
* \param from the ndarray we want to copy data from
* \param to the target ndarray
* \param priority Priority of the action.
- * \param is_opr whether it is invoked by an operator. For example, false if invoked from
- KVStore, true if invoked from `_copyto` operator.
+ * \param is_opr whether it is invoked by an operator. For example, false if
+ invoked from KVStore, true if invoked from `_copyto` operator.
* \note The function name explicitly marks the order of from and to
* due to different possible convention carried by copy function.
*/
-void CopyFromTo(const NDArray &from, const NDArray& to, int priority = 0, bool is_opr = false);
+void CopyFromTo(const NDArray& from, const NDArray& to, int priority = 0, bool is_opr = false);
/*!
- * \brief Perform elementwise sum over each data from source, store result into out.
- * \param source the ndarray we want to sum
- * \param out the target ndarray
+ * \brief Perform elementwise sum over each data from source, store result into
+ * out. \param source the ndarray we want to sum \param out the target ndarray
* \param priority Priority of the action.
*/
-void ElementwiseSum(const std::vector<NDArray> &source, NDArray *out, int priority = 0);
+void ElementwiseSum(const std::vector<NDArray>& source, NDArray* out, int priority = 0);
/*!
* \brief elementwise add
@@ -1164,56 +1192,56 @@ void ElementwiseSum(const std::vector<NDArray> &source, NDArray *out, int priori
* \param rhs right operand
* \return a new result ndarray
*/
-NDArray operator+(const NDArray &lhs, const NDArray &rhs);
+NDArray operator+(const NDArray& lhs, const NDArray& rhs);
/*!
* \brief elementwise add
* \param lhs left operand
* \param rhs right operand
* \return a new result ndarray
*/
-NDArray operator+(const NDArray &lhs, const real_t &rhs);
+NDArray operator+(const NDArray& lhs, const real_t& rhs);
/*!
* \brief elementwise subtraction
* \param lhs left operand
* \param rhs right operand
* \return a new result ndarray
*/
-NDArray operator-(const NDArray &lhs, const NDArray &rhs);
+NDArray operator-(const NDArray& lhs, const NDArray& rhs);
/*!
* \brief elementwise subtraction
* \param lhs left operand
* \param rhs right operand
* \return a new result ndarray
*/
-NDArray operator-(const NDArray &lhs, const real_t &rhs);
+NDArray operator-(const NDArray& lhs, const real_t& rhs);
/*!
* \brief elementwise multiplication
* \param lhs left operand
* \param rhs right operand
* \return a new result ndarray
*/
-NDArray operator*(const NDArray &lhs, const NDArray &rhs); \
+NDArray operator*(const NDArray& lhs, const NDArray& rhs);
/*!
* \brief elementwise multiplication
* \param lhs left operand
* \param rhs right operand
* \return a new result ndarray
*/
-NDArray operator*(const NDArray &lhs, const real_t &rhs);
+NDArray operator*(const NDArray& lhs, const real_t& rhs);
/*!
* \brief elementwise division
* \param lhs left operand
* \param rhs right operand
* \return a new result ndarray
*/
-NDArray operator/(const NDArray &lhs, const NDArray &rhs);
+NDArray operator/(const NDArray& lhs, const NDArray& rhs);
/*!
* \brief elementwise division
* \param lhs left operand
* \param rhs right operand
* \return a new result ndarray
*/
-NDArray operator/(const NDArray &lhs, const real_t &rhs);
+NDArray operator/(const NDArray& lhs, const real_t& rhs);
/*!
* \brief Seed all random number generator in mxnet.
@@ -1231,60 +1259,59 @@ void RandomSeed(Context ctx, uint32_t seed);
* \param end upper bound of distribution.
* \param out output NDArray.
*/
-void SampleUniform(real_t begin, real_t end, NDArray *out);
+void SampleUniform(real_t begin, real_t end, NDArray* out);
/*!
* \brief Sample gaussian distribution for each elements of out.
* \param mu mean of gaussian distribution.
* \param sigma standard deviation of gaussian distribution.
* \param out output NDArray.
*/
-void SampleGaussian(real_t mu, real_t sigma, NDArray *out);
+void SampleGaussian(real_t mu, real_t sigma, NDArray* out);
/*!
* \brief Sample gamma distribution for each elements of out.
* \param alpha parameter (shape) of the gamma distribution
* \param beta parameter (scale) of the gamma distribution
* \param out output NDArray.
*/
-void SampleGamma(real_t alpha, real_t beta, NDArray *out);
+void SampleGamma(real_t alpha, real_t beta, NDArray* out);
/*!
* \brief Sample exponential distribution for each elements of out.
* \param lambda parameter (rate) of the exponential distribution
* \param out output NDArray.
*/
-void SampleExponential(real_t lambda, NDArray *out);
+void SampleExponential(real_t lambda, NDArray* out);
/*!
* \brief Sample Poisson distribution for each elements of out.
* \param lambda parameter (rate) of the Poisson distribution
* \param out output NDArray.
*/
-void SamplePoisson(real_t lambda, NDArray *out);
+void SamplePoisson(real_t lambda, NDArray* out);
/*!
* \brief Sample negative binomial distribution for each elements of out.
* \param k failure limit
* \param p success probability
* \param out output NDArray.
*/
-void SampleNegBinomial(int32_t k, real_t p, NDArray *out);
+void SampleNegBinomial(int32_t k, real_t p, NDArray* out);
/*!
- * \brief Sample generalized negative binomial distribution for each elements of out.
- * \param mu parameter (mean) of the distribution
- * \param alpha parameter (over dispersion) of the distribution
- * \param out output NDArray.
+ * \brief Sample generalized negative binomial distribution for each elements of
+ * out. \param mu parameter (mean) of the distribution \param alpha parameter
+ * (over dispersion) of the distribution \param out output NDArray.
*/
-void SampleGenNegBinomial(real_t mu, real_t alpha, NDArray *out);
-
+void SampleGenNegBinomial(real_t mu, real_t alpha, NDArray* out);
//--------------------------------------------------------------
// The following part are API Registration of NDArray functions.
//--------------------------------------------------------------
/*! \brief definition of NDArray function */
-typedef std::function<void (NDArray **used_vars,
- real_t *scalars,
- NDArray **mutate_vars,
- int num_params,
- char **param_keys,
- char **param_vals)> NDArrayAPIFunction;
+typedef std::function<void(NDArray** used_vars,
+ real_t* scalars,
+ NDArray** mutate_vars,
+ int num_params,
+ char** param_keys,
+ char** param_vals)>
+ NDArrayAPIFunction;
/*! \brief mask information on how functions can be exposed */
enum NDArrayFunctionTypeMask {
/*! \brief all the use_vars should go before scalar */
@@ -1303,8 +1330,7 @@ enum NDArrayFunctionTypeMask {
};
/*! \brief Registry entry for NDArrayFunction */
struct NDArrayFunctionReg
- : public dmlc::FunctionRegEntryBase<NDArrayFunctionReg,
- NDArrayAPIFunction> {
+ : public dmlc::FunctionRegEntryBase<NDArrayFunctionReg, NDArrayAPIFunction> {
/*! \brief number of variable used by this function */
unsigned num_use_vars;
/*! \brief number of variable mutated by this function */
@@ -1316,44 +1342,45 @@ struct NDArrayFunctionReg
/*!
* \brief constructor
*/
- NDArrayFunctionReg()
- : num_use_vars(0),
- num_mutate_vars(0),
- num_scalars(0),
- type_mask(0) {}
+ NDArrayFunctionReg() : num_use_vars(0), num_mutate_vars(0), num_scalars(0), type_mask(0) {}
/*!
* \brief set the function body to a NDArray setvalue function
* this will also auto set the parameters correctly
* \param fsetvalue function body to set
* \return ref to the registered entry, used to set properties
*/
- inline NDArrayFunctionReg &set_function(void (*fsetvalue)(const real_t &rhs,
- NDArray *out)) {
- body = [fsetvalue] (NDArray **used_vars, real_t *s, NDArray **mutate_vars,
- int num_params, char **param_keys, char **param_vals) {
- (*fsetvalue)(s[0], mutate_vars[0]);
- };
- num_mutate_vars = 1; num_scalars = 1;
+ inline NDArrayFunctionReg& set_function(void (*fsetvalue)(const real_t& rhs, NDArray* out)) {
+ body = [fsetvalue](NDArray** used_vars,
+ real_t* s,
+ NDArray** mutate_vars,
+ int num_params,
+ char** param_keys,
+ char** param_vals) { (*fsetvalue)(s[0], mutate_vars[0]); };
+
+ num_mutate_vars = 1;
+ num_scalars = 1;
this->add_argument("src", "real_t", "Source input to the function.");
return *this;
}
/*!
- * \brief set the function body to a ternary NDArray function
- * this will also auto set the parameters correctly
- * \param fternary function body to set
- * \return ref to the registered entry, used to set properties
- */
- inline NDArrayFunctionReg &set_function(void(*fternary)(const NDArray &lhs,
- const NDArray &mhs,
- const NDArray &rhs,
- NDArray *out)) {
- body = [fternary](NDArray **used_vars,
- real_t *s, NDArray **mutate_vars,
- int num_params, char **param_keys, char **param_vals) {
+ * \brief set the function body to a ternary NDArray function
+ * this will also auto set the parameters correctly
+ * \param fternary function body to set
+ * \return ref to the registered entry, used to set properties
+ */
+ inline NDArrayFunctionReg& set_function(
+ void (*fternary)(const NDArray& lhs, const NDArray& mhs, const NDArray& rhs, NDArray* out)) {
+ body = [fternary](NDArray** used_vars,
+ real_t* s,
+ NDArray** mutate_vars,
+ int num_params,
+ char** param_keys,
+ char** param_vals) {
(*fternary)(*used_vars[0], *used_vars[1], *used_vars[2], mutate_vars[0]);
};
- num_use_vars = 3; num_mutate_vars = 1;
- type_mask = kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget;
+ num_use_vars = 3;
+ num_mutate_vars = 1;
+ type_mask = kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget;
this->add_argument("lhs", "NDArray", "Left operand to the function.");
this->add_argument("mhs", "NDArray", "Middle operand to the function.");
this->add_argument("rhs", "NDArray", "Right operand to the function.");
@@ -1365,15 +1392,20 @@ struct NDArrayFunctionReg
* \param fbinary function body to set
* \return ref to the registered entry, used to set properties
*/
- inline NDArrayFunctionReg &set_function(void (*fbinary)(const NDArray &lhs,
- const NDArray &rhs,
- NDArray *out)) {
- body = [fbinary] (NDArray **used_vars, real_t *s, NDArray **mutate_vars,
- int num_params, char **param_keys, char **param_vals) {
+ inline NDArrayFunctionReg& set_function(void (*fbinary)(const NDArray& lhs,
+ const NDArray& rhs,
+ NDArray* out)) {
+ body = [fbinary](NDArray** used_vars,
+ real_t* s,
+ NDArray** mutate_vars,
+ int num_params,
+ char** param_keys,
+ char** param_vals) {
(*fbinary)(*used_vars[0], *used_vars[1], mutate_vars[0]);
};
- num_use_vars = 2; num_mutate_vars = 1;
- type_mask = kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget;
+ num_use_vars = 2;
+ num_mutate_vars = 1;
+ type_mask = kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget;
this->add_argument("lhs", "NDArray", "Left operand to the function.");
this->add_argument("rhs", "NDArray", "Right operand to the function.");
return *this;
@@ -1384,15 +1416,20 @@ struct NDArrayFunctionReg
* \param fscalar function body to set
* \return ref to the registered entry, used to set properties
*/
- inline NDArrayFunctionReg &set_function(void (*fscalar)(const NDArray &lhs,
- const real_t &rhs,
- NDArray *out)) {
- body = [fscalar] (NDArray **used_vars, real_t *s, NDArray **mutate_vars,
- int num_params, char **param_keys, char **param_vals) {
- (*fscalar)(*used_vars[0], s[0], mutate_vars[0]);
- };
- num_use_vars = 1; num_mutate_vars = 1; num_scalars = 1;
- type_mask = kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget;
+ inline NDArrayFunctionReg& set_function(void (*fscalar)(const NDArray& lhs,
+ const real_t& rhs,
+ NDArray* out)) {
+ body = [fscalar](NDArray** used_vars,
+ real_t* s,
+ NDArray** mutate_vars,
+ int num_params,
+ char** param_keys,
+ char** param_vals) { (*fscalar)(*used_vars[0], s[0], mutate_vars[0]); };
+
+ num_use_vars = 1;
+ num_mutate_vars = 1;
+ num_scalars = 1;
+ type_mask = kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget;
this->add_argument("lhs", "NDArray", "Left operand to the function.");
this->add_argument("rhs", "real_t", "Right operand to the function.");
return *this;
@@ -1403,14 +1440,17 @@ struct NDArrayFunctionReg
* \param funary function body to set
* \return ref to the registered entry, used to set properties
*/
- inline NDArrayFunctionReg &set_function(void (*funary)(const NDArray &src,
- NDArray *out)) {
- body = [funary] (NDArray **used_vars, real_t *s, NDArray **mutate_vars,
- int num_params, char **param_keys, char **param_vals) {
- (*funary)(*used_vars[0], mutate_vars[0]);
- };
- num_use_vars = 1; num_mutate_vars = 1;
- type_mask = kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget;
+ inline NDArrayFunctionReg& set_function(void (*funary)(const NDArray& src, NDArray* out)) {
+ body = [funary](NDArray** used_vars,
+ real_t* s,
+ NDArray** mutate_vars,
+ int num_params,
+ char** param_keys,
+ char** param_vals) { (*funary)(*used_vars[0], mutate_vars[0]); };
+
+ num_use_vars = 1;
+ num_mutate_vars = 1;
+ type_mask = kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget;
this->add_argument("src", "NDArray", "Source input to the function.");
return *this;
}
@@ -1420,13 +1460,17 @@ struct NDArrayFunctionReg
* \param fgeneric function body to set
* \return ref to the registered entry, used to set properties
*/
- inline NDArrayFunctionReg &set_function(
- void (*fgeneric)(NDArray **used_vars,
- real_t *s,
- NDArray **mutate_vars,
- const std::map<std::string, std::string>& param)) {
- body = [fgeneric] (NDArray **used_vars, real_t *s, NDArray **mutate_vars,
- int num_params, char **param_keys, char **param_vals) {
+ inline NDArrayFunctionReg& set_function(
+ void (*fgeneric)(NDArray** used_vars,
+ real_t* s,
+ NDArray** mutate_vars,
+ const std::map<std::string, std::string>& param)) {
+ body = [fgeneric](NDArray** used_vars,
+ real_t* s,
+ NDArray** mutate_vars,
+ int num_params,
+ char** param_keys,
+ char** param_vals) {
std::map<std::string, std::string> param;
for (int i = 0; i < num_params; ++i) {
param[param_keys[i]] = param_vals[i];
@@ -1440,32 +1484,36 @@ struct NDArrayFunctionReg
* \param n number of mutate variablesx
* \return ref to the registered entry, used to set properties
*/
- inline NDArrayFunctionReg &set_num_use_vars(unsigned n) {
- num_use_vars = n; return *this;
+ inline NDArrayFunctionReg& set_num_use_vars(unsigned n) {
+ num_use_vars = n;
+ return *this;
}
/*!
* \brief set the number of mutate variables
* \param n number of mutate variablesx
* \return ref to the registered entry, used to set properties
*/
- inline NDArrayFunctionReg &set_num_mutate_vars(unsigned n) {
- num_mutate_vars = n; return *this;
+ inline NDArrayFunctionReg& set_num_mutate_vars(unsigned n) {
+ num_mutate_vars = n;
+ return *this;
}
/*!
* \brief set the number of scalar arguments
* \param n number of scalar arguments
* \return ref to the registered entry, used to set properties
*/
- inline NDArrayFunctionReg &set_num_scalars(unsigned n) {
- num_scalars = n; return *this;
+ inline NDArrayFunctionReg& set_num_scalars(unsigned n) {
+ num_scalars = n;
+ return *this;
}
/*!
* \brief set type mask
* \param tmask typemask
* \return ref to the registered entry, used to set properties
*/
- inline NDArrayFunctionReg &set_type_mask(int tmask) {
- type_mask = tmask; return *this;
+ inline NDArrayFunctionReg& set_type_mask(int tmask) {
+ type_mask = tmask;
+ return *this;
}
}; // NDArrayFunctionReg
@@ -1480,7 +1528,7 @@ struct NDArrayFunctionReg
*
* \endcode
*/
-#define MXNET_REGISTER_NDARRAY_FUN(name) \
+#define MXNET_REGISTER_NDARRAY_FUN(name) \
DMLC_REGISTRY_REGISTER(::mxnet::NDArrayFunctionReg, NDArrayFunctionReg, name)
} // namespace mxnet
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index d16b38e..c7188e3 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -23,19 +23,20 @@
 * \brief ndarray module of mxnet
*/
#include <dmlc/io.h>
-#include <dmlc/memory_io.h>
#include <dmlc/logging.h>
+#include <dmlc/memory_io.h>
#include <dmlc/registry.h>
+#include <mshadow/tensor.h>
#include <mxnet/base.h>
+#include <mxnet/imperative.h>
#include <mxnet/ndarray.h>
#include <mxnet/resource.h>
-#include <mxnet/imperative.h>
-#include <mshadow/tensor.h>
-#include "./ndarray_function.h"
+
#include "../common/utils.h"
-#include "../operator/tensor/matrix_op-inl.h"
-#include "../operator/tensor/init_op.h"
#include "../operator/nn/mkldnn/mkldnn_base-inl.h"
+#include "../operator/tensor/init_op.h"
+#include "../operator/tensor/matrix_op-inl.h"
+#include "./ndarray_function.h"
#if MXNET_USE_OPENCV
#include <opencv2/opencv.hpp>
@@ -47,13 +48,17 @@ DMLC_REGISTRY_ENABLE(::mxnet::NDArrayFunctionReg);
namespace mxnet {
-NDArray::NDArray(const NDArrayStorageType stype, const mxnet::TShape &shape, Context ctx,
- bool delay_alloc, int dtype, std::vector<int> aux_types,
- mxnet::ShapeVector aux_shapes, mxnet::TShape storage_shape) : shape_(shape),
- dtype_(dtype), storage_type_(stype), entry_(nullptr) {
+NDArray::NDArray(const NDArrayStorageType stype,
+ const mxnet::TShape& shape,
+ Context ctx,
+ bool delay_alloc,
+ int dtype,
+ std::vector<int> aux_types,
+ mxnet::ShapeVector aux_shapes,
+ mxnet::TShape storage_shape)
+ : shape_(shape), dtype_(dtype), storage_type_(stype), entry_(nullptr) {
// Assign default aux types if not given
- if (aux_types.size() == 0
- && stype != kDefaultStorage) {
+ if (aux_types.size() == 0 && stype != kDefaultStorage) {
if (stype == kRowSparseStorage) {
aux_types = {mshadow::kInt64};
} else if (stype == kCSRStorage) {
@@ -64,8 +69,7 @@ NDArray::NDArray(const NDArrayStorageType stype, const mxnet::TShape &shape, Con
}
// Assign default shapes if not given
// unknown shapes are initialized as {0} such that Size() would return 0
- if (aux_shapes.size() == 0
- && stype != kDefaultStorage) {
+ if (aux_shapes.size() == 0 && stype != kDefaultStorage) {
if (stype == kRowSparseStorage) {
aux_shapes = {mxnet::TShape(mshadow::Shape1(0))};
} else if (stype == kCSRStorage) {
@@ -75,10 +79,9 @@ NDArray::NDArray(const NDArrayStorageType stype, const mxnet::TShape &shape, Con
LOG(FATAL) << "Unknown storage type " << stype;
}
}
- if (storage_shape.Size() == 0
- && stype != kDefaultStorage) {
+ if (storage_shape.Size() == 0 && stype != kDefaultStorage) {
if (stype == kRowSparseStorage) {
- storage_shape = shape;
+ storage_shape = shape;
storage_shape[0] = aux_shapes[rowsparse::kIdx][0];
} else if (stype == kCSRStorage) {
storage_shape = aux_shapes[csr::kIdx];
@@ -89,8 +92,8 @@ NDArray::NDArray(const NDArrayStorageType stype, const mxnet::TShape &shape, Con
if (stype == kDefaultStorage)
ptr_ = std::make_shared<Chunk>(shape, ctx, delay_alloc, dtype);
else
- ptr_ = std::make_shared<Chunk>(stype, storage_shape, ctx, delay_alloc,
- dtype, aux_types, aux_shapes);
+ ptr_ = std::make_shared<Chunk>(
+ stype, storage_shape, ctx, delay_alloc, dtype, aux_types, aux_shapes);
}
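A sketch of what the default-filling branches in this constructor give you when the aux vectors are left empty; every value below is illustrative:

  // Row-sparse array: empty aux_types/aux_shapes fall back to
  // {mshadow::kInt64} indices and a zero-length index shape, and the
  // storage shape is then derived from `shape` and that index length.
  mxnet::NDArray rsp(mxnet::kRowSparseStorage,
                     mxnet::TShape(mshadow::Shape2(8, 4)),
                     mxnet::Context::CPU(),
                     /*delay_alloc=*/true,
                     mshadow::kFloat32,
                     /*aux_types=*/{},
                     /*aux_shapes=*/{},
                     /*storage_shape=*/mxnet::TShape(mshadow::Shape1(0)));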
void NDArray::SetShapeFromChunk() {
@@ -111,38 +114,41 @@ struct ChunkMem {
NDArray::Chunk::~Chunk() {
bool skip_free = static_data || delay_alloc;
ChunkMem mem;
- mem.h = this->shandle;
+ mem.h = this->shandle;
mem.aux_h = this->aux_handles;
#if MXNET_USE_MKLDNN == 1
// We want to delete mkldnn memory after deleting the variable.
mem.mem = this->mkl_mem_;
#endif
if (auto engine = engine_ref_.lock()) {
- engine->DeleteVariable([mem, skip_free](RunContext s) {
- if (skip_free == false) {
+ engine->DeleteVariable(
+ [mem, skip_free](RunContext s) {
+ if (skip_free == false) {
#if MXNET_USE_MKLDNN == 1
- if (mem.mem) {
- CHECK_LE(mem.mem->GetSize(), mem.h.size);
- CHECK_EQ(mem.mem->GetDataHandle(), mem.h.dptr);
- }
+ if (mem.mem) {
+ CHECK_LE(mem.mem->GetSize(), mem.h.size);
+ CHECK_EQ(mem.mem->GetDataHandle(), mem.h.dptr);
+ }
#endif
- Storage::Get()->Free(mem.h);
- for (const auto &aux : mem.aux_h) {
- Storage::Get()->Free(aux);
- }
- }
- }, shandle.ctx, var);
+ Storage::Get()->Free(mem.h);
+ for (const auto& aux : mem.aux_h) {
+ Storage::Get()->Free(aux);
+ }
+ }
+ },
+ shandle.ctx,
+ var);
}
}
-void NDArray::Chunk::CheckAndAllocData(const mxnet::TShape &shape, int dtype) {
- CHECK_NE(aux_shapes.size(), 0)
- << "data is expected to be allocated after aux_data";
+void NDArray::Chunk::CheckAndAllocData(const mxnet::TShape& shape, int dtype) {
+ CHECK_NE(aux_shapes.size(), 0) << "data is expected to be allocated after aux_data";
auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype);
if (!features::is_enabled(features::INT64_TENSOR_SIZE)) {
- CHECK_LT(shape.Size(), (int64_t{1} << 31) - 1) <<
- "[CheckAndAllocData] Size of tensor you are trying to allocate is larger than "
- "2^31 elements. Please build with flag USE_INT64_TENSOR_SIZE=1";
+ CHECK_LT(shape.Size(), (int64_t{1} << 31) - 1)
+ << "[CheckAndAllocData] Size of tensor you are trying to allocate is "
+ "larger than "
+ "2^31 elements. Please build with flag USE_INT64_TENSOR_SIZE=1";
}
if (shandle.size < dbytes) {
// free storage
@@ -160,7 +166,8 @@ void NDArray::Chunk::CheckAndAllocData(const mxnet::TShape &shape, int dtype) {
}
NDArray NDArray::grad() const {
- if (Imperative::AGInfo::IsNone(*this)) return NDArray();
+ if (Imperative::AGInfo::IsNone(*this))
+ return NDArray();
Imperative::AGInfo& info = Imperative::AGInfo::Get(entry_.node);
if (info.out_grads.size()) {
CHECK_EQ(info.out_grads.size(), 1);
@@ -171,7 +178,8 @@ NDArray NDArray::grad() const {
nnvm::Symbol NDArray::get_autograd_symbol() const {
CHECK(!Imperative::AGInfo::IsNone(*this))
- << "NDArray is not part of a computation graph. Did you forget to turn on recording?";
+ << "NDArray is not part of a computation graph. Did you forget to turn "
+ "on recording?";
nnvm::Symbol ret;
ret.outputs.emplace_back(entry_);
return ret;
@@ -179,36 +187,35 @@ nnvm::Symbol NDArray::get_autograd_symbol() const {
#if MXNET_USE_MKLDNN == 1
-NDArray::NDArray(const mkldnn::memory::desc &md)
- : storage_type_(kDefaultStorage), entry_(nullptr) {
+NDArray::NDArray(const mkldnn::memory::desc& md) : storage_type_(kDefaultStorage), entry_(nullptr) {
shape_ = mxnet::TShape(md.data.dims, md.data.dims + md.data.ndims);
dtype_ = get_mxnet_type(md.data.data_type);
- ptr_ = std::make_shared<Chunk>(shape_, Context::CPU(), true, dtype_);
+ ptr_ = std::make_shared<Chunk>(shape_, Context::CPU(), true, dtype_);
ptr_->CheckAndAlloc(md.get_size());
ptr_->mkl_mem_ = std::make_shared<MKLDNNMemory>(md, ptr_->shandle.dptr);
}
-NDArray::NDArray(const std::shared_ptr<mkldnn::memory> &mkldnn_mem)
+NDArray::NDArray(const std::shared_ptr<mkldnn::memory>& mkldnn_mem)
: storage_type_(kDefaultStorage), entry_(nullptr) {
- auto mem_desc = mkldnn_mem->get_desc();
- shape_ = mxnet::TShape(mem_desc.data.dims, mem_desc.data.dims + mem_desc.data.ndims);
- dtype_ = get_mxnet_type(mem_desc.data.data_type);
- ptr_ = std::make_shared<Chunk>(shape_, Context::CPU(), true, dtype_);
+ auto mem_desc = mkldnn_mem->get_desc();
+ shape_ = mxnet::TShape(mem_desc.data.dims, mem_desc.data.dims + mem_desc.data.ndims);
+ dtype_ = get_mxnet_type(mem_desc.data.data_type);
+ ptr_ = std::make_shared<Chunk>(shape_, Context::CPU(), true, dtype_);
ptr_->shandle.dptr = mkldnn_mem->get_data_handle();
ptr_->shandle.size = mem_desc.get_size();
- ptr_->delay_alloc = false;
- ptr_->mkl_mem_ = std::make_shared<MKLDNNMemory>(mkldnn_mem);
- ptr_->static_data = true;
+ ptr_->delay_alloc = false;
+ ptr_->mkl_mem_ = std::make_shared<MKLDNNMemory>(mkldnn_mem);
+ ptr_->static_data = true;
}
-NDArray NDArray::MKLDNNDataReshape(const mxnet::TShape &shape) const {
+NDArray NDArray::MKLDNNDataReshape(const mxnet::TShape& shape) const {
CHECK(!is_none()) << "NDArray is not initialized";
CHECK_GE(shape_.Size(), shape.Size())
- << "NDArray.Reshape: target shape size is larger current shape";
+ << "NDArray.Reshape: target shape size is larger current shape";
CHECK_EQ(storage_type(), kDefaultStorage);
if (!IsMKLDNNData()) {
NDArray ret = this->Detach();
- ret.shape_ = shape;
+ ret.shape_ = shape;
return ret;
} else {
NDArray ret(shape, ctx(), true, dtype());
@@ -216,32 +223,32 @@ NDArray NDArray::MKLDNNDataReshape(const mxnet::TShape &shape) const {
// be called in operators.
mkldnn_format_tag_t format = ptr_->mkl_mem_->GetDefaultFormat();
CHECK(ptr_->IsMKLDNN());
- mkldnn::memory::desc def_desc = ptr_->mkl_mem_->GetDesc(format);
- mkldnn::memory *def_mem = TmpMemMgr::Get()->Alloc(def_desc);
- MKLDNNStream *stream = MKLDNNStream::Get();
+ mkldnn::memory::desc def_desc = ptr_->mkl_mem_->GetDesc(format);
+ mkldnn::memory* def_mem = TmpMemMgr::Get()->Alloc(def_desc);
+ MKLDNNStream* stream = MKLDNNStream::Get();
std::shared_ptr<mkldnn::memory> curr_mem = ptr_->mkl_mem_->GetMem();
stream->RegisterMem(curr_mem);
- std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, *curr_mem},
- {MKLDNN_ARG_TO, *def_mem}});
+ std::unordered_map<int, mkldnn::memory> args(
+ {{MKLDNN_ARG_FROM, *curr_mem}, {MKLDNN_ARG_TO, *def_mem}});
stream->RegisterPrimArgs(mkldnn::reorder(*curr_mem, *def_mem), args);
// def_mem points to a memory region in the temp space. It's only valid
// inside an operator. As such, the returned NDArray can only be valid
// inside an operator and the shared point doesn't need to do anything
// when it's destroyed.
- auto tmp = std::shared_ptr<mkldnn::memory>(def_mem, [](mkldnn::memory *mem) {});
+ auto tmp = std::shared_ptr<mkldnn::memory>(def_mem, [](mkldnn::memory* mem) {});
ret.ptr_->mkl_mem_.reset(new MKLDNNMemory(tmp));
ret.ptr_->shandle.dptr = def_mem->get_data_handle();
ret.ptr_->shandle.size = def_mem->get_desc().get_size();
- ret.ptr_->delay_alloc = false;
- ret.ptr_->static_data = true;
- ret.byte_offset_ = byte_offset_;
- ret.reuse_ = false;
+ ret.ptr_->delay_alloc = false;
+ ret.ptr_->static_data = true;
+ ret.byte_offset_ = byte_offset_;
+ ret.reuse_ = false;
return ret;
}
}
#endif
-NDArray NDArray::Reshape(const mxnet::TShape &shape) const {
+NDArray NDArray::Reshape(const mxnet::TShape& shape) const {
CHECK(!is_none()) << "NDArray is not initialized";
if (Imperative::Get()->is_np_shape()) {
CHECK_EQ(shape_.Size(), shape.Size())
@@ -249,7 +256,8 @@ NDArray NDArray::Reshape(const mxnet::TShape &shape) const {
<< "current shape.";
} else {
CHECK_GE(shape_.Size(), shape.Size())
- << "NDArray.Reshape: target shape size is larger than the current shape";
+ << "NDArray.Reshape: target shape size is larger than the current "
+ "shape";
}
NDArray ret = this->Detach();
// If the shape doesn't change, we can just return it now.
@@ -262,15 +270,16 @@ NDArray NDArray::Reshape(const mxnet::TShape &shape) const {
return ret;
}
-NDArray NDArray::ReshapeWithRecord(const mxnet::TShape &shape) {
+NDArray NDArray::ReshapeWithRecord(const mxnet::TShape& shape) {
NDArray ret = this->Reshape(shape);
- if (!Imperative::Get()->is_recording()) return ret;
+ if (!Imperative::Get()->is_recording())
+ return ret;
CHECK_EQ(shape_.Size(), shape.Size())
- << "NDArray.Reshape: target shape must have the same size as "
- << "current shape when recording with autograd.";
+ << "NDArray.Reshape: target shape must have the same size as "
+ << "current shape when recording with autograd.";
nnvm::NodeAttrs attrs;
- attrs.op = nnvm::Op::Get("Reshape");;
+ attrs.op = nnvm::Op::Get("Reshape");
std::ostringstream os;
os << shape;
attrs.dict.insert({"shape", os.str()});
@@ -282,23 +291,22 @@ NDArray NDArray::ReshapeWithRecord(const mxnet::TShape &shape) {
NDArray NDArray::Slice(index_t begin, index_t end) const {
CHECK(!is_none()) << "NDArray is empty";
- CHECK_LE(begin, end)
- << "Invalid slicing range [" << begin << ", " << end << ")";
+ CHECK_LE(begin, end) << "Invalid slicing range [" << begin << ", " << end << ")";
CHECK_GE(shape_[0], end) << "Slice end index out of range";
CHECK_EQ(storage_type(), kDefaultStorage);
- NDArray ret = this->Detach();
+ NDArray ret = this->Detach();
size_t length = shape_.ProdShape(1, shape_.ndim());
- MSHADOW_TYPE_SWITCH_WITH_BOOL(ret.dtype(), DType, {
- ret.byte_offset_ += begin * length * sizeof(DType);
- });
- ret.reuse_ = false;
+ MSHADOW_TYPE_SWITCH_WITH_BOOL(
+ ret.dtype(), DType, { ret.byte_offset_ += begin * length * sizeof(DType); });
+ ret.reuse_ = false;
ret.shape_[0] = end - begin;
return ret;
}
NDArray NDArray::SliceWithRecord(index_t begin, index_t end) {
NDArray ret = this->Slice(begin, end);
- if (!Imperative::Get()->is_recording()) return ret;
+ if (!Imperative::Get()->is_recording())
+ return ret;
// fake a slice op
nnvm::NodeAttrs attrs;
attrs.op = nnvm::Op::Get("slice");
@@ -313,9 +321,9 @@ NDArray NDArray::SliceWithRecord(index_t begin, index_t end) {
NDArray NDArray::At(index_t idx) const {
CHECK(storage_type() == kDefaultStorage)
<< "Storage type " << storage_type() << " doesn't support At()";
- NDArray ret = this->Slice(idx, idx+1);
+ NDArray ret = this->Slice(idx, idx + 1);
if (shape_.ndim() > 1) {
- return ret.Reshape(mxnet::TShape(shape_.data()+1, shape_.data()+shape_.ndim()));
+ return ret.Reshape(mxnet::TShape(shape_.data() + 1, shape_.data() + shape_.ndim()));
} else {
return ret;
}
@@ -324,9 +332,9 @@ NDArray NDArray::At(index_t idx) const {
NDArray NDArray::AtWithRecord(index_t idx) {
CHECK(storage_type() == kDefaultStorage)
<< "Storage type " << storage_type() << " doesn't support At()";
- NDArray ret = this->SliceWithRecord(idx, idx+1);
+ NDArray ret = this->SliceWithRecord(idx, idx + 1);
if (shape_.ndim() > 1 || Imperative::Get()->is_np_shape()) {
- return ret.ReshapeWithRecord(mxnet::TShape(shape_.data()+1, shape_.data()+shape_.ndim()));
+ return ret.ReshapeWithRecord(mxnet::TShape(shape_.data() + 1, shape_.data() + shape_.ndim()));
} else {
return ret;
}
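Slice() and At() above only adjust byte_offset_ and the leading dimension, so both return views over the same storage. A small sketch of the resulting shapes (names are illustrative):

  NDArray a(mxnet::TShape(mshadow::Shape2(3, 4)), mxnet::Context::CPU());
  NDArray rows = a.Slice(1, 3);  // shares storage, shape becomes (2, 4)
  NDArray row  = a.At(1);        // Slice(1, 2) plus a Reshape, shape becomes (4,)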
@@ -359,20 +367,19 @@ struct NDArrayDLManager {
DLManagedTensor* NDArray::ToDLPack() const {
CHECK(!is_none()) << "NDArray is not initialized";
NDArrayDLManager* dlmanager(new NDArrayDLManager);
- dlmanager->handle = *this;
- dlmanager->tensor.dl_tensor = dlmanager->handle.data().dltensor();
+ dlmanager->handle = *this;
+ dlmanager->tensor.dl_tensor = dlmanager->handle.data().dltensor();
dlmanager->tensor.manager_ctx = dlmanager;
- dlmanager->tensor.deleter = [](DLManagedTensor* dlmanager){
+ dlmanager->tensor.deleter = [](DLManagedTensor* dlmanager) {
delete static_cast<NDArrayDLManager*>(dlmanager->manager_ctx);
};
return &(dlmanager->tensor);
}
NDArray NDArray::FromDLPack(const DLManagedTensor* tensor, bool transient_handle) {
- DLManagedTensor *tensor_copy = transient_handle
- ? new DLManagedTensor(*tensor)
- : const_cast<DLManagedTensor*>(tensor);
- auto deleter = [tensor_copy, transient_handle](){
+ DLManagedTensor* tensor_copy =
+ transient_handle ? new DLManagedTensor(*tensor) : const_cast<DLManagedTensor*>(tensor);
+ auto deleter = [tensor_copy, transient_handle]() {
if (tensor_copy->deleter != nullptr) {
tensor_copy->deleter(tensor_copy);
}
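A sketch of a round trip through ToDLPack()/FromDLPack() above, assuming a dense CPU array; the variable names are illustrative:

  NDArray a(mxnet::TShape(mshadow::Shape1(16)), mxnet::Context::CPU());
  DLManagedTensor* dl = a.ToDLPack();           // keeps `a` alive via manager_ctx
  NDArray b = NDArray::FromDLPack(dl, false);   // views the same buffer; its
                                                // deleter later invokes dl->deleter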
@@ -384,17 +391,18 @@ NDArray NDArray::FromDLPack(const DLManagedTensor* tensor, bool transient_handle
}
bool NDArray::fresh_out_grad() const {
- if (Imperative::AGInfo::IsNone(*this)) return false;
+ if (Imperative::AGInfo::IsNone(*this))
+ return false;
Imperative::AGInfo& info = Imperative::AGInfo::Get(entry_.node);
return info.fresh_out_grad;
}
-
void NDArray::set_fresh_out_grad(bool state) const {
CHECK(!Imperative::AGInfo::IsNone(*this))
- << "NDArray has not been marked as a variable and does not have gradient state";
+ << "NDArray has not been marked as a variable and does not have gradient "
+ "state";
Imperative::AGInfo& info = Imperative::AGInfo::Get(entry_.node);
- info.fresh_out_grad = state;
+ info.fresh_out_grad = state;
}
#if MXNET_USE_MKLDNN == 1
@@ -424,7 +432,7 @@ void NDArray::Chunk::Reorder2Default() {
if (IsDefault())
return;
- mkldnn_format_tag_t format = mkl_mem_->GetDefaultFormat();
+ mkldnn_format_tag_t format = mkl_mem_->GetDefaultFormat();
mkldnn::memory::desc def_desc = mkl_mem_->GetDesc(format);
mkldnn_mem_ptr def_mem(new mkldnn::memory(def_desc, CpuEngine::Get()->get_engine()));
mkl_mem_->ReorderTo(def_mem.get());
@@ -436,7 +444,7 @@ void NDArray::Chunk::Reorder2Default() {
mkl_mem_ = nullptr;
}
-void NDArray::Chunk::MKLDNNDataReorder(const mkldnn::memory::desc &md) {
+void NDArray::Chunk::MKLDNNDataReorder(const mkldnn::memory::desc& md) {
// If the memory already uses the specified layout, don't do anything.
if (mkl_mem_ != nullptr && mkl_mem_->SameFormat(md))
return;
@@ -456,7 +464,7 @@ void NDArray::Chunk::MKLDNNDataReorder(const mkldnn::memory::desc &md) {
std::shared_ptr<mkldnn::memory> old_mem;
if (IsDefault()) {
mkldnn_format_tag_t def_format = GetDefaultFormat(md);
- mkldnn::memory::desc def_desc = GetDesc(md, def_format);
+ mkldnn::memory::desc def_desc = GetDesc(md, def_format);
old_mem.reset(new mkldnn::memory(def_desc, engine, shandle.dptr));
} else {
old_mem = this->mkl_mem_->GetMem();
@@ -473,12 +481,11 @@ void NDArray::Chunk::MKLDNNDataReorder(const mkldnn::memory::desc &md) {
mkl_mem_.reset(new MKLDNNMemory(md, shandle.dptr));
}
-void NDArray::Chunk::SetMKLMem(const mxnet::TShape &shape, int dtype) {
+void NDArray::Chunk::SetMKLMem(const mxnet::TShape& shape, int dtype) {
// The shape of the array and the one of the MKL memory may mismatch.
// For example, if the array stores parameters, the MKL memory may store data
// in 5 dimensions while the NDArray stores data in 4 dimensions.
- if (mkl_mem_ && mkl_mem_->GetDataHandle() == shandle.dptr
- && mkl_mem_->SameFormat(shape, dtype)) {
+ if (mkl_mem_ && mkl_mem_->GetDataHandle() == shandle.dptr && mkl_mem_->SameFormat(shape, dtype)) {
return;
}
@@ -493,12 +500,24 @@ void NDArray::Chunk::SetMKLMem(const mxnet::TShape &shape, int dtype) {
}
mkldnn::memory::format_tag layout = mkldnn::memory::format_tag::undef;
switch (dims.size()) {
- case 1: layout = mkldnn::memory::format_tag::a; break;
- case 2: layout = mkldnn::memory::format_tag::ab; break;
- case 3: layout = mkldnn::memory::format_tag::abc; break;
- case 4: layout = mkldnn::memory::format_tag::abcd; break;
- case 5: layout = mkldnn::memory::format_tag::abcde; break;
- case 6: layout = mkldnn::memory::format_tag::abcdef; break;
+ case 1:
+ layout = mkldnn::memory::format_tag::a;
+ break;
+ case 2:
+ layout = mkldnn::memory::format_tag::ab;
+ break;
+ case 3:
+ layout = mkldnn::memory::format_tag::abc;
+ break;
+ case 4:
+ layout = mkldnn::memory::format_tag::abcd;
+ break;
+ case 5:
+ layout = mkldnn::memory::format_tag::abcde;
+ break;
+ case 6:
+ layout = mkldnn::memory::format_tag::abcdef;
+ break;
default:
LOG(FATAL) << "Not implemented dimension (" << dims.size() << ") for MKLDNN";
}
@@ -511,12 +530,12 @@ void NDArray::Chunk::SetMKLMem(const mxnet::TShape &shape, int dtype) {
mkl_mem_.reset(new MKLDNNMemory(data_md, shandle.dptr));
}
-const mkldnn::memory *NDArray::GetMKLDNNData(const mkldnn::memory::desc &desc) const {
+const mkldnn::memory* NDArray::GetMKLDNNData(const mkldnn::memory::desc& desc) const {
if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) {
LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc";
return nullptr;
}
- const mkldnn::memory *mem = GetMKLDNNData();
+ const mkldnn::memory* mem = GetMKLDNNData();
mkldnn::memory::desc desc1 = mem->get_desc();
// The MKL memory has the same format and shape as required,
// or both use the default format, we can return the MKL memory.
@@ -527,13 +546,12 @@ const mkldnn::memory *NDArray::GetMKLDNNData(const mkldnn::memory::desc &desc) c
}
}
-const mkldnn::memory *NDArray::GetMKLDNNDataReorder(
- const mkldnn::memory::desc &new_desc) const {
+const mkldnn::memory* NDArray::GetMKLDNNDataReorder(const mkldnn::memory::desc& new_desc) const {
CHECK(storage_type() == kDefaultStorage);
- const mkldnn::memory *mem = GetMKLDNNData();
+ const mkldnn::memory* mem = GetMKLDNNData();
// If the memory descriptor matches, it's easy.
- MKLDNNStream *stream = MKLDNNStream::Get();
+ MKLDNNStream* stream = MKLDNNStream::Get();
if (mem->get_desc() == new_desc) {
return GetMKLDNNExact(mem, new_desc);
}
@@ -542,13 +560,13 @@ const mkldnn::memory *NDArray::GetMKLDNNDataReorder(
// Now we need to determine if we should reorder the memory.
// If both use the default formats, we think we don't need to reorder.
if ((!mxnet::IsMKLDNN(old_desc)) && (!mxnet::IsMKLDNN(new_desc))) {
- mkldnn_mem_ptr ret(new mkldnn::memory(new_desc,
- CpuEngine::Get()->get_engine(), mem->get_data_handle()));
+ mkldnn_mem_ptr ret(
+ new mkldnn::memory(new_desc, CpuEngine::Get()->get_engine(), mem->get_data_handle()));
stream->RegisterMem(ret);
return ret.get();
} else if (same_shape(old_desc, new_desc)) {
// If they have the same shape, we can reorder data directly.
- mkldnn::memory *ret = TmpMemMgr::Get()->Alloc(new_desc);
+ mkldnn::memory* ret = TmpMemMgr::Get()->Alloc(new_desc);
std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, *mem}, {MKLDNN_ARG_TO, *ret}});
stream->RegisterPrimArgs(mkldnn::reorder(*mem, *ret), args);
return ret;
@@ -559,14 +577,14 @@ const mkldnn::memory *NDArray::GetMKLDNNDataReorder(
mxnet::TShape required_shape(new_desc.data.ndims, -1);
for (int i = 0; i < new_desc.data.ndims; i++)
required_shape[i] = new_desc.data.dims[i];
- NDArray reshaped = MKLDNNDataReshape(required_shape);
- const mkldnn::memory *ret = reshaped.GetMKLDNNData();
+ NDArray reshaped = MKLDNNDataReshape(required_shape);
+ const mkldnn::memory* ret = reshaped.GetMKLDNNData();
if (ret->get_desc() == new_desc) {
return GetMKLDNNExact(ret, new_desc);
} else {
- mkldnn::memory *ret2 = TmpMemMgr::Get()->Alloc(new_desc);
- std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, *ret},
- {MKLDNN_ARG_TO, *ret2}});
+ mkldnn::memory* ret2 = TmpMemMgr::Get()->Alloc(new_desc);
+ std::unordered_map<int, mkldnn::memory> args(
+ {{MKLDNN_ARG_FROM, *ret}, {MKLDNN_ARG_TO, *ret2}});
stream->RegisterPrimArgs(mkldnn::reorder(*ret, *ret2), args);
return ret2;
}
@@ -584,17 +602,18 @@ NDArray NDArray::Reorder2Default() const {
// create new ndarray from mkldnn layout
mkldnn::memory::desc from_desc = ptr_->mkl_mem_->GetDesc();
mxnet::TShape tshape(from_desc.data.ndims, -1);
- for (int i = 0; i < from_desc.data.ndims; i++) tshape[i] = from_desc.data.dims[i];
+ for (int i = 0; i < from_desc.data.ndims; i++)
+ tshape[i] = from_desc.data.dims[i];
NDArray ret(tshape, ctx(), false, dtype());
- mkldnn_format_tag_t format = ptr_->mkl_mem_->GetDefaultFormat();
+ mkldnn_format_tag_t format = ptr_->mkl_mem_->GetDefaultFormat();
mkldnn::memory::desc def_desc = ptr_->mkl_mem_->GetDesc(format);
CHECK(ret.ptr_->shandle.size >= def_desc.get_size());
mkldnn::memory def_mem(def_desc, CpuEngine::Get()->get_engine(), ret.ptr_->shandle.dptr);
ptr_->mkl_mem_->ReorderTo(&def_mem);
// reshape as needed
- ret.shape_ = shape_;
+ ret.shape_ = shape_;
ret.byte_offset_ = byte_offset_;
- ret.reuse_ = false;
+ ret.reuse_ = false;
return ret;
}
@@ -603,17 +622,22 @@ void NDArray::Reorder2DefaultAsync() const {
std::vector<Engine::VarHandle> mutable_vars(1, this->var());
NDArray tmp = *this;
Engine::Get()->PushAsync(
- [tmp](RunContext ctx, Engine::CallbackOnComplete on_complete) {
- tmp.ptr_->Reorder2Default();
- on_complete();
- }, ctx(), const_vars, mutable_vars,
- FnProperty::kNormal, 0, "Reorder2Default");
+ [tmp](RunContext ctx, Engine::CallbackOnComplete on_complete) {
+ tmp.ptr_->Reorder2Default();
+ on_complete();
+ },
+ ctx(),
+ const_vars,
+ mutable_vars,
+ FnProperty::kNormal,
+ 0,
+ "Reorder2Default");
}
// now just support bf16->fp32
NDArray NDArray::Reorder2DefaultFloatFormat() const {
CHECK(storage_type() == kDefaultStorage && IsView() == false);
- if (dtype() != mshadow::kBfloat16) {
+ if (dtype() != mshadow::kBfloat16) {
return Reorder2Default();
}
NDArray ret(shape(), ctx(), false, mshadow::DataType<float>::kFlag);
@@ -624,24 +648,29 @@ NDArray NDArray::Reorder2DefaultFloatFormat() const {
return ret;
}
-void NDArray::MKLDNNDataReorderAsync(const mkldnn::memory::desc &desc) const {
+void NDArray::MKLDNNDataReorderAsync(const mkldnn::memory::desc& desc) const {
std::vector<Engine::VarHandle> const_vars;
std::vector<Engine::VarHandle> mutable_vars(1, this->var());
- NDArray tmp = *this;
+ NDArray tmp = *this;
const auto version = this->version();
Engine::Get()->PushAsync(
- [tmp, version, desc](RunContext ctx, Engine::CallbackOnComplete on_complete) {
- // MXNet will try to reuse NDArray from memory planning, so we need to ensure
- // the NDArray is still holding the original trunk data.
- if (tmp.version() == version) {
- tmp.ptr_->MKLDNNDataReorder(desc);
- }
- on_complete();
- }, ctx(), const_vars, mutable_vars,
- FnProperty::kNormal, 0, "Reorder");
+ [tmp, version, desc](RunContext ctx, Engine::CallbackOnComplete on_complete) {
+ // MXNet will try to reuse NDArray from memory planning, so we need to
+ // ensure the NDArray is still holding the original trunk data.
+ if (tmp.version() == version) {
+ tmp.ptr_->MKLDNNDataReorder(desc);
+ }
+ on_complete();
+ },
+ ctx(),
+ const_vars,
+ mutable_vars,
+ FnProperty::kNormal,
+ 0,
+ "Reorder");
}
-const mkldnn::memory *NDArray::GetMKLDNNData() const {
+const mkldnn::memory* NDArray::GetMKLDNNData() const {
CHECK(storage_type() == kDefaultStorage);
bool is_view = IsView();
if (IsMKLDNNData()) {
@@ -657,13 +686,13 @@ const mkldnn::memory *NDArray::GetMKLDNNData() const {
// because we don't have the complete data type and shape information for
// the chunk.
CheckAndAlloc();
- void *off_addr = static_cast<char *>(ptr_->shandle.dptr) + byte_offset_;
+ void* off_addr = static_cast<char*>(ptr_->shandle.dptr) + byte_offset_;
// Create the primitive desc for the new mkldnn memory.
mkldnn::memory::dims dims(shape().ndim());
for (size_t i = 0; i < dims.size(); i++)
dims[i] = shape()[i];
- mkldnn::memory::format_tag cpp_format = static_cast<mkldnn::memory::format_tag>(
- GetDefaultFormat(shape().ndim()));
+ mkldnn::memory::format_tag cpp_format =
+ static_cast<mkldnn::memory::format_tag>(GetDefaultFormat(shape().ndim()));
mkldnn::memory::data_type cpp_type = get_mkldnn_type(dtype_);
mkldnn::memory::desc data_md(dims, cpp_type, cpp_format);
std::shared_ptr<mkldnn::memory> ret(
@@ -686,7 +715,7 @@ void NDArray::InvalidateMKLDNNData() {
ptr_->mkl_mem_ = nullptr;
}
-void NDArray::CopyFrom(const mkldnn::memory &mem) {
+void NDArray::CopyFrom(const mkldnn::memory& mem) {
CHECK(ptr_ != nullptr) << "The NDArray hasn't been initialized";
if (ptr_->mkl_mem_ && ptr_->mkl_mem_->GetRaw() == &mem)
return;
@@ -699,15 +728,16 @@ void NDArray::CopyFrom(const mkldnn::memory &mem) {
if (IsMKLDNNData() && IsView())
ptr_->Reorder2Default();
- const mkldnn::memory *this_mem = GetMKLDNNData();
+ const mkldnn::memory* this_mem = GetMKLDNNData();
MKLDNNMemoryCopy(mem, this_mem);
}
-mkldnn::memory *NDArray::CreateMKLDNNData(const mkldnn::memory::desc &desc) {
+mkldnn::memory* NDArray::CreateMKLDNNData(const mkldnn::memory::desc& desc) {
if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) {
- LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc. "
- << "MKLDNN memory requests for " << desc.get_size() << " bytes, but got "
- << shape().Size() * GetTypeSize(dtype_) << " bytes from NDArray";
+ LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN "
+ "memory desc. "
+ << "MKLDNN memory requests for " << desc.get_size() << " bytes, but got "
+ << shape().Size() * GetTypeSize(dtype_) << " bytes from NDArray";
return nullptr;
}
bool isDefaultFormat = IsDefaultFormat(desc);
@@ -720,8 +750,10 @@ mkldnn::memory *NDArray::CreateMKLDNNData(const mkldnn::memory::desc &desc) {
CHECK(ptr_->shandle.dptr);
// When this is a view and a user wants the default layout, we can simply
// create a new mkldnn memory that points to the right memory.
- std::shared_ptr<mkldnn::memory> mem(new mkldnn::memory(desc,
- CpuEngine::Get()->get_engine(), static_cast<char *>(ptr_->shandle.dptr) + byte_offset_));
+ std::shared_ptr<mkldnn::memory> mem(
+ new mkldnn::memory(desc,
+ CpuEngine::Get()->get_engine(),
+ static_cast<char*>(ptr_->shandle.dptr) + byte_offset_));
MKLDNNStream::Get()->RegisterMem(mem);
return mem.get();
} else if (IsView()) {
@@ -747,9 +779,9 @@ mkldnn::memory *NDArray::CreateMKLDNNData(const mkldnn::memory::desc &desc) {
return ptr_->mkl_mem_->GetRaw();
}
-void NDArray::UpdateMKLDNNMemDesc(const mkldnn::memory::desc &desc) {
- auto new_desc = desc;
- auto this_dtype = get_mkldnn_type(dtype());
+void NDArray::UpdateMKLDNNMemDesc(const mkldnn::memory::desc& desc) {
+ auto new_desc = desc;
+ auto this_dtype = get_mkldnn_type(dtype());
new_desc.data.data_type = static_cast<mkldnn_data_type_t>(this_dtype);
ptr_->mkl_mem_.reset(new MKLDNNMemory(new_desc, ptr_->shandle.dptr));
MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_->GetMem());
@@ -760,12 +792,12 @@ void NDArray::UpdateMKLDNNMemDesc(const mkldnn::memory::desc &desc) {
void NDArray::SetTBlob() const {
CHECK(ptr_ != nullptr);
mxnet::TShape shape = shape_;
- char *dptr = static_cast<char*>(ptr_->shandle.dptr);
- auto stype = storage_type();
+ char* dptr = static_cast<char*>(ptr_->shandle.dptr);
+ auto stype = storage_type();
if (stype == kDefaultStorage) {
#if MXNET_USE_MKLDNN == 1
CHECK(!IsMKLDNNData()) << "We can't generate TBlob for MKLDNN data. "
- << "Please use Reorder2Default() to generate a new NDArray first";
+ << "Please use Reorder2Default() to generate a new NDArray first";
#endif
dptr += byte_offset_;
} else if (stype == kCSRStorage || stype == kRowSparseStorage) {
@@ -774,27 +806,24 @@ void NDArray::SetTBlob() const {
} else {
LOG(FATAL) << "unknown storage type " << stype;
}
- tblob_.dptr_ = dptr;
- tblob_.shape_ = shape;
+ tblob_.dptr_ = dptr;
+ tblob_.shape_ = shape;
tblob_.type_flag_ = dtype_;
tblob_.SetDLTensor(ptr_->shandle.ctx.dev_mask(), ptr_->shandle.ctx.dev_id);
}
/*!
-* \brief run a ternary operation
-* \param lhs left operand
-* \param mhs middle operand
-* \param rhs right operand
-* \param out the output ndarray
-*/
-template<typename OP>
-void TernaryOp(const NDArray &lhs,
- const NDArray &mhs,
- const NDArray &rhs,
- NDArray *out) {
+ * \brief run a ternary operation
+ * \param lhs left operand
+ * \param mhs middle operand
+ * \param rhs right operand
+ * \param out the output ndarray
+ */
+template <typename OP>
+void TernaryOp(const NDArray& lhs, const NDArray& mhs, const NDArray& rhs, NDArray* out) {
// no check if all of them are on cpu
- if (lhs.ctx().dev_mask() != cpu::kDevMask || mhs.ctx().dev_mask() != cpu::kDevMask
- || rhs.ctx().dev_mask() != cpu::kDevMask) {
+ if (lhs.ctx().dev_mask() != cpu::kDevMask || mhs.ctx().dev_mask() != cpu::kDevMask ||
+ rhs.ctx().dev_mask() != cpu::kDevMask) {
CHECK((lhs.ctx() == mhs.ctx()) && (mhs.ctx() == rhs.ctx())) << "operands context mismatch";
}
// if out is none, allocate space
@@ -802,60 +831,75 @@ void TernaryOp(const NDArray &lhs,
*out = NDArray(OP::GetShape(lhs.shape(), mhs.shape(), rhs.shape()), lhs.ctx(), true);
} else {
// no check if both of them are on cpu
- if (lhs.ctx().dev_mask() != cpu::kDevMask ||
- out->ctx().dev_mask() != cpu::kDevMask) {
+ if (lhs.ctx().dev_mask() != cpu::kDevMask || out->ctx().dev_mask() != cpu::kDevMask) {
CHECK(out->ctx() == lhs.ctx()) << "target context mismatch";
}
CHECK(out->shape() == OP::GetShape(lhs.shape(), mhs.shape(), rhs.shape()))
- << "target shape mismatch";
+ << "target shape mismatch";
}
// important: callback must always capture by value
NDArray ret = *out;
// get the const variables
std::vector<Engine::VarHandle> const_vars;
- if (lhs.var() != ret.var()) const_vars.push_back(lhs.var());
- if (mhs.var() != ret.var()) const_vars.push_back(mhs.var());
- if (rhs.var() != ret.var()) const_vars.push_back(rhs.var());
+ if (lhs.var() != ret.var())
+ const_vars.push_back(lhs.var());
+ if (mhs.var() != ret.var())
+ const_vars.push_back(mhs.var());
+ if (rhs.var() != ret.var())
+ const_vars.push_back(rhs.var());
// redirect everything to mshadow operations
switch (lhs.ctx().dev_mask()) {
- case cpu::kDevMask: {
- Engine::Get()->PushSync([lhs, mhs, rhs, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- ndarray::Eval<cpu, OP>(lhs.data(), mhs.data(), rhs.data(), &tmp, ctx);
- }, lhs.ctx(), const_vars, { ret.var() },
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
- break;
- }
+ case cpu::kDevMask: {
+ Engine::Get()->PushSync(
+ [lhs, mhs, rhs, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ ndarray::Eval<cpu, OP>(lhs.data(), mhs.data(), rhs.data(), &tmp, ctx);
+ },
+ lhs.ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
+ break;
+ }
#if MXNET_USE_CUDA
- case gpu::kDevMask: {
- Engine::Get()->PushSync([lhs, mhs, rhs, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- ndarray::Eval<gpu, OP>(lhs.data(), mhs.data(), rhs.data(), &tmp, ctx);
- // Wait GPU kernel to complete
- ctx.get_stream<gpu>()->Wait();
- }, lhs.ctx(), const_vars, { ret.var() },
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
- break;
- }
+ case gpu::kDevMask: {
+ Engine::Get()->PushSync(
+ [lhs, mhs, rhs, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ ndarray::Eval<gpu, OP>(lhs.data(), mhs.data(), rhs.data(), &tmp, ctx);
+ // Wait GPU kernel to complete
+ ctx.get_stream<gpu>()->Wait();
+ },
+ lhs.ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
+ break;
+ }
#endif
- default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+ default:
+ LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
}
}
/*!
-* \brief Performs some preparation required to apply binary operators.
-* Checks context and shape of ndarrays, allocates space for output
-* and prepares const variables for engine
-* \param lhs left operand
-* \param rhs right operand
-* \param out the output ndarray
-* \param binary_op the real operation
-*/
-template<typename OP>
-std::vector<Engine::VarHandle> BinaryOpPrepare(const NDArray &lhs,
- const NDArray &rhs,
- NDArray *out) {
+ * \brief Performs some preparation required to apply binary operators.
+ * Checks context and shape of ndarrays, allocates space for output
+ * and prepares const variables for engine
+ * \param lhs left operand
+ * \param rhs right operand
+ * \param out the output ndarray
+ * \param binary_op the real operation
+ */
+template <typename OP>
+std::vector<Engine::VarHandle> BinaryOpPrepare(const NDArray& lhs,
+ const NDArray& rhs,
+ NDArray* out) {
// no check if both of them are on cpu
if (lhs.ctx().dev_mask() != cpu::kDevMask || rhs.ctx().dev_mask() != cpu::kDevMask) {
CHECK(lhs.ctx() == rhs.ctx()) << "operands context mismatch";
@@ -865,59 +909,69 @@ std::vector<Engine::VarHandle> BinaryOpPrepare(const NDArray &lhs,
*out = NDArray(OP::GetShape(lhs.shape(), rhs.shape()), lhs.ctx(), true, lhs.dtype());
} else {
// no check if both of them are on cpu
- if (lhs.ctx().dev_mask() != cpu::kDevMask ||
- out->ctx().dev_mask() != cpu::kDevMask) {
+ if (lhs.ctx().dev_mask() != cpu::kDevMask || out->ctx().dev_mask() != cpu::kDevMask) {
CHECK(out->ctx() == lhs.ctx()) << "target context mismatch";
}
- CHECK(out->shape() == OP::GetShape(lhs.shape(), rhs.shape()))
- << "target shape mismatch";
+ CHECK(out->shape() == OP::GetShape(lhs.shape(), rhs.shape())) << "target shape mismatch";
}
std::vector<Engine::VarHandle> const_vars;
// prepare const variables for engine
- if (lhs.var() != out->var()) const_vars.push_back(lhs.var());
- if (rhs.var() != out->var()) const_vars.push_back(rhs.var());
+ if (lhs.var() != out->var())
+ const_vars.push_back(lhs.var());
+ if (rhs.var() != out->var())
+ const_vars.push_back(rhs.var());
return const_vars;
}
/*!
-* \brief run a binary operation using the kernel launch method
-* \param lhs left operand
-* \param rhs right operand
-* \param out the output ndarray
-* \param binary_op the real operation
-*/
-template<typename OP>
-void BinaryOpKernel(const NDArray &lhs,
- const NDArray &rhs,
- NDArray *out) {
+ * \brief run a binary operation using the kernel launch method
+ * \param lhs left operand
+ * \param rhs right operand
+ * \param out the output ndarray
+ * \param binary_op the real operation
+ */
+template <typename OP>
+void BinaryOpKernel(const NDArray& lhs, const NDArray& rhs, NDArray* out) {
std::vector<Engine::VarHandle> const_vars = BinaryOpPrepare<OP>(lhs, rhs, out);
// important: callback must always capture by value
NDArray ret = *out;
switch (lhs.ctx().dev_mask()) {
case cpu::kDevMask: {
- Engine::Get()->PushSync([lhs, rhs, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- mshadow::Stream<cpu>* s = ctx.get_stream<cpu>();
- ndarray::BinaryOpKernelImpl<OP>(s, lhs.data(), rhs.data(), &tmp);
- },
- lhs.ctx(), const_vars, {ret.var()},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ Engine::Get()->PushSync(
+ [lhs, rhs, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ mshadow::Stream<cpu>* s = ctx.get_stream<cpu>();
+ ndarray::BinaryOpKernelImpl<OP>(s, lhs.data(), rhs.data(), &tmp);
+ },
+ lhs.ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
break;
}
#if MXNET_USE_CUDA
case gpu::kDevMask: {
- Engine::Get()->PushSync([lhs, rhs, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
- ndarray::BinaryOpKernelImpl<OP>(s, lhs.data(), rhs.data(), &tmp);
- // Wait GPU kernel to complete
- ctx.get_stream<gpu>()->Wait();
- }, lhs.ctx(), const_vars, {ret.var()},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ Engine::Get()->PushSync(
+ [lhs, rhs, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
+ ndarray::BinaryOpKernelImpl<OP>(s, lhs.data(), rhs.data(), &tmp);
+ // Wait GPU kernel to complete
+ ctx.get_stream<gpu>()->Wait();
+ },
+ lhs.ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
break;
-}
+ }
#endif
- default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+ default:
+ LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
}
}
@@ -928,71 +982,89 @@ void BinaryOpKernel(const NDArray &lhs,
* \param out the output ndarray
* \param binary_op the real operation
*/
-template<typename OP>
-void BinaryOp(const NDArray &lhs,
- const NDArray &rhs,
- NDArray *out) {
+template <typename OP>
+void BinaryOp(const NDArray& lhs, const NDArray& rhs, NDArray* out) {
std::vector<Engine::VarHandle> const_vars = BinaryOpPrepare<OP>(lhs, rhs, out);
// important: callback must always capture by value
NDArray ret = *out;
// redirect everything to mshadow operations
switch (lhs.ctx().dev_mask()) {
case cpu::kDevMask: {
- Engine::Get()->PushSync([lhs, rhs, ret](RunContext ctx) {
+ Engine::Get()->PushSync(
+ [lhs, rhs, ret](RunContext ctx) {
TBlob tmp = ret.data();
ndarray::Eval<cpu, OP>(lhs.data(), rhs.data(), &tmp, ctx);
- }, lhs.ctx(), const_vars, {ret.var()},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ },
+ lhs.ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
break;
}
#if MXNET_USE_CUDA
case gpu::kDevMask: {
- Engine::Get()->PushSync([lhs, rhs, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- ndarray::Eval<gpu, OP>(lhs.data(), rhs.data(), &tmp, ctx);
- // Wait GPU kernel to complete
- ctx.get_stream<gpu>()->Wait();
- }, lhs.ctx(), const_vars, {ret.var()},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ Engine::Get()->PushSync(
+ [lhs, rhs, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ ndarray::Eval<gpu, OP>(lhs.data(), rhs.data(), &tmp, ctx);
+ // Wait GPU kernel to complete
+ ctx.get_stream<gpu>()->Wait();
+ },
+ lhs.ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
break;
}
#endif
- default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+ default:
+ LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
}
}
-void SetValueOp(const real_t &rhs, NDArray *out) {
+void SetValueOp(const real_t& rhs, NDArray* out) {
CHECK_NE(out->is_none(), true) << "Set value target must not be empty";
// important: callback must always capture by value
- NDArray ret = *out;
+ NDArray ret = *out;
const NDArrayStorageType stype = ret.storage_type();
- Engine::Get()->PushSync([rhs, ret, stype](RunContext ctx) {
- TBlob tmp = ret.data();
- switch (ret.ctx().dev_mask()) {
- case cpu::kDevMask: {
- if (stype == kDefaultStorage) {
- ndarray::Eval<cpu>(rhs, &tmp, ctx);
- } else {
- ndarray::Eval(ctx.get_stream<cpu>(), rhs, ret);
+ Engine::Get()->PushSync(
+ [rhs, ret, stype](RunContext ctx) {
+ TBlob tmp = ret.data();
+ switch (ret.ctx().dev_mask()) {
+ case cpu::kDevMask: {
+ if (stype == kDefaultStorage) {
+ ndarray::Eval<cpu>(rhs, &tmp, ctx);
+ } else {
+ ndarray::Eval(ctx.get_stream<cpu>(), rhs, ret);
+ }
+ break;
}
- break;
- }
#if MXNET_USE_CUDA
- case gpu::kDevMask: {
- if (stype == kDefaultStorage) {
- ndarray::Eval<gpu>(rhs, &tmp, ctx);
- } else {
- ndarray::Eval(ctx.get_stream<gpu>(), rhs, ret);
+ case gpu::kDevMask: {
+ if (stype == kDefaultStorage) {
+ ndarray::Eval<gpu>(rhs, &tmp, ctx);
+ } else {
+ ndarray::Eval(ctx.get_stream<gpu>(), rhs, ret);
+ }
+ // Wait GPU kernel to complete
+ ctx.get_stream<gpu>()->Wait();
+ break;
}
- // Wait GPU kernel to complete
- ctx.get_stream<gpu>()->Wait();
- break;
- }
#endif
- default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
- }
- }, ret.ctx(), {}, {ret.var()},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ default:
+ LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+ }
+ },
+ ret.ctx(),
+ {},
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
}
/*!
@@ -1002,10 +1074,8 @@ void SetValueOp(const real_t &rhs, NDArray *out) {
* \param out the output ndarray
 * \param binary_op the real operation
*/
-template<typename OP, bool reverse>
-void ScalarOp(const NDArray &lhs,
- const real_t &rhs,
- NDArray *out) {
+template <typename OP, bool reverse>
+void ScalarOp(const NDArray& lhs, const real_t& rhs, NDArray* out) {
if (out->is_none()) {
*out = NDArray(lhs.shape(), lhs.ctx(), true, lhs.dtype());
} else {
@@ -1016,47 +1086,69 @@ void ScalarOp(const NDArray &lhs,
NDArray ret = *out;
// get the const variables
std::vector<Engine::VarHandle> const_vars;
- if (lhs.var() != ret.var()) const_vars.push_back(lhs.var());
+ if (lhs.var() != ret.var())
+ const_vars.push_back(lhs.var());
// redirect everything to mshadow operations
switch (lhs.ctx().dev_mask()) {
case cpu::kDevMask: {
- Engine::Get()->PushSync([lhs, rhs, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- ndarray::Eval<cpu, OP, reverse>(lhs.data(), rhs, &tmp, ctx);
- }, lhs.ctx(), const_vars, {ret.var()},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ Engine::Get()->PushSync(
+ [lhs, rhs, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ ndarray::Eval<cpu, OP, reverse>(lhs.data(), rhs, &tmp, ctx);
+ },
+ lhs.ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
break;
}
#if MXNET_USE_CUDA
case gpu::kDevMask: {
- Engine::Get()->PushSync([lhs, rhs, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- ndarray::Eval<gpu, OP, reverse>(lhs.data(), rhs, &tmp, ctx);
- // Wait GPU kernel to complete
- ctx.get_stream<gpu>()->Wait();
- }, lhs.ctx(), const_vars, {ret.var()},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ Engine::Get()->PushSync(
+ [lhs, rhs, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ ndarray::Eval<gpu, OP, reverse>(lhs.data(), rhs, &tmp, ctx);
+ // Wait GPU kernel to complete
+ ctx.get_stream<gpu>()->Wait();
+ },
+ lhs.ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
break;
}
#endif
- default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+ default:
+ LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
}
}
size_t num_aux_data(NDArrayStorageType stype) {
size_t num = 0;
switch (stype) {
- case kDefaultStorage: num = 0; break;
- case kCSRStorage: num = 2; break;
- case kRowSparseStorage: num = 1; break;
- default: LOG(FATAL) << "Unknown storage type" << stype; break;
+ case kDefaultStorage:
+ num = 0;
+ break;
+ case kCSRStorage:
+ num = 2;
+ break;
+ case kRowSparseStorage:
+ num = 1;
+ break;
+ default:
+ LOG(FATAL) << "Unknown storage type " << stype;
+ break;
}
return num;
}
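The mapping implemented by the switch above, spelled out; the call itself is a sketch:

  // kDefaultStorage   -> 0 aux arrays
  // kCSRStorage       -> 2 (indptr and column indices)
  // kRowSparseStorage -> 1 (row indices)
  size_t n = num_aux_data(kCSRStorage);  // n == 2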
// Make a copy of a CSR NDArray
-template<typename from_xpu, typename to_xpu>
+template <typename from_xpu, typename to_xpu>
inline void CopyFromToCsrImpl(const NDArray& from, const NDArray& to, RunContext ctx) {
using namespace mshadow;
CHECK_EQ(from.storage_type(), to.storage_type()) << "Copying with different storage type";
@@ -1070,19 +1162,16 @@ inline void CopyFromToCsrImpl(const NDArray& from, const NDArray& to, RunContext
to.CheckAndAllocAuxData(csr::kIndPtr, from.aux_shape(csr::kIndPtr));
to.CheckAndAllocAuxData(csr::kIdx, from.aux_shape(csr::kIdx));
to.CheckAndAllocData(from.aux_shape(csr::kIdx));
- TBlob val = to.data();
+ TBlob val = to.data();
TBlob indptr = to.aux_data(csr::kIndPtr);
- TBlob idx = to.aux_data(csr::kIdx);
- ndarray::Copy<from_xpu, to_xpu>(from.data(), &val,
- from.ctx(), to.ctx(), ctx);
- ndarray::Copy<from_xpu, to_xpu>(from.aux_data(csr::kIndPtr), &indptr,
- from.ctx(), to.ctx(), ctx);
- ndarray::Copy<from_xpu, to_xpu>(from.aux_data(csr::kIdx), &idx,
- from.ctx(), to.ctx(), ctx);
+ TBlob idx = to.aux_data(csr::kIdx);
+ ndarray::Copy<from_xpu, to_xpu>(from.data(), &val, from.ctx(), to.ctx(), ctx);
+ ndarray::Copy<from_xpu, to_xpu>(from.aux_data(csr::kIndPtr), &indptr, from.ctx(), to.ctx(), ctx);
+ ndarray::Copy<from_xpu, to_xpu>(from.aux_data(csr::kIdx), &idx, from.ctx(), to.ctx(), ctx);
}
// Make a copy of a row-sparse NDArray
-template<typename from_xpu, typename to_xpu>
+template <typename from_xpu, typename to_xpu>
inline void CopyFromToRspImpl(const NDArray& from, const NDArray& to, RunContext ctx) {
using namespace mshadow;
CHECK_EQ(from.storage_type(), to.storage_type()) << "Copying with different storage type";
@@ -1096,14 +1185,12 @@ inline void CopyFromToRspImpl(const NDArray& from, const NDArray& to, RunContext
to.CheckAndAlloc({aux_shape});
TBlob val = to.data();
TBlob idx = to.aux_data(rowsparse::kIdx);
- ndarray::Copy<from_xpu, to_xpu>(from.data(), &val,
- from.ctx(), to.ctx(), ctx);
- ndarray::Copy<from_xpu, to_xpu>(from.aux_data(rowsparse::kIdx), &idx,
- from.ctx(), to.ctx(), ctx);
+ ndarray::Copy<from_xpu, to_xpu>(from.data(), &val, from.ctx(), to.ctx(), ctx);
+ ndarray::Copy<from_xpu, to_xpu>(from.aux_data(rowsparse::kIdx), &idx, from.ctx(), to.ctx(), ctx);
}
// Make a copy of a dense NDArray
-template<typename from_xpu, typename to_xpu>
+template <typename from_xpu, typename to_xpu>
inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext ctx) {
#if MXNET_USE_MKLDNN == 1
// If neither is MKLDNN, we can copy data normally.
@@ -1112,23 +1199,19 @@ inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext
using namespace mshadow;
CHECK_EQ(from.storage_type(), to.storage_type()) << "Copying with different storage type";
TBlob tmp = to.data();
- ndarray::Copy<from_xpu, to_xpu>(from.data(), &tmp,
- from.ctx(), to.ctx(), ctx);
+ ndarray::Copy<from_xpu, to_xpu>(from.data(), &tmp, from.ctx(), to.ctx(), ctx);
#if MXNET_USE_MKLDNN == 1
- } else if (SupportMKLDNN(from.dtype(), from.shape())
- && SupportMKLDNN(to.dtype(), to.shape())
- && from.ctx().dev_mask() == cpu::kDevMask
- && to.ctx().dev_mask() == cpu::kDevMask) {
- // If we copy data directly, we need to make sure both NDArrays are supported
- // by MKLDNN.
+ } else if (SupportMKLDNN(from.dtype(), from.shape()) && SupportMKLDNN(to.dtype(), to.shape()) &&
+ from.ctx().dev_mask() == cpu::kDevMask && to.ctx().dev_mask() == cpu::kDevMask) {
+ // If we copy data directly, we need to make sure both NDArrays are
+ // supported by MKLDNN.
auto from_mem = from.GetMKLDNNData();
- auto to_mem = to.GetMKLDNNData();
+ auto to_mem = to.GetMKLDNNData();
if (from_mem->get_desc() == to_mem->get_desc()) {
- size_t size = std::min(from_mem->get_desc().get_size(),
- to_mem->get_desc().get_size());
+ size_t size = std::min(from_mem->get_desc().get_size(), to_mem->get_desc().get_size());
memcpy(to_mem->get_data_handle(), from_mem->get_data_handle(), size);
} else {
- const_cast<NDArray &>(to).CopyFrom(*from_mem);
+ const_cast<NDArray&>(to).CopyFrom(*from_mem);
MKLDNNStream::Get()->Submit();
}
} else {
@@ -1138,7 +1221,7 @@ inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext
NDArray tmp_from = from;
if (tmp_from.IsMKLDNNData()) {
// TODO(zhengda) tmp_from should be cached.
- tmp_from = NDArray(from.shape(), from.ctx(), false, from.dtype());
+ tmp_from = NDArray(from.shape(), from.ctx(), false, from.dtype());
auto tmp_mem = from.GetMKLDNNData();
tmp_from.CopyFrom(*tmp_mem);
MKLDNNStream::Get()->Submit();
@@ -1146,35 +1229,31 @@ inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext
CHECK(tmp_from.IsDefaultData());
CHECK(to.IsDefaultData());
TBlob tmp = to.data();
- ndarray::Copy<from_xpu, to_xpu>(tmp_from.data(), &tmp,
- from.ctx(), to.ctx(), ctx);
+ ndarray::Copy<from_xpu, to_xpu>(tmp_from.data(), &tmp, from.ctx(), to.ctx(), ctx);
}
#endif
}
// Make a copy of an NDArray based on storage type
-template<typename from_xpu, typename to_xpu>
-void CopyFromToImpl(const NDArray& from, const NDArray& to,
- RunContext rctx, const std::vector<Resource>& requested) {
+template <typename from_xpu, typename to_xpu>
+void CopyFromToImpl(const NDArray& from,
+ const NDArray& to,
+ RunContext rctx,
+ const std::vector<Resource>& requested) {
using namespace std;
using namespace mshadow;
// if storage type doesn't match, cast the storage first
const NDArrayStorageType from_stype = from.storage_type();
- const NDArrayStorageType to_stype = to.storage_type();
- CHECK(from_stype == kDefaultStorage
- || to_stype == kDefaultStorage
- || from_stype == to_stype)
- << "Copying ndarray of stype = " << from_stype
- << " to stype = " << to_stype << " is not supported";
+ const NDArrayStorageType to_stype = to.storage_type();
+ CHECK(from_stype == kDefaultStorage || to_stype == kDefaultStorage || from_stype == to_stype)
+ << "Copying ndarray of stype = " << from_stype << " to stype = " << to_stype
+ << " is not supported";
const Context from_ctx = from.ctx();
- const Context to_ctx = to.ctx();
- bool is_train = Imperative::Get()->is_training();
-
- OpContext opctx{Imperative::Get()->is_recording(),
- is_train,
- rctx,
- engine::CallbackOnComplete(),
- requested};
+ const Context to_ctx = to.ctx();
+ bool is_train = Imperative::Get()->is_training();
+
+ OpContext opctx{
+ Imperative::Get()->is_recording(), is_train, rctx, engine::CallbackOnComplete(), requested};
if (from_ctx == to_ctx && from_stype != to_stype) {
// same ctx, different stypes, use cast op directly without copying
common::CastStorageDispatch<from_xpu>(opctx, from, to);
@@ -1182,7 +1261,7 @@ void CopyFromToImpl(const NDArray& from, const NDArray& to,
NDArray casted_nd; // an intermediate result before copying from to to
if (from_stype == to_stype) {
casted_nd = from; // same stype, no need to cast from
- } else { // different stypes on different ctx needs an temporary casted_nd
+ } else { // different stypes on different ctx needs an temporary casted_nd
const mxnet::TShape& shape = from.shape();
if (to_stype == kDefaultStorage) {
casted_nd = NDArray(shape, from_ctx);
@@ -1213,21 +1292,21 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority, bool is_op
CHECK(from.shape() == to.shape())
<< "operands shape mismatch "
<< "from.shape = " << from.shape() << " to.shape=" << to.shape();
- CHECK(!mxnet::op::shape_is_none(from.shape()))
- << "source operands have undefined shape";
+ CHECK(!mxnet::op::shape_is_none(from.shape())) << "source operands have undefined shape";
// zero-size array, no need to copy
if (from.shape().Size() == 0U) {
return;
}
// important: callback must always capture by value
const Context from_ctx = from.ctx();
- const int a = from_ctx.dev_mask();
- const int b = to.ctx().dev_mask();
+ const int a = from_ctx.dev_mask();
+ const int b = to.ctx().dev_mask();
std::vector<Engine::VarHandle> const_vars;
- if (from.var() != to.var()) const_vars.push_back(from.var());
+ if (from.var() != to.var())
+ const_vars.push_back(from.var());
const NDArrayStorageType from_stype = from.storage_type();
- const NDArrayStorageType to_stype = to.storage_type();
+ const NDArrayStorageType to_stype = to.storage_type();
std::vector<Engine::VarHandle> mutable_vars(1, to.var());
@@ -1250,8 +1329,8 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority, bool is_op
// request temp resource if cast_storage performs on GPU
if (a == gpu::kDevMask) {
- Resource rsc = ResourceManager::Get()->Request(from_ctx,
- ResourceRequest(ResourceRequest::kTempSpace));
+ Resource rsc =
+ ResourceManager::Get()->Request(from_ctx, ResourceRequest(ResourceRequest::kTempSpace));
requested.push_back(rsc);
mutable_vars.push_back(rsc.var);
}
@@ -1259,38 +1338,57 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority, bool is_op
if (a == cpu::kDevMask && b == cpu::kDevMask) {
Engine::Get()->PushAsync(
- [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) {
- CopyFromToImpl<cpu, cpu>(from, to, ctx, requested);
- on_complete();
- }, from.ctx(), const_vars, mutable_vars,
- FnProperty::kNormal, priority, "CopyCPU2CPU");
+ [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) {
+ CopyFromToImpl<cpu, cpu>(from, to, ctx, requested);
+ on_complete();
+ },
+ from.ctx(),
+ const_vars,
+ mutable_vars,
+ FnProperty::kNormal,
+ priority,
+ "CopyCPU2CPU");
} else {
#if MXNET_USE_CUDA
if (a == cpu::kDevMask && b == gpu::kDevMask) {
Engine::Get()->PushAsync(
- [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) {
- CopyFromToImpl<cpu, gpu>(from, to, ctx, requested);
- ctx.get_stream<gpu>()->Wait();
- on_complete();
- }, to.ctx(), const_vars, mutable_vars,
- FnProperty::kCopyToGPU, priority, "CopyCPU2GPU");
+ [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) {
+ CopyFromToImpl<cpu, gpu>(from, to, ctx, requested);
+ ctx.get_stream<gpu>()->Wait();
+ on_complete();
+ },
+ to.ctx(),
+ const_vars,
+ mutable_vars,
+ FnProperty::kCopyToGPU,
+ priority,
+ "CopyCPU2GPU");
} else if (a == gpu::kDevMask && b == cpu::kDevMask) {
Engine::Get()->PushAsync(
- [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) {
- CopyFromToImpl<gpu, cpu>(from, to, ctx, requested);
- ctx.get_stream<gpu>()->Wait();
- on_complete();
- }, from.ctx(), const_vars, mutable_vars,
- FnProperty::kCopyFromGPU, priority, "CopyGPU2CPU");
+ [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) {
+ CopyFromToImpl<gpu, cpu>(from, to, ctx, requested);
+ ctx.get_stream<gpu>()->Wait();
+ on_complete();
+ },
+ from.ctx(),
+ const_vars,
+ mutable_vars,
+ FnProperty::kCopyFromGPU,
+ priority,
+ "CopyGPU2CPU");
} else if (a == gpu::kDevMask && b == gpu::kDevMask) {
Engine::Get()->PushAsync(
- [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) {
- CopyFromToImpl<gpu, gpu>(from, to, ctx, requested);
- ctx.get_stream<gpu>()->Wait();
- on_complete();
- }, from.ctx(), const_vars, mutable_vars,
- from.dtype() != to.dtype() ? FnProperty::kNormal : FnProperty::kCopyFromGPU,
- priority, is_opr ? "_copyto_GPU2GPU" : "CopyGPU2GPU");
+ [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) {
+ CopyFromToImpl<gpu, gpu>(from, to, ctx, requested);
+ ctx.get_stream<gpu>()->Wait();
+ on_complete();
+ },
+ from.ctx(),
+ const_vars,
+ mutable_vars,
+ from.dtype() != to.dtype() ? FnProperty::kNormal : FnProperty::kCopyFromGPU,
+ priority,
+ is_opr ? "_copyto_GPU2GPU" : "CopyGPU2GPU");
} else {
LOG(FATAL) << "unknown device mask";
}
@@ -1300,26 +1398,22 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority, bool is_op
}
}
-
-void CopyFromTo(const NDArray& from, const NDArray *to, int priority) {
+void CopyFromTo(const NDArray& from, const NDArray* to, int priority) {
CopyFromTo(from, *to, priority);
}
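A sketch of the asynchronous copy entry point above; shapes, contexts and the priority are illustrative:

  NDArray src(mxnet::TShape(mshadow::Shape1(8)), mxnet::Context::CPU());
  NDArray dst(mxnet::TShape(mshadow::Shape1(8)), mxnet::Context::CPU());
  CopyFromTo(src, dst, /*priority=*/0);  // pushed to the engine, returns immediately
  dst.WaitToRead();                      // block until the copy has finished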
-void ElementwiseSum(const std::vector<NDArray> &source, NDArray *out, int priority) {
+void ElementwiseSum(const std::vector<NDArray>& source, NDArray* out, int priority) {
std::vector<Engine::VarHandle> const_vars;
const_vars.reserve(source.size());
for (const auto& source_array : source) {
if (source_array.var() != out->var()) {
const_vars.push_back(source_array.var());
}
- CHECK_EQ(source_array.shape() , out->shape())
- << "operands shape mismatch";
+ CHECK_EQ(source_array.shape(), out->shape()) << "operands shape mismatch";
if (out->ctx().dev_mask() == Context::kCPU) {
- CHECK_EQ(source_array.ctx().dev_mask(), Context::kCPU)
- << "operands context mismatch";
+ CHECK_EQ(source_array.ctx().dev_mask(), Context::kCPU) << "operands context mismatch";
} else {
- CHECK_EQ(source_array.ctx(), out->ctx())
- << "operands context mismatch";
+ CHECK_EQ(source_array.ctx(), out->ctx()) << "operands context mismatch";
}
}
// important: callback must always capture by value
@@ -1330,67 +1424,84 @@ void ElementwiseSum(const std::vector<NDArray> &source, NDArray *out, int priori
if (stype == kDefaultStorage) {
switch (out->ctx().dev_mask()) {
case cpu::kDevMask: {
- Engine::Get()->PushSync([source, ret](RunContext ctx) {
- std::vector<TBlob> source_tblob(source.size());
- for (size_t i = 0; i < source.size(); ++i) {
- source_tblob[i] = source[i].data();
- }
- TBlob tmp = ret.data();
- ndarray::ElementwiseSum<cpu>(source_tblob, &tmp, ctx);
- }, out->ctx(), const_vars, {ret.var()},
- FnProperty::kNormal, priority, PROFILER_MESSAGE_FUNCNAME);
+ Engine::Get()->PushSync(
+ [source, ret](RunContext ctx) {
+ std::vector<TBlob> source_tblob(source.size());
+ for (size_t i = 0; i < source.size(); ++i) {
+ source_tblob[i] = source[i].data();
+ }
+ TBlob tmp = ret.data();
+ ndarray::ElementwiseSum<cpu>(source_tblob, &tmp, ctx);
+ },
+ out->ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ priority,
+ PROFILER_MESSAGE_FUNCNAME);
break;
}
#if MXNET_USE_CUDA
case gpu::kDevMask: {
- Engine::Get()->PushSync([source, ret](RunContext ctx) {
- std::vector<TBlob> source_tblob(source.size());
- for (size_t i = 0; i < source.size(); ++i) {
- source_tblob[i] = source[i].data();
- }
- TBlob tmp = ret.data();
- ndarray::ElementwiseSum<gpu>(source_tblob, &tmp, ctx);
- // Wait GPU kernel to complete
- ctx.get_stream<gpu>()->Wait();
- }, out->ctx(), const_vars, {ret.var()},
- FnProperty::kNormal, priority, "DenseElementwiseSum");
+ Engine::Get()->PushSync(
+ [source, ret](RunContext ctx) {
+ std::vector<TBlob> source_tblob(source.size());
+ for (size_t i = 0; i < source.size(); ++i) {
+ source_tblob[i] = source[i].data();
+ }
+ TBlob tmp = ret.data();
+ ndarray::ElementwiseSum<gpu>(source_tblob, &tmp, ctx);
+ // Wait GPU kernel to complete
+ ctx.get_stream<gpu>()->Wait();
+ },
+ out->ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ priority,
+ "DenseElementwiseSum");
break;
}
#endif
- default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+ default:
+ LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
}
} else if (stype == kRowSparseStorage) {
- Resource rsc = ResourceManager::Get()->Request(ret.ctx(),
- ResourceRequest(ResourceRequest::kTempSpace));
+ Resource rsc =
+ ResourceManager::Get()->Request(ret.ctx(), ResourceRequest(ResourceRequest::kTempSpace));
Engine::Get()->PushSync(
- [source, ret, rsc](RunContext rctx) {
- NDArray result = ret;
- switch (ret.ctx().dev_mask()) {
- case cpu::kDevMask: {
- mxnet::ndarray::ElementwiseSum(rctx.get_stream<cpu>(), rsc, source, &result);
- break;
- }
+ [source, ret, rsc](RunContext rctx) {
+ NDArray result = ret;
+ switch (ret.ctx().dev_mask()) {
+ case cpu::kDevMask: {
+ mxnet::ndarray::ElementwiseSum(rctx.get_stream<cpu>(), rsc, source, &result);
+ break;
+ }
#if MXNET_USE_CUDA
- case gpu::kDevMask: {
- mxnet::ndarray::ElementwiseSum(rctx.get_stream<gpu>(), rsc, source, &result);
- // wait for GPU operations to complete
- rctx.get_stream<gpu>()->Wait();
- break;
- }
+ case gpu::kDevMask: {
+ mxnet::ndarray::ElementwiseSum(rctx.get_stream<gpu>(), rsc, source, &result);
+ // wait for GPU operations to complete
+ rctx.get_stream<gpu>()->Wait();
+ break;
+ }
#endif
- default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
- }
- }, ret.ctx(), const_vars, {ret.var(), rsc.var},
- FnProperty::kNormal, priority, "RowSparseElementwiseSum");
+ default:
+ LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+ }
+ },
+ ret.ctx(),
+ const_vars,
+ {ret.var(), rsc.var},
+ FnProperty::kNormal,
+ priority,
+ "RowSparseElementwiseSum");
} else {
LOG(FATAL) << "Not implemented for storage_type " << common::stype_string(stype);
}
}
-void ClipOp(const NDArray &src,
- const real_t &a_min, const real_t &a_max,
- NDArray *out) {
+void ClipOp(const NDArray& src, const real_t& a_min, const real_t& a_max, NDArray* out) {
if (out->is_none()) {
*out = NDArray(src.shape(), src.ctx(), true, src.dtype());
} else {
@@ -1399,99 +1510,123 @@ void ClipOp(const NDArray &src,
}
NDArray ret = *out;
std::vector<Engine::VarHandle> const_vars;
- if (src.var() != ret.var()) const_vars.push_back(src.var());
+ if (src.var() != ret.var())
+ const_vars.push_back(src.var());
switch (src.ctx().dev_mask()) {
case cpu::kDevMask: {
- Engine::Get()->PushSync([src, a_min, a_max, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- ndarray::EvalClip<cpu>(src.data(), a_min, a_max, &tmp, ctx);
- }, src.ctx(), const_vars, {ret.var()},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ Engine::Get()->PushSync(
+ [src, a_min, a_max, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ ndarray::EvalClip<cpu>(src.data(), a_min, a_max, &tmp, ctx);
+ },
+ src.ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
break;
}
- #if MXNET_USE_CUDA
+#if MXNET_USE_CUDA
case gpu::kDevMask: {
- Engine::Get()->PushSync([src, a_min, a_max, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- ndarray::EvalClip<gpu>(src.data(), a_min, a_max, &tmp, ctx);
- }, src.ctx(), const_vars, {ret.var()},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ Engine::Get()->PushSync(
+ [src, a_min, a_max, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ ndarray::EvalClip<gpu>(src.data(), a_min, a_max, &tmp, ctx);
+ },
+ src.ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
break;
}
- #endif
- default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+#endif
+ default:
+ LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
}
}
-template<typename Distribution>
-void SampleOP(const real_t &a,
- const real_t &b,
- NDArray *out) {
+template <typename Distribution>
+void SampleOP(const real_t& a, const real_t& b, NDArray* out) {
CHECK(!out->is_none());
- Resource resource = ResourceManager::Get()->Request(
- out->ctx(), ResourceRequest::kRandom);
+ Resource resource = ResourceManager::Get()->Request(out->ctx(), ResourceRequest::kRandom);
// important: callback must always capture by value
NDArray ret = *out;
// redirect everything to mshadow operations
switch (out->ctx().dev_mask()) {
case cpu::kDevMask: {
- Engine::Get()->PushSync([a, b, resource, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- ndarray::EvalRandom<cpu, Distribution>(a, b, resource, &tmp, ctx);
- }, out->ctx(), {}, {ret.var(), resource.var},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ Engine::Get()->PushSync(
+ [a, b, resource, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ ndarray::EvalRandom<cpu, Distribution>(a, b, resource, &tmp, ctx);
+ },
+ out->ctx(),
+ {},
+ {ret.var(), resource.var},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
break;
}
#if MXNET_USE_CUDA
case gpu::kDevMask: {
- Engine::Get()->PushSync([a, b, resource, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- ndarray::EvalRandom<gpu, Distribution>(a, b, resource, &tmp, ctx);
- // Wait GPU kernel to complete
- ctx.get_stream<gpu>()->Wait();
- }, out->ctx(), {}, {ret.var(), resource.var},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ Engine::Get()->PushSync(
+ [a, b, resource, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ ndarray::EvalRandom<gpu, Distribution>(a, b, resource, &tmp, ctx);
+ // Wait GPU kernel to complete
+ ctx.get_stream<gpu>()->Wait();
+ },
+ out->ctx(),
+ {},
+ {ret.var(), resource.var},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
break;
}
#endif
- default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+ default:
+ LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
}
}
-void SampleUniform(real_t begin, real_t end, NDArray *out) {
+void SampleUniform(real_t begin, real_t end, NDArray* out) {
SampleOP<ndarray::UniformDistribution>(begin, end, out);
}
-void SampleGaussian(real_t mu, real_t sigma, NDArray *out) {
+void SampleGaussian(real_t mu, real_t sigma, NDArray* out) {
SampleOP<ndarray::GaussianDistribution>(mu, sigma, out);
}
-void SampleExponential(real_t lambda, NDArray *out) {
- if ( out->ctx().dev_mask() != cpu::kDevMask ) {
- LOG(FATAL) <<"exponential sampling only valid on cpu";
+void SampleExponential(real_t lambda, NDArray* out) {
+ if (out->ctx().dev_mask() != cpu::kDevMask) {
+ LOG(FATAL) << "exponential sampling only valid on cpu";
}
real_t dummy;
SampleOP<ndarray::ExponentialDistribution>(lambda, dummy, out);
}
-void SamplePoisson(real_t lambda, NDArray *out) {
- if ( out->ctx().dev_mask() != cpu::kDevMask ) {
- LOG(FATAL) <<"poisson sampling only valid on cpu";
+void SamplePoisson(real_t lambda, NDArray* out) {
+ if (out->ctx().dev_mask() != cpu::kDevMask) {
+ LOG(FATAL) << "poisson sampling only valid on cpu";
}
real_t dummy;
SampleOP<ndarray::PoissonDistribution>(lambda, dummy, out);
}
-void SampleNegBinomial(int32_t k, real_t p, NDArray *out) {
- if ( out->ctx().dev_mask() != cpu::kDevMask ) {
- LOG(FATAL) <<"negative binomial sampling only valid on cpu";
+void SampleNegBinomial(int32_t k, real_t p, NDArray* out) {
+ if (out->ctx().dev_mask() != cpu::kDevMask) {
+ LOG(FATAL) << "negative binomial sampling only valid on cpu";
}
SampleOP<ndarray::NegBinomialDistribution>(k, p, out);
}
-void SampleGenNegBinomial(real_t mu, real_t alpha, NDArray *out) {
- if ( out->ctx().dev_mask() != cpu::kDevMask ) {
- LOG(FATAL) <<"negative binomial sampling only valid on cpu";
+void SampleGenNegBinomial(real_t mu, real_t alpha, NDArray* out) {
+ if (out->ctx().dev_mask() != cpu::kDevMask) {
+ LOG(FATAL) << "negative binomial sampling only valid on cpu";
}
SampleOP<ndarray::GenNegBinomialDistribution>(mu, alpha, out);
}
@@ -1504,92 +1639,88 @@ void RandomSeed(Context ctx, uint32_t seed) {
ResourceManager::Get()->SeedRandom(ctx, seed);
}
-template<typename OP>
-inline NDArray BinaryOpRet(const NDArray &lhs,
- const NDArray &rhs) {
+template <typename OP>
+inline NDArray BinaryOpRet(const NDArray& lhs, const NDArray& rhs) {
NDArray ret;
BinaryOpKernel<OP>(lhs, rhs, &ret);
return ret;
}
-template<typename OP, bool reverse>
-inline NDArray ScalarOpRet(const NDArray &lhs,
- const real_t &rhs) {
+template <typename OP, bool reverse>
+inline NDArray ScalarOpRet(const NDArray& lhs, const real_t& rhs) {
NDArray ret;
ScalarOp<OP, reverse>(lhs, rhs, &ret);
return ret;
}
-template<typename OP>
-inline NDArray &BinaryOpApply(NDArray *dst,
- const NDArray &src) {
+template <typename OP>
+inline NDArray& BinaryOpApply(NDArray* dst, const NDArray& src) {
BinaryOpKernel<OP>(*dst, src, dst);
return *dst;
}
-template<typename OP>
-inline NDArray &ScalarOpApply(NDArray *dst,
- const real_t &src) {
+template <typename OP>
+inline NDArray& ScalarOpApply(NDArray* dst, const real_t& src) {
ScalarOp<OP, false>(*dst, src, dst);
return *dst;
}
// Binary
-NDArray operator+(const NDArray &lhs, const NDArray &rhs) {
+NDArray operator+(const NDArray& lhs, const NDArray& rhs) {
return BinaryOpRet<ndarray::Plus>(lhs, rhs);
}
-NDArray operator-(const NDArray &lhs, const NDArray &rhs) {
+NDArray operator-(const NDArray& lhs, const NDArray& rhs) {
return BinaryOpRet<ndarray::Minus>(lhs, rhs);
}
-NDArray operator*(const NDArray &lhs, const NDArray &rhs) {
+NDArray operator*(const NDArray& lhs, const NDArray& rhs) {
return BinaryOpRet<ndarray::Mul>(lhs, rhs);
}
-NDArray operator/(const NDArray &lhs, const NDArray &rhs) {
+NDArray operator/(const NDArray& lhs, const NDArray& rhs) {
return BinaryOpRet<ndarray::Div>(lhs, rhs);
}
// Scalar
-NDArray operator+(const NDArray &lhs, const real_t &rhs) {
+NDArray operator+(const NDArray& lhs, const real_t& rhs) {
return ScalarOpRet<ndarray::Plus, false>(lhs, rhs);
}
-NDArray operator-(const NDArray &lhs, const real_t &rhs) {
+NDArray operator-(const NDArray& lhs, const real_t& rhs) {
return ScalarOpRet<ndarray::Minus, false>(lhs, rhs);
}
-NDArray operator*(const NDArray &lhs, const real_t &rhs) {
+NDArray operator*(const NDArray& lhs, const real_t& rhs) {
return ScalarOpRet<ndarray::Mul, false>(lhs, rhs);
}
-NDArray operator/(const NDArray &lhs, const real_t &rhs) {
+NDArray operator/(const NDArray& lhs, const real_t& rhs) {
return ScalarOpRet<ndarray::Div, false>(lhs, rhs);
}
// Binary
-NDArray &NDArray::operator=(real_t scalar) {
+NDArray& NDArray::operator=(real_t scalar) {
SetValueOp(scalar, this);
return *this;
}
-NDArray &NDArray::operator+=(const NDArray &src) {
+NDArray& NDArray::operator+=(const NDArray& src) {
return BinaryOpApply<ndarray::Plus>(this, src);
}
-NDArray &NDArray::operator-=(const NDArray &src) {
+NDArray& NDArray::operator-=(const NDArray& src) {
return BinaryOpApply<ndarray::Minus>(this, src);
}
-NDArray &NDArray::operator*=(const NDArray &src) {
+NDArray& NDArray::operator*=(const NDArray& src) {
return BinaryOpApply<ndarray::Mul>(this, src);
}
-NDArray &NDArray::operator/=(const NDArray &src) {
+NDArray& NDArray::operator/=(const NDArray& src) {
return BinaryOpApply<ndarray::Div>(this, src);
}
// Scalar
-NDArray &NDArray::operator+=(const real_t &src) {
+NDArray& NDArray::operator+=(const real_t& src) {
return ScalarOpApply<ndarray::Plus>(this, src);
}
-NDArray &NDArray::operator-=(const real_t &src) {
+NDArray& NDArray::operator-=(const real_t& src) {
return ScalarOpApply<ndarray::Minus>(this, src);
}
-NDArray &NDArray::operator*=(const real_t &src) {
+NDArray& NDArray::operator*=(const real_t& src) {
return ScalarOpApply<ndarray::Mul>(this, src);
}
-NDArray &NDArray::operator/=(const real_t &src) {
+NDArray& NDArray::operator/=(const real_t& src) {
return ScalarOpApply<ndarray::Div>(this, src);
}
@@ -1603,10 +1734,11 @@ static const uint32_t NDARRAY_V2_MAGIC = 0xF993fac9;
// The ndarray must be saved and loaded within np shape semantics.
static const uint32_t NDARRAY_V3_MAGIC = 0xF993faca;
-void NDArray::Save(dmlc::Stream *strm) const {
+void NDArray::Save(dmlc::Stream* strm) const {
if (Imperative::Get()->is_np_shape()) {
CHECK_EQ(storage_type(), kDefaultStorage)
- << "only allow serializing ndarray of default storage type in np shape semantics";
+ << "only allow serializing ndarray of default storage type in np shape "
+ "semantics";
strm->Write(NDARRAY_V3_MAGIC);
} else {
// write magic number to mark this version
@@ -1626,7 +1758,8 @@ void NDArray::Save(dmlc::Stream *strm) const {
// save shape
shape_.Save(strm);
- if (is_none()) return;
+ if (is_none())
+ return;
// save context
Context ctx = this->ctx();
@@ -1679,66 +1812,82 @@ void NDArray::Save(dmlc::Stream *strm) const {
}
}
-bool LegacyTShapeLoad(dmlc::Stream *strm, mxnet::TShape *shape, const uint32_t magic) {
+bool LegacyTShapeLoad(dmlc::Stream* strm, mxnet::TShape* shape, const uint32_t magic) {
switch (magic) {
case NDARRAY_V1_MAGIC:
return shape->Load(strm);
default:
// meet legacy mxnet::TShape, magic is ndim here
uint32_t ndim = magic;
- *shape = mxnet::TShape(ndim, -1);
+ *shape = mxnet::TShape(ndim, -1);
std::vector<uint32_t> buffer(ndim);
size_t nread = ndim * sizeof(uint32_t);
- if (strm->Read(buffer.data(), nread) != nread) return false;
+ if (strm->Read(buffer.data(), nread) != nread)
+ return false;
nnvm::ShapeTypeCast(buffer.begin(), buffer.end(), shape->begin());
return true;
}
}
-bool NDArray::LegacyLoad(dmlc::Stream *strm, const uint32_t magic) {
+bool NDArray::LegacyLoad(dmlc::Stream* strm, const uint32_t magic) {
// load shape
mxnet::TShape shape;
- if (!LegacyTShapeLoad(strm, &shape, magic)) return false;
+ if (!LegacyTShapeLoad(strm, &shape, magic))
+ return false;
if (mxnet::op::shape_is_none(shape)) {
- *this = NDArray(); return true;
+ *this = NDArray();
+ return true;
}
// load context
Context ctx;
- if (!ctx.Load(strm)) return false;
+ if (!ctx.Load(strm))
+ return false;
// load type flag
int32_t type_flag;
- if (strm->Read(&type_flag, sizeof(type_flag)) != sizeof(type_flag)) return false;
+ if (strm->Read(&type_flag, sizeof(type_flag)) != sizeof(type_flag))
+ return false;
// load data into CPU
NDArray temp(shape, Context::CPU(), false, type_flag);
- TBlob load_data = temp.data();
+ TBlob load_data = temp.data();
size_t type_size = mshadow::mshadow_sizeof(type_flag);
- size_t nread = type_size * shape.Size();
+ size_t nread = type_size * shape.Size();
- if (strm->Read(load_data.dptr_, nread) != nread) return false;
+ if (strm->Read(load_data.dptr_, nread) != nread)
+ return false;
if (ctx.dev_mask() == cpu::kDevMask) {
- *this = std::move(temp); return true;
+ *this = std::move(temp);
+ return true;
} else {
#if MXNET_USE_CUDA
- *this = temp.Copy(ctx); return true;
+ *this = temp.Copy(ctx);
+ return true;
#else
- *this = std::move(temp); return true;
+ *this = std::move(temp);
+ return true;
#endif
}
}
-bool NDArray::Load(dmlc::Stream *strm) {
+bool NDArray::Load(dmlc::Stream* strm) {
uint32_t magic;
- if (strm->Read(&magic, sizeof(uint32_t)) != sizeof(uint32_t)) return false;
+ if (strm->Read(&magic, sizeof(uint32_t)) != sizeof(uint32_t))
+ return false;
if (magic == NDARRAY_V3_MAGIC) {
CHECK(Imperative::Get()->is_np_shape())
- << "ndarray was saved in np shape semantics, must be loaded in the same semantics."
- " Please turn on np shape semantics in Python using `with np_shape(True)`"
- " or decorator `use_np_shape` to scope the code of loading the ndarray.";
+ << "ndarray was saved in np shape semantics, must be loaded in the "
+ "same semantics."
+ " Please turn on np shape semantics in Python using `with "
+ "np_shape(True)`"
+ " or decorator `use_np_shape` to scope the code of loading the "
+ "ndarray.";
} else {
- // when the flag is global on, skip the check since it would be always global on.
+ // when the flag is global on, skip the check since it would be always
+ // global on.
CHECK(Imperative::Get()->is_np_shape() == GlobalOn || !Imperative::Get()->is_np_shape())
- << "ndarray was not saved in np shape semantics, but being loaded in np shape semantics."
- " Please turn off np shape semantics in Python using `with np_shape(False)`"
+ << "ndarray was not saved in np shape semantics, but being loaded in "
+ "np shape semantics."
+ " Please turn off np shape semantics in Python using `with "
+ "np_shape(False)`"
" to scope the code of loading the ndarray.";
}
if (magic != NDARRAY_V2_MAGIC && magic != NDARRAY_V3_MAGIC) {
@@ -1747,38 +1896,45 @@ bool NDArray::Load(dmlc::Stream *strm) {
// load storage type
int32_t stype;
- if (strm->Read(&stype, sizeof(stype)) != sizeof(stype)) return false;
+ if (strm->Read(&stype, sizeof(stype)) != sizeof(stype))
+ return false;
if (Imperative::Get()->is_np_shape()) {
CHECK_EQ(stype, kDefaultStorage)
- << "only allow deserializing ndarray of default storage type in np shape semantics";
+ << "only allow deserializing ndarray of default storage type in np "
+ "shape semantics";
}
const int32_t nad = num_aux_data(static_cast<NDArrayStorageType>(stype));
// load storage shape
mxnet::TShape sshape;
if (nad > 0) {
- if (!sshape.Load(strm)) return false;
+ if (!sshape.Load(strm))
+ return false;
}
// load shape
mxnet::TShape shape;
- if (!shape.Load(strm)) return false;
+ if (!shape.Load(strm))
+ return false;
if (Imperative::Get()->is_np_shape()) {
if (!shape_is_known(shape)) {
*this = NDArray();
return true;
}
} else if (shape.ndim() == 0) {
- *this = NDArray(); return true;
+ *this = NDArray();
+ return true;
}
// load context
Context ctx;
- if (!ctx.Load(strm)) return false;
+ if (!ctx.Load(strm))
+ return false;
// load type flag
int32_t type_flag;
- if (strm->Read(&type_flag, sizeof(type_flag)) != sizeof(type_flag)) return false;
+ if (strm->Read(&type_flag, sizeof(type_flag)) != sizeof(type_flag))
+ return false;
// load aux_types and aux_shapes
std::vector<int32_t> aux_types;
@@ -1788,9 +1944,11 @@ bool NDArray::Load(dmlc::Stream *strm) {
aux_shapes.resize(nad);
for (int i = 0; i < nad; ++i) {
// load aux_type(i)
- if (strm->Read(&aux_types[i], sizeof(aux_types[i])) != sizeof(aux_types[i])) return false;
+ if (strm->Read(&aux_types[i], sizeof(aux_types[i])) != sizeof(aux_types[i]))
+ return false;
// load aux_shapes(i)
- if (!aux_shapes[i].Load(strm)) return false;
+ if (!aux_shapes[i].Load(strm))
+ return false;
}
}
@@ -1799,39 +1957,50 @@ bool NDArray::Load(dmlc::Stream *strm) {
if (0 == nad) {
temp = NDArray(shape, Context::CPU(), false, type_flag);
} else {
- temp = NDArray(static_cast<NDArrayStorageType>(stype), shape,
- Context::CPU(), false, type_flag,
- aux_types, aux_shapes, sshape);
+ temp = NDArray(static_cast<NDArrayStorageType>(stype),
+ shape,
+ Context::CPU(),
+ false,
+ type_flag,
+ aux_types,
+ aux_shapes,
+ sshape);
}
// load data
- TBlob load_data = temp.data();
+ TBlob load_data = temp.data();
size_t type_size = mshadow::mshadow_sizeof(type_flag);
- size_t nread = type_size * load_data.Size();
- if (strm->Read(load_data.dptr_, nread) != nread) return false;
+ size_t nread = type_size * load_data.Size();
+ if (strm->Read(load_data.dptr_, nread) != nread)
+ return false;
// load aux_data
if (nad > 0) {
for (int i = 0; i < nad; ++i) {
load_data = temp.aux_data(i);
type_size = mshadow::mshadow_sizeof(load_data.type_flag_);
- nread = type_size * load_data.Size();
- if (strm->Read(load_data.dptr_, nread) != nread) return false;
+ nread = type_size * load_data.Size();
+ if (strm->Read(load_data.dptr_, nread) != nread)
+ return false;
}
}
if (ctx.dev_mask() == cpu::kDevMask) {
- *this = std::move(temp); return true;
+ *this = std::move(temp);
+ return true;
} else {
#if MXNET_USE_CUDA
int device_count = -1;
cudaGetDeviceCount(&device_count);
if (device_count > 0) {
- *this = temp.Copy(ctx); return true;
+ *this = temp.Copy(ctx);
+ return true;
} else {
- *this = std::move(temp); return true;
+ *this = std::move(temp);
+ return true;
}
#else
- *this = std::move(temp); return true;
+ *this = std::move(temp);
+ return true;
#endif
}
}
@@ -1848,22 +2017,14 @@ void NDArray::Save(dmlc::Stream* fo,
fo->Write(names);
}
-void NDArray::Load(dmlc::Stream* fi,
- std::vector<NDArray>* data,
- std::vector<std::string>* keys) {
+void NDArray::Load(dmlc::Stream* fi, std::vector<NDArray>* data, std::vector<std::string>* keys) {
uint64_t header, reserved;
- CHECK(fi->Read(&header))
- << "Invalid NDArray file format";
- CHECK(fi->Read(&reserved))
- << "Invalid NDArray file format";
- CHECK(header == kMXAPINDArrayListMagic)
- << "Invalid NDArray file format";
- CHECK(fi->Read(data))
- << "Invalid NDArray file format";
- CHECK(fi->Read(keys))
- << "Invalid NDArray file format";
- CHECK(keys->size() == 0 || keys->size() == data->size())
- << "Invalid NDArray file format";
+ CHECK(fi->Read(&header)) << "Invalid NDArray file format";
+ CHECK(fi->Read(&reserved)) << "Invalid NDArray file format";
+ CHECK(header == kMXAPINDArrayListMagic) << "Invalid NDArray file format";
+ CHECK(fi->Read(data)) << "Invalid NDArray file format";
+ CHECK(fi->Read(keys)) << "Invalid NDArray file format";
+ CHECK(keys->size() == 0 || keys->size() == data->size()) << "Invalid NDArray file format";
}
NDArray NDArray::Copy(Context ctx) const {
@@ -1871,30 +2032,37 @@ NDArray NDArray::Copy(Context ctx) const {
if (kDefaultStorage == storage_type()) {
ret = NDArray(shape(), ctx, true, dtype_);
} else if (kUndefinedStorage != storage_type()) {
- ret = NDArray(storage_type(), shape(), ctx, true, dtype_,
- ptr_->aux_types, ptr_->aux_shapes, storage_shape());
+ ret = NDArray(storage_type(),
+ shape(),
+ ctx,
+ true,
+ dtype_,
+ ptr_->aux_types,
+ ptr_->aux_shapes,
+ storage_shape());
} else {
- LOG(FATAL) << "NDArray::Copy cannot copy undefined storage-type ndarray to ctx.dev_type="
+ LOG(FATAL) << "NDArray::Copy cannot copy undefined storage-type ndarray to "
+ "ctx.dev_type="
<< ctx.dev_type << ", ctx.dev_id=" << ctx.dev_id;
}
CopyFromTo(*this, ret);
return ret;
}
-void NDArray::SyncCopyFromCPU(const void *data, size_t size) const {
+void NDArray::SyncCopyFromCPU(const void* data, size_t size) const {
mxnet::TShape dshape = this->shape();
if (!features::is_enabled(features::INT64_TENSOR_SIZE)) {
- CHECK_LT(size, (int64_t{1} << 31) - 1) <<
- "[SyncCopyFromCPU] Size of tensor you are trying to allocate is larger than "
- "2^31 elements. Please build with flag USE_INT64_TENSOR_SIZE=1";
+ CHECK_LT(size, (int64_t{1} << 31) - 1)
+ << "[SyncCopyFromCPU] Size of tensor you are trying to allocate is "
+ "larger than "
+ "2^31 elements. Please build with flag USE_INT64_TENSOR_SIZE=1";
}
- CHECK_EQ(dshape.Size(), size)
- << "Memory size do not match";
+ CHECK_EQ(dshape.Size(), size) << "Memory size do not match";
// zero-size array, no need to copy
if (size == 0U) {
return;
}
- TBlob src((void*)data, dshape, cpu::kDevMask, this->dtype_, 0); // NOLINT(*)
+ TBlob src((void*)data, dshape, cpu::kDevMask, this->dtype_, 0); // NOLINT(*)
if (this->ctx().dev_mask() == cpu::kDevMask) {
this->WaitToWrite();
@@ -1904,15 +2072,19 @@ void NDArray::SyncCopyFromCPU(const void *data, size_t size) const {
} else {
#if MXNET_USE_CUDA
Engine::Get()->PushAsync(
- [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
- TBlob dst = this->data();
- ndarray::Copy<cpu, gpu>(src, &dst,
- Context::CPU(), this->ctx(), rctx);
- // Wait GPU kernel to complete
- rctx.get_stream<gpu>()->Wait();
- on_complete();
- }, this->ctx(), {}, {this->var()},
- FnProperty::kCopyToGPU, 0, "SyncCopyCPU2GPU");
+ [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
+ TBlob dst = this->data();
+ ndarray::Copy<cpu, gpu>(src, &dst, Context::CPU(), this->ctx(), rctx);
+ // Wait GPU kernel to complete
+ rctx.get_stream<gpu>()->Wait();
+ on_complete();
+ },
+ this->ctx(),
+ {},
+ {this->var()},
+ FnProperty::kCopyToGPU,
+ 0,
+ "SyncCopyCPU2GPU");
this->WaitToRead();
#else
LOG(FATAL) << "GPU is not enabled";
@@ -1958,51 +2130,71 @@ void NDArray::SyncCopyFromNDArray(const NDArray& src, int i, int j) {
this->CheckAndAllocAuxData(j, src_shape);
}
}
- TBlob dst_data = (j >= 0? this->aux_data(j) : this->data());
+ TBlob dst_data = (j >= 0 ? this->aux_data(j) : this->data());
CHECK_LE(src_shape.Size(), dst_data.shape_.Size());
return dst_data;
};
if (src_dev_mask == cpu::kDevMask && dst_dev_mask == cpu::kDevMask) {
- Engine::Get()->PushSync([&](RunContext rctx) {
- const TBlob src_data = (i >= 0? src.aux_data(i) : src.data());
- TBlob dst_data = get_dst_data(src_data.shape_);
- ndarray::Copy<cpu, cpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx);
- }, this->ctx(), const_vars, {this->var()},
- FnProperty::kNormal, 0, "SyncCopyFromNDArrayCPU2CPU");
+ Engine::Get()->PushSync(
+ [&](RunContext rctx) {
+ const TBlob src_data = (i >= 0 ? src.aux_data(i) : src.data());
+ TBlob dst_data = get_dst_data(src_data.shape_);
+ ndarray::Copy<cpu, cpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx);
+ },
+ this->ctx(),
+ const_vars,
+ {this->var()},
+ FnProperty::kNormal,
+ 0,
+ "SyncCopyFromNDArrayCPU2CPU");
} else {
#if MXNET_USE_CUDA
if (src_dev_mask == cpu::kDevMask && dst_dev_mask == gpu::kDevMask) {
Engine::Get()->PushAsync(
- [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
- const TBlob src_data = (i >= 0 ? src.aux_data(i) : src.data());
- TBlob dst_data = get_dst_data(src_data.shape_);
- ndarray::Copy<cpu, gpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx);
- rctx.get_stream<gpu>()->Wait();
- on_complete();
- }, this->ctx(), const_vars, {this->var()},
- FnProperty::kCopyToGPU, 0, "SyncCopyFromNDArrayCPU2GPU");
+ [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
+ const TBlob src_data = (i >= 0 ? src.aux_data(i) : src.data());
+ TBlob dst_data = get_dst_data(src_data.shape_);
+ ndarray::Copy<cpu, gpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx);
+ rctx.get_stream<gpu>()->Wait();
+ on_complete();
+ },
+ this->ctx(),
+ const_vars,
+ {this->var()},
+ FnProperty::kCopyToGPU,
+ 0,
+ "SyncCopyFromNDArrayCPU2GPU");
} else if (src_dev_mask == gpu::kDevMask && dst_dev_mask == cpu::kDevMask) {
Engine::Get()->PushAsync(
- [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
- const TBlob src_data = (i >= 0 ? src.aux_data(i) : src.data());
- TBlob dst_data = get_dst_data(src_data.shape_);
- ndarray::Copy<gpu, cpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx);
- rctx.get_stream<gpu>()->Wait();
- on_complete();
- }, src.ctx(), const_vars, {this->var()},
- FnProperty::kCopyFromGPU, 0, "SyncCopyFromNDArrayGPU2CPU");
+ [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
+ const TBlob src_data = (i >= 0 ? src.aux_data(i) : src.data());
+ TBlob dst_data = get_dst_data(src_data.shape_);
+ ndarray::Copy<gpu, cpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx);
+ rctx.get_stream<gpu>()->Wait();
+ on_complete();
+ },
+ src.ctx(),
+ const_vars,
+ {this->var()},
+ FnProperty::kCopyFromGPU,
+ 0,
+ "SyncCopyFromNDArrayGPU2CPU");
} else if (src_dev_mask == gpu::kDevMask && dst_dev_mask == gpu::kDevMask) {
Engine::Get()->PushAsync(
- [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
- const TBlob src_data = (i >= 0 ? src.aux_data(i) : src.data());
- TBlob dst_data = get_dst_data(src_data.shape_);
- ndarray::Copy<gpu, gpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx);
- rctx.get_stream<gpu>()->Wait();
- on_complete();
- }, this->ctx(), const_vars, {this->var()},
- src.dtype() != this->dtype() ? FnProperty::kNormal : FnProperty::kCopyFromGPU,
- 0, "SyncCopyFromNDArrayGPU2GPU");
+ [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
+ const TBlob src_data = (i >= 0 ? src.aux_data(i) : src.data());
+ TBlob dst_data = get_dst_data(src_data.shape_);
+ ndarray::Copy<gpu, gpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx);
+ rctx.get_stream<gpu>()->Wait();
+ on_complete();
+ },
+ this->ctx(),
+ const_vars,
+ {this->var()},
+ src.dtype() != this->dtype() ? FnProperty::kNormal : FnProperty::kCopyFromGPU,
+ 0,
+ "SyncCopyFromNDArrayGPU2GPU");
} else {
LOG(FATAL) << "unknown device mask";
}
@@ -2021,20 +2213,20 @@ void NDArray::SyncCopyFromNDArray(const NDArray& src, int i, int j) {
WaitToRead();
}
-void NDArray::SyncCopyToCPU(void *data, size_t size) const {
+void NDArray::SyncCopyToCPU(void* data, size_t size) const {
mxnet::TShape dshape = this->shape();
if (!features::is_enabled(features::INT64_TENSOR_SIZE)) {
- CHECK_LT(size, (int64_t{1} << 31) - 1) <<
- "[SyncCopyToCPU] Size of tensor you are trying to allocate is larger than "
- "2^31 elements. Please build with flag USE_INT64_TENSOR_SIZE=1";
+ CHECK_LT(size, (int64_t{1} << 31) - 1)
+ << "[SyncCopyToCPU] Size of tensor you are trying to allocate is "
+ "larger than "
+ "2^31 elements. Please build with flag USE_INT64_TENSOR_SIZE=1";
}
- CHECK_EQ(dshape.Size(), size)
- << "Memory size do not match";
+ CHECK_EQ(dshape.Size(), size) << "Memory size do not match";
// zero-size array, no need to copy
if (size == 0U) {
return;
}
- TBlob dst(data, dshape, cpu::kDevMask, this->dtype_, 0); // NOLINT(*)
+ TBlob dst(data, dshape, cpu::kDevMask, this->dtype_, 0); // NOLINT(*)
if (this->ctx().dev_mask() == cpu::kDevMask) {
this->WaitToRead();
@@ -2044,19 +2236,22 @@ void NDArray::SyncCopyToCPU(void *data, size_t size) const {
if (src.IsMKLDNNData())
src = this->Reorder2Default();
#endif
- ndarray::Copy<cpu, cpu>(src.data(), &dst,
- Context::CPU(), Context::CPU(), rctx);
+ ndarray::Copy<cpu, cpu>(src.data(), &dst, Context::CPU(), Context::CPU(), rctx);
} else {
#if MXNET_USE_CUDA
Engine::Get()->PushAsync(
- [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
- ndarray::Copy<gpu, cpu>(this->data(), &dst,
- this->ctx(), Context::CPU(), rctx);
- // Wait GPU kernel to complete
- rctx.get_stream<gpu>()->Wait();
- on_complete();
- }, this->ctx(), {this->var()}, {},
- FnProperty::kCopyFromGPU, 0, "SyncCopyGPU2CPU");
+ [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
+ ndarray::Copy<gpu, cpu>(this->data(), &dst, this->ctx(), Context::CPU(), rctx);
+ // Wait GPU kernel to complete
+ rctx.get_stream<gpu>()->Wait();
+ on_complete();
+ },
+ this->ctx(),
+ {this->var()},
+ {},
+ FnProperty::kCopyFromGPU,
+ 0,
+ "SyncCopyGPU2CPU");
this->WaitToWrite();
#else
LOG(FATAL) << "GPU is not enabled";
@@ -2068,101 +2263,114 @@ void NDArray::SyncCheckFormat(const bool full_check) const {
int32_t err = kNormalErr;
TBlob err_cpu(&err, mshadow::Shape1(1), cpu::kDevMask, 0);
if (this->ctx().dev_mask() == cpu::kDevMask) {
- Engine::Get()->PushSync([&](RunContext rctx) {
- common::CheckFormatWrapper<cpu>(rctx, *this, err_cpu, full_check);
- }, this->ctx(), {this->var()}, {},
- FnProperty::kNormal, 0, "CheckFormat");
+ Engine::Get()->PushSync(
+ [&](RunContext rctx) { common::CheckFormatWrapper<cpu>(rctx, *this, err_cpu, full_check); },
+ this->ctx(),
+ {this->var()},
+ {},
+ FnProperty::kNormal,
+ 0,
+ "CheckFormat");
} else {
#if MXNET_USE_CUDA
- Engine::Get()->PushSync([&](RunContext rctx) {
- common::CheckFormatWrapper<gpu>(rctx, *this, err_cpu, full_check);
- rctx.get_stream<gpu>()->Wait();
- }, this->ctx(), {this->var()}, {},
- FnProperty::kNormal, 0, "CheckFormat");
+ Engine::Get()->PushSync(
+ [&](RunContext rctx) {
+ common::CheckFormatWrapper<gpu>(rctx, *this, err_cpu, full_check);
+ rctx.get_stream<gpu>()->Wait();
+ },
+ this->ctx(),
+ {this->var()},
+ {},
+ FnProperty::kNormal,
+ 0,
+ "CheckFormat");
#else
LOG(FATAL) << "GPU is not enabled";
#endif
}
this->WaitToWrite();
CHECK_NE(err, kCSRShapeErr) << "Shape mismatch of this csr NDArray";
- CHECK_NE(err, kCSRIndPtrErr)
- << "IndPtr of csr NDArray should be non-negative, in non-decreasing order, "
- << "start with 0, and end with value equal with size of indices.";
- CHECK_NE(err, kCSRIdxErr)
- << "Indices of csr NDArray should be non-negative, in ascending order per row "
- << " and less than the number of columns.";
+ CHECK_NE(err, kCSRIndPtrErr) << "IndPtr of csr NDArray should be non-negative, in non-decreasing "
+ "order, "
+ << "start with 0, and end with value equal with size of indices.";
+ CHECK_NE(err, kCSRIdxErr) << "Indices of csr NDArray should be non-negative, "
+ "in ascending order per row "
+ << " and less than the number of columns.";
CHECK_NE(err, kRSPShapeErr) << "Shape mismatch of this row_sparse NDArray";
- CHECK_NE(err, kRSPIdxErr)
- << "Indices of row_sparse NDArray should be non-negative, "
- << "less than the size of first dimension and in ascending order";
+ CHECK_NE(err, kRSPIdxErr) << "Indices of row_sparse NDArray should be non-negative, "
+ << "less than the size of first dimension and in ascending order";
CHECK_EQ(err, kNormalErr) << "Check the validity of this sparse NDArray";
}
#if MXNET_PREDICT_ONLY == 0
// register API function
// those with underscore will be registered at NDArray
-MXNET_REGISTER_NDARRAY_FUN(_set_value)
-.set_function(SetValueOp);
-
-
-MXNET_REGISTER_NDARRAY_FUN(_onehot_encode)
-.set_function(BinaryOp<ndarray::OneHotEncode>);
+MXNET_REGISTER_NDARRAY_FUN(_set_value).set_function(SetValueOp);
+MXNET_REGISTER_NDARRAY_FUN(_onehot_encode).set_function(BinaryOp<ndarray::OneHotEncode>);
MXNET_REGISTER_NDARRAY_FUN(fill_element_0index)
-.set_function(TernaryOp<ndarray::MatFillRowElem>)
-.describe("Fill one element of each line(row for python, column for R/Julia)"
-" in lhs according to index indicated by rhs and values indicated by mhs."
-" This function assume rhs uses 0-based index.");
+ .set_function(TernaryOp<ndarray::MatFillRowElem>)
+ .describe(
+ "Fill one element of each line(row for python, column for R/Julia)"
+ " in lhs according to index indicated by rhs and values indicated by "
+ "mhs."
+ " This function assume rhs uses 0-based index.");
// register API function
// those with underscore will be registered at NDArray
-void CopyFromToSimple(
- const nnvm::NodeAttrs& attrs,
- const OpContext& ctx,
- const std::vector<NDArray>& inputs,
- const std::vector<OpReqType>& req,
- const std::vector<NDArray>& outputs) {
+void CopyFromToSimple(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs) {
CopyFromTo(inputs[0], outputs[0], 0, true);
}
// copy function is special
// that we need to remove kAcceptEmptyMutateTarget from it
NNVM_REGISTER_OP(_copyto)
-.add_alias("_npi_copyto")
-.set_num_inputs(1)
-.set_num_outputs(1)
-.set_attr<mxnet::FInferShape>("FInferShape", op::ElemwiseShape<1, 1>)
-.set_attr<nnvm::FInferType>("FInferType",
- [](const NodeAttrs& attrs, std::vector<int> *in_type, std::vector<int> *out_type) {
- return !op::type_is_none((*in_type)[0]) && !op::type_is_none((*out_type)[0]);
- })
-.set_attr<FInferStorageType>("FInferStorageType",
- [](const NodeAttrs& attrs,
- const int dev_mask,
- DispatchMode* dispatch_mode,
- std::vector<int>* in_attrs,
- std::vector<int>* out_attrs) {
- op::dispatch_mode_assign(dispatch_mode, DispatchMode::kFComputeEx);
- if (op::storage_type_is_none((*out_attrs)[0])) {
- (*out_attrs)[0] = (*in_attrs)[0];
- }
- return true;
- })
-.set_attr<FExecType>("FExecType", [](const NodeAttrs& attrs) {
- return ExecType::kCrossDeviceCopy;
- })
-.set_attr<nnvm::FGradient>("FGradient", op::ElemwiseGradUseNone{"_copyto"})
-.set_attr<bool>("TIsBackward", true)
-.set_attr<FComputeEx>("FComputeEx<cpu>", CopyFromToSimple)
-.set_attr<FComputeEx>("FComputeEx<gpu>", CopyFromToSimple)
-.add_argument("data", "NDArray", "input data");
-
-
-void Imdecode(NDArray *ret, NDArray mean, size_t index,
- size_t x0, size_t y0, size_t x1, size_t y1, size_t n_channels,
- size_t size, char *str_img) {
+ .add_alias("_npi_copyto")
+ .set_num_inputs(1)
+ .set_num_outputs(1)
+ .set_attr<mxnet::FInferShape>("FInferShape", op::ElemwiseShape<1, 1>)
+ .set_attr<nnvm::FInferType>(
+ "FInferType",
+ [](const NodeAttrs& attrs, std::vector<int>* in_type, std::vector<int>* out_type) {
+ return !op::type_is_none((*in_type)[0]) && !op::type_is_none((*out_type)[0]);
+ })
+ .set_attr<FInferStorageType>("FInferStorageType",
+ [](const NodeAttrs& attrs,
+ const int dev_mask,
+ DispatchMode* dispatch_mode,
+ std::vector<int>* in_attrs,
+ std::vector<int>* out_attrs) {
+ op::dispatch_mode_assign(dispatch_mode,
+ DispatchMode::kFComputeEx);
+ if (op::storage_type_is_none((*out_attrs)[0])) {
+ (*out_attrs)[0] = (*in_attrs)[0];
+ }
+ return true;
+ })
+ .set_attr<FExecType>("FExecType",
+ [](const NodeAttrs& attrs) { return ExecType::kCrossDeviceCopy; })
+ .set_attr<nnvm::FGradient>("FGradient", op::ElemwiseGradUseNone{"_copyto"})
+ .set_attr<bool>("TIsBackward", true)
+ .set_attr<FComputeEx>("FComputeEx<cpu>", CopyFromToSimple)
+ .set_attr<FComputeEx>("FComputeEx<gpu>", CopyFromToSimple)
+ .add_argument("data", "NDArray", "input data");
+
+void Imdecode(NDArray* ret,
+ NDArray mean,
+ size_t index,
+ size_t x0,
+ size_t y0,
+ size_t x1,
+ size_t y1,
+ size_t n_channels,
+ size_t size,
+ char* str_img) {
#if MXNET_USE_OPENCV
cv::Mat buf(1, size, CV_8U, str_img);
cv::Mat res = cv::imdecode(buf, n_channels == 1 ? 0 : -1);
@@ -2174,12 +2382,12 @@ void Imdecode(NDArray *ret, NDArray mean, size_t index,
y0 = 0;
y1 = res.rows;
}
- CHECK(x1 <= static_cast<size_t>(res.cols) &&
- y1 <= static_cast<size_t>(res.rows));
+ CHECK(x1 <= static_cast<size_t>(res.cols) && y1 <= static_cast<size_t>(res.rows));
if (ret->is_none()) {
- *ret = NDArray(mshadow::Shape3(n_channels, y1-y0, x1-x0),
- Context::CPU(), false,
+ *ret = NDArray(mshadow::Shape3(n_channels, y1 - y0, x1 - x0),
+ Context::CPU(),
+ false,
mean.is_none() ? mshadow::default_type_flag : mean.dtype());
}
NDArray buff;
@@ -2187,19 +2395,19 @@ void Imdecode(NDArray *ret, NDArray mean, size_t index,
buff = ret->Reshape(mshadow::Shape4(1, ret->shape()[0], ret->shape()[1], ret->shape()[2]));
} else {
CHECK_EQ(ret->shape().ndim(), 4U);
- buff = ret->Slice(index, index+1);
+ buff = ret->Slice(index, index + 1);
}
CHECK_EQ(buff.ctx().dev_mask(), Context::kCPU);
CHECK_EQ(n_channels, buff.shape()[1]);
- CHECK_EQ(y1-y0, buff.shape()[2]);
- CHECK_EQ(x1-x0, buff.shape()[3]);
+ CHECK_EQ(y1 - y0, buff.shape()[2]);
+ CHECK_EQ(x1 - x0, buff.shape()[3]);
buff.WaitToWrite();
if (mean.is_none()) {
MSHADOW_TYPE_SWITCH(buff.dtype(), DType, {
mshadow::Tensor<cpu, 4, DType> tensor = buff.data().get<cpu, 4, DType>();
- for (size_t i = 0; i < y1-y0; i++) {
- uchar* im_data = res.ptr<uchar>(y0+i) + res.channels()*x0;
- for (size_t j = 0; j < x1-x0; j++) {
+ for (size_t i = 0; i < y1 - y0; i++) {
+ uchar* im_data = res.ptr<uchar>(y0 + i) + res.channels() * x0;
+ for (size_t j = 0; j < x1 - x0; j++) {
for (size_t k = 0; k < n_channels; k++) {
tensor[0][k][i][j] = DType(im_data[k]); // NOLINT(*)
}
@@ -2216,10 +2424,10 @@ void Imdecode(NDArray *ret, NDArray mean, size_t index,
mean.WaitToRead();
MSHADOW_TYPE_SWITCH(buff.dtype(), DType, {
mshadow::Tensor<cpu, 4, DType> tensor = buff.data().get<cpu, 4, DType>();
- mshadow::Tensor<cpu, 3, DType> tmean = mean.data().get<cpu, 3, DType>();
- for (size_t i = 0; i < y1-y0; i++) {
- uchar* im_data = res.ptr<uchar>(y0+i) + res.channels()*x0;
- for (size_t j = 0; j < x1-x0; j++) {
+ mshadow::Tensor<cpu, 3, DType> tmean = mean.data().get<cpu, 3, DType>();
+ for (size_t i = 0; i < y1 - y0; i++) {
+ uchar* im_data = res.ptr<uchar>(y0 + i) + res.channels() * x0;
+ for (size_t j = 0; j < x1 - x0; j++) {
for (size_t k = 0; k < n_channels; k++) {
tensor[0][k][i][j] = DType(im_data[k]) - tmean[k][i][j]; // NOLINT(*)
}
@@ -2234,31 +2442,38 @@ void Imdecode(NDArray *ret, NDArray mean, size_t index,
}
MXNET_REGISTER_NDARRAY_FUN(_imdecode)
-.set_type_mask(kAcceptEmptyMutateTarget | kNDArrayArgBeforeScalar)
-.set_body([](NDArray **u, real_t *s, NDArray **out,
- int num_params, char **param_keys, char **param_vals) {
- CHECK_EQ(num_params, 1);
- Imdecode(out[0], *u[0],
- static_cast<size_t>(s[0]),
- static_cast<size_t>(s[1]),
- static_cast<size_t>(s[2]),
- static_cast<size_t>(s[3]),
- static_cast<size_t>(s[4]),
- static_cast<size_t>(s[5]),
- static_cast<size_t>(s[6]),
- param_vals[0]);
- })
-.set_num_use_vars(1)
-.set_num_scalars(7)
-.set_num_mutate_vars(1)
-.describe("Decode an image, clip to (x0, y0, x1, y1), subtract mean, and write to buffer")
-.add_argument("mean", "NDArray-or-Symbol", "image mean")
-.add_argument("index", "int", "buffer position for output")
-.add_argument("x0", "int", "x0")
-.add_argument("y0", "int", "y0")
-.add_argument("x1", "int", "x1")
-.add_argument("y1", "int", "y1")
-.add_argument("c", "int", "channel")
-.add_argument("size", "int", "length of str_img");
+ .set_type_mask(kAcceptEmptyMutateTarget | kNDArrayArgBeforeScalar)
+ .set_body([](NDArray** u,
+ real_t* s,
+ NDArray** out,
+ int num_params,
+ char** param_keys,
+ char** param_vals) {
+ CHECK_EQ(num_params, 1);
+ Imdecode(out[0],
+ *u[0],
+ static_cast<size_t>(s[0]),
+ static_cast<size_t>(s[1]),
+ static_cast<size_t>(s[2]),
+ static_cast<size_t>(s[3]),
+ static_cast<size_t>(s[4]),
+ static_cast<size_t>(s[5]),
+ static_cast<size_t>(s[6]),
+ param_vals[0]);
+ })
+ .set_num_use_vars(1)
+ .set_num_scalars(7)
+ .set_num_mutate_vars(1)
+ .describe(
+ "Decode an image, clip to (x0, y0, x1, y1), subtract mean, and write "
+ "to buffer")
+ .add_argument("mean", "NDArray-or-Symbol", "image mean")
+ .add_argument("index", "int", "buffer position for output")
+ .add_argument("x0", "int", "x0")
+ .add_argument("y0", "int", "y0")
+ .add_argument("x1", "int", "x1")
+ .add_argument("y1", "int", "y1")
+ .add_argument("c", "int", "channel")
+ .add_argument("size", "int", "length of str_img");
#endif
} // namespace mxnet
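[Editorial note, not part of the commit: the bulk of the ndarray.cc hunks above reformat calls to the engine's PushSync/PushAsync so that the lambda and each subsequent argument sit on their own line. For orientation only, a minimal sketch of that call shape, assuming the usual mxnet/engine.h declarations and a hypothetical NDArray `ret` already in scope, looks like this:

  Engine::Get()->PushSync(
      [ret](RunContext rctx) {
        // Runs on an engine worker thread; NDArrays are captured by value so
        // they stay alive until the function has finished (see the
        // "callback must always capture by value" comment in the diff).
        TBlob out = ret.data();
        // ... fill `out` using rctx ...
      },
      ret.ctx(),            // context the function executes on
      {},                   // const (read-only) engine variables
      {ret.var()},          // mutable engine variables written by the function
      FnProperty::kNormal,  // scheduling hint
      0,                    // priority
      "ExampleOp");         // profiler / debug name

PushAsync, as used in the CopyFromTo and SyncCopy hunks, takes the same trailing arguments, but its lambda additionally receives an Engine::CallbackOnComplete that must be invoked once the asynchronous work is done.]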
diff --git a/src/operator/nn/batch_norm-inl.h b/src/operator/nn/batch_norm-inl.h
index 485b3b3..dae66bc 100644
--- a/src/operator/nn/batch_norm-inl.h
+++ b/src/operator/nn/batch_norm-inl.h
@@ -43,15 +43,24 @@
#pragma GCC diagnostic ignored "-Wunused-local-typedefs"
#endif
+/*! \brief inverse standard deviation <-> variance */
+#define VARIANCE_TO_INVSTD(__var$, __eps$) (1.0 / std::sqrt((__var$) + DType(__eps$)))
+#define INVSTD_TO_VARIANCE(__invstd$, __eps$) ((1.0 / ((__invstd$) * (__invstd$))) - (__eps$))
+
namespace mxnet {
namespace op {
namespace batchnorm {
-enum BatchNormOpInputs {kData, kGamma, kBeta, kInMovingMean,
- kInMovingVar}; // kGamma: weights, kBeta: biases
-enum BatchNormOpOutputs {kOut, kMean, kVar}; // req, out_data
-enum BatchNormOpResource {kTempSpace};
-enum BatchNormOpAuxiliary {kMovingMean, kMovingVar}; // aux_states
+enum BatchNormOpInputs {
+ kData,
+ kGamma,
+ kBeta,
+ kInMovingMean,
+ kInMovingVar
+}; // kGamma: weights, kBeta: biases
+enum BatchNormOpOutputs { kOut, kMean, kVar }; // req, out_data
+enum BatchNormOpResource { kTempSpace };
+enum BatchNormOpAuxiliary { kMovingMean, kMovingVar }; // aux_states
/*! \brief Default channel axis if none specified in the params */
constexpr int DEFAULT_AXIS = 1;
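[Editorial note, not part of the commit: the VARIANCE_TO_INVSTD and INVSTD_TO_VARIANCE macros hoisted to the top of this header are exact inverses of each other: the first computes invstd = 1 / sqrt(var + eps), the second recovers var = 1 / invstd^2 - eps. As a quick sanity check with numbers of our own choosing (not taken from the patch): for var = 0.24 and eps = 0.01, the first yields 1 / sqrt(0.25) = 2.0, and feeding 2.0 back through the second gives 1 / 4 - 0.01 = 0.24 again.]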
@@ -59,11 +68,18 @@ constexpr int DEFAULT_AXIS = 1;
/*! \brief Parameters for BatchNorm operator */
namespace quantized_batchnorm {
-enum QuantizedBatchNormOpInputs {kData, kGamma, kBeta, kInMovingMean,
- kInMovingVar, kDataMin, kDataMax};
-enum QuantizedBatchNormOutputs {kOut, kOutMin, kOutMax};
-enum QuantizedBatchNormOpAuxiliary {kMovingMean, kMovingVar};
-} // quantized_batchnorm
+enum QuantizedBatchNormOpInputs {
+ kData,
+ kGamma,
+ kBeta,
+ kInMovingMean,
+ kInMovingVar,
+ kDataMin,
+ kDataMax
+};
+enum QuantizedBatchNormOutputs { kOut, kOutMin, kOutMax };
+enum QuantizedBatchNormOpAuxiliary { kMovingMean, kMovingVar };
+} // namespace quantized_batchnorm
/*! \brief Parameters for BatchNoram operator */
struct BatchNormParam : public dmlc::Parameter<BatchNormParam> {
@@ -79,38 +95,42 @@ struct BatchNormParam : public dmlc::Parameter<BatchNormParam> {
dmlc::optional<float> max_calib_range; // max float value calculated from calibration dataset
DMLC_DECLARE_PARAMETER(BatchNormParam) {
- DMLC_DECLARE_FIELD(eps).set_default(1e-3f)
- .describe("Epsilon to prevent div 0. "
- "Must be no less than CUDNN_BN_MIN_EPSILON "
- "defined in cudnn.h when using cudnn (usually 1e-5)");
- DMLC_DECLARE_FIELD(momentum).set_default(0.9f)
- .describe("Momentum for moving average");
- DMLC_DECLARE_FIELD(fix_gamma).set_default(true)
- .describe("Fix gamma while training");
- DMLC_DECLARE_FIELD(use_global_stats).set_default(false)
- .describe("Whether use global moving statistics instead of local batch-norm. "
- "This will force change batch-norm into a scale shift operator.");
- DMLC_DECLARE_FIELD(output_mean_var).set_default(false)
- .describe("Output the mean and inverse std ");
- DMLC_DECLARE_FIELD(axis).set_default(mxnet::op::batchnorm::DEFAULT_AXIS)
- .describe("Specify which shape axis the channel is specified");
- DMLC_DECLARE_FIELD(cudnn_off).set_default(false)
- .describe("Do not select CUDNN operator, if available");
+ DMLC_DECLARE_FIELD(eps).set_default(1e-3f).describe(
+ "Epsilon to prevent div 0. "
+ "Must be no less than CUDNN_BN_MIN_EPSILON "
+ "defined in cudnn.h when using cudnn (usually 1e-5)");
+ DMLC_DECLARE_FIELD(momentum).set_default(0.9f).describe("Momentum for moving average");
+ DMLC_DECLARE_FIELD(fix_gamma).set_default(true).describe("Fix gamma while training");
+ DMLC_DECLARE_FIELD(use_global_stats)
+ .set_default(false)
+ .describe(
+ "Whether use global moving statistics instead of local batch-norm. "
+ "This will force change batch-norm into a scale shift operator.");
+ DMLC_DECLARE_FIELD(output_mean_var)
+ .set_default(false)
+ .describe("Output the mean and inverse std ");
+ DMLC_DECLARE_FIELD(axis)
+ .set_default(mxnet::op::batchnorm::DEFAULT_AXIS)
+ .describe("Specify which shape axis the channel is specified");
+ DMLC_DECLARE_FIELD(cudnn_off).set_default(false).describe(
+ "Do not select CUDNN operator, if available");
DMLC_DECLARE_FIELD(min_calib_range)
- .set_default(dmlc::optional<float>())
- .describe("The minimum scalar value in the form of float32 obtained "
- "through calibration. If present, it will be used to by "
- "quantized batch norm op to calculate primitive scale."
- "Note: this calib_range is to calib bn output.");
+ .set_default(dmlc::optional<float>())
+ .describe(
+ "The minimum scalar value in the form of float32 obtained "
+ "through calibration. If present, it will be used to by "
+ "quantized batch norm op to calculate primitive scale."
+ "Note: this calib_range is to calib bn output.");
DMLC_DECLARE_FIELD(max_calib_range)
- .set_default(dmlc::optional<float>())
- .describe("The maximum scalar value in the form of float32 obtained "
- "through calibration. If present, it will be used to by "
- "quantized batch norm op to calculate primitive scale."
- "Note: this calib_range is to calib bn output.");
+ .set_default(dmlc::optional<float>())
+ .describe(
+ "The maximum scalar value in the form of float32 obtained "
+ "through calibration. If present, it will be used to by "
+ "quantized batch norm op to calculate primitive scale."
+ "Note: this calib_range is to calib bn output.");
}
- bool operator==(const BatchNormParam &other) const {
+ bool operator==(const BatchNormParam& other) const {
bool flag = this->eps == other.eps && this->momentum == other.momentum &&
this->fix_gamma == other.fix_gamma &&
this->use_global_stats == other.use_global_stats &&
@@ -131,15 +151,15 @@ struct BatchNormParam : public dmlc::Parameter<BatchNormParam> {
} // namespace mxnet
namespace std {
-template<>
+template <>
struct hash<mxnet::op::BatchNormParam> {
size_t operator()(const mxnet::op::BatchNormParam& val) {
size_t ret = 0;
- ret = dmlc::HashCombine(ret, val.momentum);
- ret = dmlc::HashCombine(ret, val.fix_gamma);
- ret = dmlc::HashCombine(ret, val.use_global_stats);
- ret = dmlc::HashCombine(ret, val.output_mean_var);
- ret = dmlc::HashCombine(ret, val.axis);
+ ret = dmlc::HashCombine(ret, val.momentum);
+ ret = dmlc::HashCombine(ret, val.fix_gamma);
+ ret = dmlc::HashCombine(ret, val.use_global_stats);
+ ret = dmlc::HashCombine(ret, val.output_mean_var);
+ ret = dmlc::HashCombine(ret, val.axis);
return ret;
}
};
@@ -153,40 +173,30 @@ static inline bool IsBNWriting(const OpReqType ort) {
}
template <typename xpu, typename DType, typename AccReal>
-void BatchNormForwardImpl(mshadow::Stream<cpu> *stream,
- const OpContext &ctx, const BatchNormParam& param,
- const std::vector<TBlob> &in_data,
- const std::vector<OpReqType> &req,
- const std::vector<TBlob> &out_data,
- const std::vector<TBlob> &aux_states);
+void BatchNormForwardImpl(mshadow::Stream<cpu>* stream, const OpContext& ctx,
+ const BatchNormParam& param, const std::vector<TBlob>& in_data,
+ const std::vector<OpReqType>& req, const std::vector<TBlob>& out_data,
+ const std::vector<TBlob>& aux_states);
template <typename xpu, typename DType, typename AccReal>
-void BatchNormBackwardImpl(mshadow::Stream<cpu> *stream,
- const OpContext &ctx, const BatchNormParam& param,
- const std::vector<TBlob> &out_grad,
- const std::vector<TBlob> &in_data,
- const std::vector<TBlob> &out_data,
- const std::vector<OpReqType> &req,
- const std::vector<TBlob> &in_grad,
- const std::vector<TBlob> &aux_states);
+void BatchNormBackwardImpl(mshadow::Stream<cpu>* stream, const OpContext& ctx,
+ const BatchNormParam& param, const std::vector<TBlob>& out_grad,
+ const std::vector<TBlob>& in_data, const std::vector<TBlob>& out_data,
+ const std::vector<OpReqType>& req, const std::vector<TBlob>& in_grad,
+ const std::vector<TBlob>& aux_states);
#if MXNET_USE_CUDA
template <typename xpu, typename DType, typename AccReal>
-void BatchNormForwardImpl(mshadow::Stream<gpu> *stream,
- const OpContext &ctx, const BatchNormParam& param,
- const std::vector<TBlob> &in_data,
- const std::vector<OpReqType> &req,
- const std::vector<TBlob> &out_data,
- const std::vector<TBlob> &aux_states);
+void BatchNormForwardImpl(mshadow::Stream<gpu>* stream, const OpContext& ctx,
+ const BatchNormParam& param, const std::vector<TBlob>& in_data,
+ const std::vector<OpReqType>& req, const std::vector<TBlob>& out_data,
+ const std::vector<TBlob>& aux_states);
template <typename xpu, typename DType, typename AccReal>
-void BatchNormBackwardImpl(mshadow::Stream<gpu> *stream,
- const OpContext &ctx, const BatchNormParam& param,
- const std::vector<TBlob> &out_grad,
- const std::vector<TBlob> &in_data,
- const std::vector<TBlob> &out_data,
- const std::vector<OpReqType> &req,
- const std::vector<TBlob> &in_grad,
- const std::vector<TBlob> &aux_states);
+void BatchNormBackwardImpl(mshadow::Stream<gpu>* stream, const OpContext& ctx,
+ const BatchNormParam& param, const std::vector<TBlob>& out_grad,
+ const std::vector<TBlob>& in_data, const std::vector<TBlob>& out_data,
+ const std::vector<OpReqType>& req, const std::vector<TBlob>& in_grad,
+ const std::vector<TBlob>& aux_states);
#endif // MXNET_USE_CUDA
/*!
@@ -201,11 +211,9 @@ void BatchNormBackwardImpl(mshadow::Stream<gpu> *stream,
* \sa OpReqType, OpContext
*/
template <typename xpu, typename DType, typename AccReal>
-void BatchNormForward(const OpContext &ctx, const BatchNormParam& param,
- const std::vector<TBlob> &in_data,
- const std::vector<OpReqType> &req,
- const std::vector<TBlob> &out_data,
- const std::vector<TBlob> &aux_states) {
+void BatchNormForward(const OpContext& ctx, const BatchNormParam& param,
+ const std::vector<TBlob>& in_data, const std::vector<OpReqType>& req,
+ const std::vector<TBlob>& out_data, const std::vector<TBlob>& aux_states) {
using namespace mshadow;
using namespace mshadow::expr;
@@ -219,9 +227,8 @@ void BatchNormForward(const OpContext &ctx, const BatchNormParam& param,
CHECK_GE(req.size(), 1U);
CHECK_EQ(req[batchnorm::kOut], kWriteTo);
}
- Stream<xpu> *s = ctx.get_stream<xpu>();
- BatchNormForwardImpl<xpu, DType, AccReal>(s, ctx, param, in_data, req,
- out_data, aux_states);
+ Stream<xpu>* s = ctx.get_stream<xpu>();
+ BatchNormForwardImpl<xpu, DType, AccReal>(s, ctx, param, in_data, req, out_data, aux_states);
}
/*!
@@ -253,10 +260,9 @@ void BatchNormForward(const OpContext &ctx, const BatchNormParam& param,
* \sa OperatorProperty, OpReqType, OpContext
*/
template <typename xpu, typename DType, typename AccReal>
-void BatchNormBackward(const OpContext &ctx, const BatchNormParam& param,
- const std::vector<TBlob> &inputs,
- const std::vector<OpReqType> &req,
- const std::vector<TBlob> &outputs) {
+void BatchNormBackward(const OpContext& ctx, const BatchNormParam& param,
+ const std::vector<TBlob>& inputs, const std::vector<OpReqType>& req,
+ const std::vector<TBlob>& outputs) {
CHECK_EQ(inputs.size(), 8U);
CHECK_EQ(outputs.size(), 3U);
@@ -265,41 +271,36 @@ void BatchNormBackward(const OpContext &ctx, const BatchNormParam& param,
std::vector<TBlob> in_data(3);
std::vector<TBlob> aux_states(2);
- out_grad[0] = inputs[0];
- out_data[batchnorm::kMean] = inputs[1];
- out_data[batchnorm::kVar] = inputs[2];
- in_data[batchnorm::kData] = inputs[3];
- in_data[batchnorm::kGamma] = inputs[4];
- in_data[batchnorm::kBeta] = inputs[5];
+ out_grad[0] = inputs[0];
+ out_data[batchnorm::kMean] = inputs[1];
+ out_data[batchnorm::kVar] = inputs[2];
+ in_data[batchnorm::kData] = inputs[3];
+ in_data[batchnorm::kGamma] = inputs[4];
+ in_data[batchnorm::kBeta] = inputs[5];
aux_states[batchnorm::kMovingMean] = inputs[6];
- aux_states[batchnorm::kMovingVar] = inputs[7];
- const std::vector<TBlob> &in_grad = outputs;
- mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
- BatchNormBackwardImpl<xpu, DType, AccReal>(s, ctx, param, out_grad, in_data,
- out_data, req, in_grad, aux_states);
+ aux_states[batchnorm::kMovingVar] = inputs[7];
+ const std::vector<TBlob>& in_grad = outputs;
+ mshadow::Stream<xpu>* s = ctx.get_stream<xpu>();
+ BatchNormBackwardImpl<xpu, DType, AccReal>(s, ctx, param, out_grad, in_data, out_data, req,
+ in_grad, aux_states);
}
-template<typename xpu>
-void BatchNormCompute(const nnvm::NodeAttrs& attrs,
- const OpContext& ctx, const std::vector<TBlob>& inputs,
- const std::vector<OpReqType>& req,
+template <typename xpu>
+void BatchNormCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx,
+ const std::vector<TBlob>& inputs, const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
CHECK_EQ(inputs.size(), 5U);
- std::vector<TBlob> in_data(inputs.begin(),
- inputs.begin() + batchnorm::kInMovingMean);
- std::vector<TBlob> aux_states(inputs.begin() + batchnorm::kInMovingMean,
- inputs.end());
+ std::vector<TBlob> in_data(inputs.begin(), inputs.begin() + batchnorm::kInMovingMean);
+ std::vector<TBlob> aux_states(inputs.begin() + batchnorm::kInMovingMean, inputs.end());
MSHADOW_REAL_TYPE_SWITCH_EX(inputs[0].type_flag_, DType, AccReal, {
- BatchNormForward<xpu, DType, AccReal>(ctx, param, in_data, req, outputs,
- aux_states);
+ BatchNormForward<xpu, DType, AccReal>(ctx, param, in_data, req, outputs, aux_states);
});
}
-template<typename xpu>
-void BatchNormGradCompute(const nnvm::NodeAttrs& attrs,
- const OpContext& ctx, const std::vector<TBlob>& inputs,
- const std::vector<OpReqType>& req,
+template <typename xpu>
+void BatchNormGradCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx,
+ const std::vector<TBlob>& inputs, const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
CHECK_EQ(inputs.size(), 8U);
const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
@@ -313,15 +314,15 @@ void BatchNormGradCompute(const nnvm::NodeAttrs& attrs,
namespace batchnorm {
-template<typename DType>
+template <typename DType>
class BNTensor3 {
enum { OUTER, CHANNEL, INNER, COUNT };
public:
inline BNTensor3(const TBlob& blob, const int indexOfChannel)
- : dptr_(blob.dptr<DType>())
- , indexOfChannel_(static_cast<size_t>(indexOfChannel < 0
- ? (static_cast<int>(blob.shape_.ndim()) + indexOfChannel)
+ : dptr_(blob.dptr<DType>()),
+ indexOfChannel_(static_cast<size_t>(
+ indexOfChannel < 0 ? (static_cast<int>(blob.shape_.ndim()) + indexOfChannel)
: indexOfChannel)) {
CHECK_EQ(blob.type_flag_, mshadow::DataType<DType>::kFlag);
shape_[OUTER] = 1;
@@ -329,31 +330,29 @@ class BNTensor3 {
shape_[OUTER] *= blob.shape_[i];
}
shape_[CHANNEL] = blob.shape_[indexOfChannel_];
- shape_[INNER] = 1;
+ shape_[INNER] = 1;
for (size_t i = indexOfChannel_ + 1, n = blob.shape_.ndim(); i < n; ++i) {
shape_[INNER] *= blob.shape_[i];
}
}
- inline BNTensor3(DType *p, const mxnet::TShape& shape, const int indexOfChannel)
- : dptr_(p)
- , indexOfChannel_(static_cast<size_t>(indexOfChannel < 0
- ? (static_cast<int>(shape.ndim()) + indexOfChannel)
- : indexOfChannel)) {
+ inline BNTensor3(DType* p, const mxnet::TShape& shape, const int indexOfChannel)
+ : dptr_(p),
+ indexOfChannel_(static_cast<size_t>(indexOfChannel < 0
+ ? (static_cast<int>(shape.ndim()) + indexOfChannel)
+ : indexOfChannel)) {
shape_[OUTER] = 1;
for (size_t i = 0; i < indexOfChannel_; ++i) {
shape_[OUTER] *= shape[i];
}
shape_[CHANNEL] = shape[indexOfChannel_];
- shape_[INNER] = 1;
+ shape_[INNER] = 1;
for (size_t i = indexOfChannel_ + 1, n = shape.ndim(); i < n; ++i) {
shape_[INNER] *= shape[i];
}
}
- MSHADOW_FORCE_INLINE bool IsEmpty() const {
- return dptr_ == nullptr;
- }
+ MSHADOW_FORCE_INLINE bool IsEmpty() const { return dptr_ == nullptr; }
MSHADOW_XINLINE size_t Size() const {
size_t n = 1;
@@ -363,22 +362,14 @@ class BNTensor3 {
return n;
}
- MSHADOW_XINLINE size_t ChannelCount() const {
- return shape_[CHANNEL];
- }
+ MSHADOW_XINLINE size_t ChannelCount() const { return shape_[CHANNEL]; }
- MSHADOW_XINLINE size_t OuterSize() const {
- return shape_[OUTER];
- }
+ MSHADOW_XINLINE size_t OuterSize() const { return shape_[OUTER]; }
- MSHADOW_XINLINE size_t InnerSize() const {
- return shape_[INNER];
- }
+ MSHADOW_XINLINE size_t InnerSize() const { return shape_[INNER]; }
/*! \brief start of a given channel's spatial data */
- MSHADOW_XINLINE size_t StartOffset(const size_t channel) const {
- return channel * InnerSize();
- }
+ MSHADOW_XINLINE size_t StartOffset(const size_t channel) const { return channel * InnerSize(); }
/*! \brief This is the amount to skip to next same-channel data
* This is the number of bytes to skip from one past the end of the current spatial data
@@ -392,12 +383,10 @@ class BNTensor3 {
return (ChannelCount() - 1) * InnerSize();
}
- MSHADOW_XINLINE size_t offset(const size_t outer,
- const size_t channel,
- const size_t i) const {
+ MSHADOW_XINLINE size_t offset(const size_t outer, const size_t channel, const size_t i) const {
const size_t spatial_size = InnerSize();
- const size_t skip_length = SkipLengthToNextSameChannelData();
- size_t off = StartOffset(channel);
+ const size_t skip_length = SkipLengthToNextSameChannelData();
+ size_t off = StartOffset(channel);
off += outer * shape_[CHANNEL] * shape_[INNER];
const size_t skips = i / spatial_size;
off += (1 + skip_length) * skips;
@@ -405,21 +394,18 @@ class BNTensor3 {
return off;
}
- MSHADOW_XINLINE DType& get_ref(const size_t batch,
- const size_t channel,
- const size_t i) {
+ MSHADOW_XINLINE DType& get_ref(const size_t batch, const size_t channel, const size_t i) {
const size_t off = offset(batch, channel, i);
return dptr_[off];
}
- MSHADOW_XINLINE const DType& get_ref(const size_t batch,
- const size_t channel,
+ MSHADOW_XINLINE const DType& get_ref(const size_t batch, const size_t channel,
const size_t i) const {
const size_t off = offset(batch, channel, i);
return dptr_[off];
}
- DType *dptr_;
+ DType* dptr_;
size_t indexOfChannel_;
size_t shape_[COUNT];
};
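For reference, BNTensor3 above collapses a tensor with an arbitrary channel axis into an (outer, channel, inner) view so that batch-norm statistics can be accumulated per channel regardless of layout. A minimal standalone sketch of that index arithmetic, assuming a dense contiguous layout and ignoring the skip-length bookkeeping of the real class (illustration only, not the MXNet code):

// Map a logical (outer, channel, inner) index to a flat offset for a tensor
// collapsed to [outer_size, channel_count, inner_size].
#include <cstddef>
#include <iostream>
#include <vector>

std::size_t FlatOffset(std::size_t outer, std::size_t channel, std::size_t inner,
                       std::size_t channel_count, std::size_t inner_size) {
  return (outer * channel_count + channel) * inner_size + inner;
}

int main() {
  // Example: NCHW with N=2, C=3, H=W=4 and channel axis 1 collapses to
  // outer=2, channel=3, inner=16.
  const std::size_t outer_size = 2, channel_count = 3, inner_size = 16;
  std::vector<float> data(outer_size * channel_count * inner_size, 0.f);
  // Touch every element of channel 1, mirroring a per-channel ForEachFast pass.
  for (std::size_t o = 0; o < outer_size; ++o)
    for (std::size_t i = 0; i < inner_size; ++i)
      data[FlatOffset(o, 1, i, channel_count, inner_size)] += 1.f;
  std::cout << data[inner_size] << std::endl;  // first element of channel 1, outer 0
  return 0;
}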
diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc
index 1bbdfa6..6ffbc66 100644
--- a/src/operator/nn/batch_norm.cc
+++ b/src/operator/nn/batch_norm.cc
@@ -22,20 +22,17 @@
* \file batch_norm.cc
* \brief
* \author Bing Xu, Chris Olivier, Da Zheng
-*/
+ */
-#include "batch_norm-inl.h"
#include <nnvm/op_attr_types.h>
+
#include "../elemwise_op_common.h"
#include "../operator_common.h"
+#include "batch_norm-inl.h"
#if MXNET_USE_MKLDNN == 1
#include "./mkldnn/mkldnn_batch_norm-inl.h"
#endif
-/*! \brief inverse standard deviation <-> variance */
-#define VARIANCE_TO_INVSTD(__var$, __eps$) (1.0/std::sqrt((__var$) + DType(__eps$)))
-#define INVSTD_TO_VARIANCE(__invstd$, __eps$) ((1.0 / ((__invstd$) * (__invstd$))) - (__eps$))
-
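The duplicated macros removed here (the same pair is also dropped from batch_norm.cu below) express the conversion between a variance and an inverse standard deviation that the forward and backward passes rely on; written out:

\[
\text{invstd} = \frac{1}{\sqrt{\sigma^2 + \varepsilon}},
\qquad
\sigma^2 = \frac{1}{\text{invstd}^2} - \varepsilon
\]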
namespace mxnet {
namespace op {
namespace batchnorm {
@@ -43,16 +40,17 @@ namespace batchnorm {
/*! \brief Global disable of batchnorm mkl operator for unit testing */
volatile bool disable_mkl = false;
-/*! \brief Fast-foreach when you don't care about the position other than channel */
-template<typename DType, typename OnData>
-static inline void ForEachFast(const BNTensor3<DType> &tensor,
+/*! \brief Fast-foreach when you don't care about the position other than
+ * channel */
+template <typename DType, typename OnData>
+static inline void ForEachFast(const BNTensor3<DType>& tensor,
const size_t channel,
OnData onData) {
- const size_t num = tensor.OuterSize();
- const size_t matrixSize = tensor.InnerSize();
- const size_t skipLength = tensor.SkipLengthToNextSameChannelData();
+ const size_t num = tensor.OuterSize();
+ const size_t matrixSize = tensor.InnerSize();
+ const size_t skipLength = tensor.SkipLengthToNextSameChannelData();
const size_t startOffset = tensor.StartOffset(channel);
- DType *data = tensor.dptr_ + startOffset;
+ DType* data = tensor.dptr_ + startOffset;
for (size_t outer = 0; outer < num; ++outer) {
for (size_t i = 0; i < matrixSize; ++i) {
@@ -62,10 +60,11 @@ static inline void ForEachFast(const BNTensor3<DType> &tensor,
}
}
-/*! \brief Fast-foreach when you don't care about the position other than channel */
-template<typename DType1, typename DType2, typename OnData>
-static inline void ForEachFast(const BNTensor3<DType1> &in_data,
- const BNTensor3<DType2> &out_data,
+/*! \brief Fast-foreach when you don't care about the position other than
+ * channel */
+template <typename DType1, typename DType2, typename OnData>
+static inline void ForEachFast(const BNTensor3<DType1>& in_data,
+ const BNTensor3<DType2>& out_data,
const size_t channel,
OnData onData) {
const size_t num = in_data.OuterSize();
@@ -73,22 +72,22 @@ static inline void ForEachFast(const BNTensor3<DType1> &in_data,
const size_t skipLength = in_data.SkipLengthToNextSameChannelData();
const size_t startOffset = in_data.StartOffset(channel);
- DType1 *data = in_data.dptr_ + startOffset;
- DType2 *odata = out_data.dptr_ + startOffset;
+ DType1* data = in_data.dptr_ + startOffset;
+ DType2* odata = out_data.dptr_ + startOffset;
for (size_t outer = 0; outer < num; ++outer) {
for (size_t i = 0; i < matrixSize; ++i) {
onData(data++, odata++);
}
- data += skipLength;
+ data += skipLength;
odata += skipLength;
}
}
-template<typename DType1, typename DType2, typename DType3, typename OnData>
-static inline void ForEachFast(const BNTensor3<DType1> &in_data,
- const BNTensor3<DType2> &in_data2,
- const BNTensor3<DType3> &out_data,
+template <typename DType1, typename DType2, typename DType3, typename OnData>
+static inline void ForEachFast(const BNTensor3<DType1>& in_data,
+ const BNTensor3<DType2>& in_data2,
+ const BNTensor3<DType3>& out_data,
const size_t channel,
OnData onData) {
const size_t num = in_data.OuterSize();
@@ -96,15 +95,15 @@ static inline void ForEachFast(const BNTensor3<DType1> &in_data,
const size_t skipLength = in_data.SkipLengthToNextSameChannelData();
const size_t startOffset = in_data.StartOffset(channel);
- DType1 *data = in_data.dptr_ + startOffset;
- DType2 *data2 = in_data2.dptr_ + startOffset;
- DType3 *odata = out_data.dptr_ + startOffset;
+ DType1* data = in_data.dptr_ + startOffset;
+ DType2* data2 = in_data2.dptr_ + startOffset;
+ DType3* odata = out_data.dptr_ + startOffset;
for (size_t outer = 0; outer < num; ++outer) {
for (size_t i = 0; i < matrixSize; ++i) {
onData(data++, data2++, odata++);
}
- data += skipLength;
+ data += skipLength;
data2 += skipLength;
odata += skipLength;
}
@@ -114,50 +113,50 @@ static inline void ForEachFast(const BNTensor3<DType1> &in_data,
/*! \brief Forward CPU */
template <typename xpu, typename DType, typename AccReal>
-void BatchNormForwardImpl(mshadow::Stream<cpu> *,
- const OpContext &ctx, const BatchNormParam& param_,
- const std::vector<TBlob> &in_data,
- const std::vector<OpReqType> &req,
- const std::vector<TBlob> &out_data,
- const std::vector<TBlob> &aux_states) {
+void BatchNormForwardImpl(mshadow::Stream<cpu>*,
+ const OpContext& ctx,
+ const BatchNormParam& param_,
+ const std::vector<TBlob>& in_data,
+ const std::vector<OpReqType>& req,
+ const std::vector<TBlob>& out_data,
+ const std::vector<TBlob>& aux_states) {
// Input
batchnorm::BNTensor3<DType> inputData(in_data[batchnorm::kData], param_.axis);
- const TBlob &weights = in_data[batchnorm::kGamma];
- const TBlob &bias = in_data[batchnorm::kBeta];
+ const TBlob& weights = in_data[batchnorm::kGamma];
+ const TBlob& bias = in_data[batchnorm::kBeta];
// Aux (Moving)
- const TBlob &runningMean = aux_states[batchnorm::kMovingMean];
- const TBlob &runningVariance = aux_states[batchnorm::kMovingVar];
+ const TBlob& runningMean = aux_states[batchnorm::kMovingMean];
+ const TBlob& runningVariance = aux_states[batchnorm::kMovingVar];
// Output
batchnorm::BNTensor3<DType> outputData(out_data[batchnorm::kOut], param_.axis);
- const TBlob &meanVector = out_data[batchnorm::kMean];
- const TBlob &varianceVector = out_data[batchnorm::kVar];
+ const TBlob& meanVector = out_data[batchnorm::kMean];
+ const TBlob& varianceVector = out_data[batchnorm::kVar];
- AccReal *mean = meanVector.dptr<AccReal>();
- AccReal *var = varianceVector.dptr<AccReal>();
+ AccReal* mean = meanVector.dptr<AccReal>();
+ AccReal* var = varianceVector.dptr<AccReal>();
const bool is_train_and_not_global_stats = ctx.is_train && !param_.use_global_stats;
- const size_t channelCount = inputData.ChannelCount();
- const size_t itemCountPerChannel = inputData.Size() / channelCount;
+ const size_t channelCount = inputData.ChannelCount();
+ const size_t itemCountPerChannel = inputData.Size() / channelCount;
- #pragma omp parallel for
+#pragma omp parallel for
for (int channel = 0; channel < static_cast<int>(channelCount); ++channel) {
if (is_train_and_not_global_stats) {
// compute mean per input
mean[channel] = 0;
- ForEachFast(inputData, channel, [mean, channel](const DType *in_data) {
- mean[channel] += *in_data; });
+ ForEachFast(
+ inputData, channel, [mean, channel](const DType* in_data) { mean[channel] += *in_data; });
mean[channel] /= itemCountPerChannel;
// compute variance per input
const AccReal thisMean = mean[channel];
- var[channel] = 0;
- ForEachFast(inputData, channel,
- [var, thisMean, channel](const DType *current_in_data) {
- const AccReal current = *current_in_data;
- var[channel] += (current - thisMean) * (current - thisMean);
- });
+ var[channel] = 0;
+ ForEachFast(inputData, channel, [var, thisMean, channel](const DType* current_in_data) {
+ const AccReal current = *current_in_data;
+ var[channel] += (current - thisMean) * (current - thisMean);
+ });
const AccReal sum = var[channel];
@@ -167,125 +166,130 @@ void BatchNormForwardImpl(mshadow::Stream<cpu> *,
invstd = 0;
} else {
const AccReal variance = sum / itemCountPerChannel;
- invstd = VARIANCE_TO_INVSTD(variance, param_.eps);
+ invstd = VARIANCE_TO_INVSTD(variance, param_.eps);
}
var[channel] = invstd;
} else {
- const AccReal *rm = runningMean.dptr<AccReal>();
- const AccReal *rv = runningVariance.dptr<AccReal>();
+ const AccReal* rm = runningMean.dptr<AccReal>();
+ const AccReal* rv = runningVariance.dptr<AccReal>();
mean[channel] = rm[channel];
- var[channel] = VARIANCE_TO_INVSTD(rv[channel], param_.eps);
+ var[channel] = VARIANCE_TO_INVSTD(rv[channel], param_.eps);
}
// compute output
- AccReal *w = weights.dptr<AccReal>();
- const AccReal *b = bias.dptr<AccReal>();
+ AccReal* w = weights.dptr<AccReal>();
+ const AccReal* b = bias.dptr<AccReal>();
- const AccReal thisMean = mean[channel];
+ const AccReal thisMean = mean[channel];
const AccReal thisInvstd = var[channel];
const AccReal thisWeight = w[channel];
- const AccReal thisBias = b[channel];
+ const AccReal thisBias = b[channel];
// note that var is still invstd
if (!param_.fix_gamma) {
if (IsBNWriting(req[batchnorm::kData])) {
- ForEachFast(inputData, outputData, channel,
- [thisWeight, thisBias, thisMean, thisInvstd](const DType *in_data,
- DType *out_data) {
- *out_data = static_cast<DType>(
- ((*in_data - thisMean) * thisInvstd) * thisWeight + thisBias);
- });
+ ForEachFast(
+ inputData,
+ outputData,
+ channel,
+ [thisWeight, thisBias, thisMean, thisInvstd](const DType* in_data, DType* out_data) {
+ *out_data =
+ static_cast<DType>(((*in_data - thisMean) * thisInvstd) * thisWeight + thisBias);
+ });
}
} else {
if (IsBNWriting(req[batchnorm::kGamma])) {
w[channel] = AccReal(1);
}
if (IsBNWriting(req[batchnorm::kData])) {
- ForEachFast(inputData, outputData, channel,
- [thisWeight, thisBias, thisMean, thisInvstd](const DType *in_data,
- DType *out_data) {
- *out_data = static_cast<DType>(
- ((*in_data - thisMean) * thisInvstd) + thisBias);
- });
+ ForEachFast(
+ inputData,
+ outputData,
+ channel,
+ [thisWeight, thisBias, thisMean, thisInvstd](const DType* in_data, DType* out_data) {
+ *out_data = static_cast<DType>(((*in_data - thisMean) * thisInvstd) + thisBias);
+ });
}
}
}
}
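Concretely, the training branch of the loop above computes a per-channel mean and (biased) variance over all outer*inner elements and then normalizes with the inverse standard deviation; the use_global_stats branch applies the same normalization but takes mean and invstd from the moving statistics. A simplified single-channel reference, with a plain std::vector standing in for the BNTensor3 view (an illustration, not the MXNet code path):

// Single-channel batch-norm forward: y = gamma * (x - mean) * invstd + beta.
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

void BatchNormChannelForward(const std::vector<float>& x, float gamma, float beta,
                             float eps, std::vector<float>* y) {
  const float n = static_cast<float>(x.size());
  float mean = 0.f;
  for (float v : x) mean += v;
  mean /= n;
  float var = 0.f;
  for (float v : x) var += (v - mean) * (v - mean);
  var /= n;
  const float invstd = 1.f / std::sqrt(var + eps);  // VARIANCE_TO_INVSTD
  y->resize(x.size());
  for (std::size_t i = 0; i < x.size(); ++i)
    (*y)[i] = gamma * (x[i] - mean) * invstd + beta;
}

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f, 4.f}, y;
  BatchNormChannelForward(x, /*gamma=*/1.f, /*beta=*/0.f, /*eps=*/1e-5f, &y);
  for (float v : y) std::printf("%f\n", v);  // approx. -1.34, -0.45, 0.45, 1.34
  return 0;
}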
template <typename xpu, typename DType, typename AccReal>
-void BatchNormBackwardImpl(mshadow::Stream<cpu> *,
- const OpContext &ctx, const BatchNormParam& param_,
- const std::vector<TBlob> &out_grad,
- const std::vector<TBlob> &in_data,
- const std::vector<TBlob> &out_data,
- const std::vector<OpReqType> &req,
- const std::vector<TBlob> &in_grad,
- const std::vector<TBlob> &aux_states) {
+void BatchNormBackwardImpl(mshadow::Stream<cpu>*,
+ const OpContext& ctx,
+ const BatchNormParam& param_,
+ const std::vector<TBlob>& out_grad,
+ const std::vector<TBlob>& in_data,
+ const std::vector<TBlob>& out_data,
+ const std::vector<OpReqType>& req,
+ const std::vector<TBlob>& in_grad,
+ const std::vector<TBlob>& aux_states) {
// Input Data
batchnorm::BNTensor3<DType> inputData(in_data[batchnorm::kData], param_.axis);
- const TBlob &weights = in_data[batchnorm::kGamma];
+ const TBlob& weights = in_data[batchnorm::kGamma];
// Input Grad
batchnorm::BNTensor3<DType> gradIn(in_grad[batchnorm::kData], param_.axis);
- const TBlob &gradWeight = in_grad[batchnorm::kGamma];
- const TBlob &gradBias = in_grad[batchnorm::kBeta];
+ const TBlob& gradWeight = in_grad[batchnorm::kGamma];
+ const TBlob& gradBias = in_grad[batchnorm::kBeta];
// Aux (Moving)
- const TBlob &runningMean = aux_states[batchnorm::kMovingMean];
- const TBlob &runningVariance = aux_states[batchnorm::kMovingVar];
+ const TBlob& runningMean = aux_states[batchnorm::kMovingMean];
+ const TBlob& runningVariance = aux_states[batchnorm::kMovingVar];
// Output
batchnorm::BNTensor3<DType> gradOut(out_grad[batchnorm::kOut], param_.axis);
- const TBlob &saveMean = out_data[batchnorm::kMean];
- const TBlob &saveStd = out_data[batchnorm::kVar];
+ const TBlob& saveMean = out_data[batchnorm::kMean];
+ const TBlob& saveStd = out_data[batchnorm::kVar];
const size_t channelCount = inputData.ChannelCount();
const size_t itemCount = inputData.Size() / channelCount;
// Avoid multiple dptr() call within the channel loop
- AccReal *runningMeanDataPtr = runningMean.dptr<AccReal>();
- AccReal *runningVarDataPtr = runningVariance.dptr<AccReal>();
- const AccReal *saveMeanDataPtr = saveMean.dptr<AccReal>();
- const AccReal *saveInvStdDataPtr = saveStd.dptr<AccReal>();
- AccReal *gradWeightData = gradWeight.dptr<AccReal>();
- AccReal *gradBiasData = gradBias.dptr<AccReal>();
+ AccReal* runningMeanDataPtr = runningMean.dptr<AccReal>();
+ AccReal* runningVarDataPtr = runningVariance.dptr<AccReal>();
+ const AccReal* saveMeanDataPtr = saveMean.dptr<AccReal>();
+ const AccReal* saveInvStdDataPtr = saveStd.dptr<AccReal>();
+ AccReal* gradWeightData = gradWeight.dptr<AccReal>();
+ AccReal* gradBiasData = gradBias.dptr<AccReal>();
const bool is_train_and_not_global_stats = ctx.is_train && !param_.use_global_stats;
- #pragma omp parallel for
+#pragma omp parallel for
for (int channel = 0; channel < static_cast<int>(channelCount); ++channel) {
- const AccReal *weight = weights.dptr<AccReal>();
- const AccReal w = !param_.fix_gamma ? weight[channel] : AccReal(1);
+ const AccReal* weight = weights.dptr<AccReal>();
+ const AccReal w = !param_.fix_gamma ? weight[channel] : AccReal(1);
AccReal mean, invstd;
if (is_train_and_not_global_stats) {
- mean = saveMeanDataPtr[channel];
- invstd = saveInvStdDataPtr[channel];
+ mean = saveMeanDataPtr[channel];
+ invstd = saveInvStdDataPtr[channel];
const AccReal variance = INVSTD_TO_VARIANCE(invstd, param_.eps);
// update running averages
- runningMeanDataPtr[channel] = runningMeanDataPtr[channel] * param_.momentum
- + mean * (AccReal(1) - param_.momentum);
+ runningMeanDataPtr[channel] =
+ runningMeanDataPtr[channel] * param_.momentum + mean * (AccReal(1) - param_.momentum);
- runningVarDataPtr[channel] = runningVarDataPtr[channel] * param_.momentum
- + variance * (AccReal(1) - param_.momentum);
+ runningVarDataPtr[channel] =
+ runningVarDataPtr[channel] * param_.momentum + variance * (AccReal(1) - param_.momentum);
} else {
- mean = runningMeanDataPtr[channel];
+ mean = runningMeanDataPtr[channel];
invstd = VARIANCE_TO_INVSTD(runningVarDataPtr[channel], param_.eps);
}
// sumGradOut over all gradOutput in feature plane
AccReal sumGradOut = 0;
- ForEachFast(gradOut, static_cast<size_t>(channel),
- [&sumGradOut](const DType *gradOut_data) {
- sumGradOut += *gradOut_data;
- });
+ ForEachFast(gradOut, static_cast<size_t>(channel), [&sumGradOut](const DType* gradOut_data) {
+ sumGradOut += *gradOut_data;
+ });
// dot product of the Q(X) and gradOuput
AccReal dotp = 0;
- ForEachFast(inputData, gradOut, static_cast<size_t>(channel),
- [&dotp, mean](const DType *thisInputData, const DType *gradOut_data) {
+ ForEachFast(inputData,
+ gradOut,
+ static_cast<size_t>(channel),
+ [&dotp, mean](const DType* thisInputData, const DType* gradOut_data) {
dotp += (*thisInputData - mean) * (*gradOut_data);
});
@@ -297,28 +301,34 @@ void BatchNormBackwardImpl(mshadow::Stream<cpu> *,
// dL/dX = (Q(dL/dY) - dot(Y, dL/dY) * Y) / σ * w
// projection of gradOutput on to output scaled by std
- const AccReal k = dotp * invstd * invstd / itemCount;
- const AccReal iw = invstd * w;
+ const AccReal k = dotp * invstd * invstd / itemCount;
+ const AccReal iw = invstd * w;
const AccReal gradMean = sumGradOut / itemCount;
if (req[batchnorm::kData] != kAddTo) {
- ForEachFast(inputData, gradIn, static_cast<size_t>(channel),
- [&mean, &k](const DType *inputDataPtr, DType *gradIn_data) {
+ ForEachFast(inputData,
+ gradIn,
+ static_cast<size_t>(channel),
+ [&mean, &k](const DType* inputDataPtr, DType* gradIn_data) {
*gradIn_data = (*inputDataPtr - mean) * k;
});
- ForEachFast(gradOut, gradIn, static_cast<size_t>(channel),
- [iw, gradMean](const DType *gradOut_data, DType *gradIn_data) {
+ ForEachFast(gradOut,
+ gradIn,
+ static_cast<size_t>(channel),
+ [iw, gradMean](const DType* gradOut_data, DType* gradIn_data) {
*gradIn_data = (*gradOut_data - gradMean - *gradIn_data) * iw;
});
} else {
- ForEachFast(inputData, gradOut, gradIn, static_cast<size_t>(channel),
- [&mean, &k, iw, gradMean](const DType *inputDataPtr,
- const DType *gradOut_data,
- DType *gradIn_data) {
- DType normal_val = (*inputDataPtr - mean) * k;
- *gradIn_data += (*gradOut_data - gradMean -
- normal_val) * iw;
- });
+ ForEachFast(
+ inputData,
+ gradOut,
+ gradIn,
+ static_cast<size_t>(channel),
+ [&mean, &k, iw, gradMean](
+ const DType* inputDataPtr, const DType* gradOut_data, DType* gradIn_data) {
+ DType normal_val = (*inputDataPtr - mean) * k;
+ *gradIn_data += (*gradOut_data - gradMean - normal_val) * iw;
+ });
}
} else {
// when in evaluation mode
@@ -327,13 +337,17 @@ void BatchNormBackwardImpl(mshadow::Stream<cpu> *,
// dL/dX = w / running_std
const AccReal iw = invstd * w;
if (req[batchnorm::kData] != kAddTo) {
- ForEachFast(gradOut, gradIn, static_cast<size_t>(channel),
- [iw](const DType *gradOut_data, DType *gradIn_data) {
+ ForEachFast(gradOut,
+ gradIn,
+ static_cast<size_t>(channel),
+ [iw](const DType* gradOut_data, DType* gradIn_data) {
*gradIn_data = *gradOut_data * iw;
});
} else {
- ForEachFast(gradOut, gradIn, static_cast<size_t>(channel),
- [iw](const DType *gradOut_data, DType *gradIn_data) {
+ ForEachFast(gradOut,
+ gradIn,
+ static_cast<size_t>(channel),
+ [iw](const DType* gradOut_data, DType* gradIn_data) {
*gradIn_data += *gradOut_data * iw;
});
}
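Written out, the gradients produced by the training-mode branch above are the standard batch-norm ones (with w = gamma and invstd = 1/sqrt(var + eps)), and the moving statistics are blended with momentum m as in the update a few lines earlier; in the use_global_stats branch the data gradient reduces to gamma * invstd * dL/dy, as the final pair of ForEachFast calls in the evaluation-mode path shows:

\[
\hat{x}_i = (x_i - \mu)\,\text{invstd},\qquad
\frac{\partial L}{\partial \beta} = \sum_i \frac{\partial L}{\partial y_i},\qquad
\frac{\partial L}{\partial \gamma} = \sum_i \hat{x}_i\,\frac{\partial L}{\partial y_i}
\]
\[
\frac{\partial L}{\partial x_i} = \gamma\,\text{invstd}\left(
\frac{\partial L}{\partial y_i}
- \frac{1}{N}\sum_j \frac{\partial L}{\partial y_j}
- \frac{\hat{x}_i}{N}\sum_j \hat{x}_j\,\frac{\partial L}{\partial y_j}\right)
\]
\[
\mu_{\text{run}} \leftarrow m\,\mu_{\text{run}} + (1-m)\,\mu,\qquad
\sigma^2_{\text{run}} \leftarrow m\,\sigma^2_{\text{run}} + (1-m)\,\sigma^2
\]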
@@ -358,28 +372,27 @@ void BatchNormBackwardImpl(mshadow::Stream<cpu> *,
DMLC_REGISTER_PARAMETER(BatchNormParam);
static bool BatchNormShape(const nnvm::NodeAttrs& attrs,
- mxnet::ShapeVector *in_shape,
- mxnet::ShapeVector *out_shape) {
+ mxnet::ShapeVector* in_shape,
+ mxnet::ShapeVector* out_shape) {
const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
using namespace mshadow;
CHECK_EQ(in_shape->size(), 5U) << "Input:[data, gamma, beta, MovingMean, MovingVar]";
CHECK_EQ(out_shape->size(), 3U);
- const mxnet::TShape &dshape = in_shape->at(batchnorm::kData);
+ const mxnet::TShape& dshape = in_shape->at(batchnorm::kData);
if (!mxnet::ndim_is_known(dshape)) {
return false;
}
- const size_t channelAxis = static_cast<size_t>(param.axis < 0
- ? static_cast<int>(dshape.ndim()) + param.axis
- : param.axis);
+ const size_t channelAxis = static_cast<size_t>(
+ param.axis < 0 ? static_cast<int>(dshape.ndim()) + param.axis : param.axis);
CHECK_LT(channelAxis, dshape.ndim()) << "Channel axis out of range: " << param.axis;
const index_t channelCount = dshape[channelAxis];
- in_shape->at(batchnorm::kGamma) = mxnet::TShape(Shape1(channelCount));
- in_shape->at(batchnorm::kBeta) = mxnet::TShape(Shape1(channelCount));
+ in_shape->at(batchnorm::kGamma) = mxnet::TShape(Shape1(channelCount));
+ in_shape->at(batchnorm::kBeta) = mxnet::TShape(Shape1(channelCount));
in_shape->at(batchnorm::kInMovingMean) = mxnet::TShape(Shape1(channelCount)); // kMovingMean
- in_shape->at(batchnorm::kInMovingVar) = mxnet::TShape(Shape1(channelCount)); // kMovingVar
+ in_shape->at(batchnorm::kInMovingVar) = mxnet::TShape(Shape1(channelCount)); // kMovingVar
out_shape->clear();
out_shape->push_back(dshape); // kOut
@@ -390,32 +403,33 @@ static bool BatchNormShape(const nnvm::NodeAttrs& attrs,
}
static bool BatchNormType(const nnvm::NodeAttrs& attrs,
- std::vector<int> *in_type, std::vector<int> *out_type) {
+ std::vector<int>* in_type,
+ std::vector<int>* out_type) {
using namespace mshadow;
CHECK_GE(in_type->size(), 1U);
const size_t n_out = 3;
- // For float16 input type beta, gamma, mean, and average are stored in float32.
- // For other input types, these parameters have the same type as input
- // NOTE: This requirement is from cuDNN (v. 4 and 5)
+ // For float16 input type beta, gamma, mean, and average are stored in
+ // float32. For other input types, these parameters have the same type as
+ // input NOTE: This requirement is from cuDNN (v. 4 and 5)
int dtype_param;
int dtype = (*in_type)[0];
if (type_is_none(dtype)) {
// Input type is undefined, we try backward inference
- if (out_type->size() == 0 || type_is_none((*out_type)[0])) {
- // Neither the input nor the output are defined,
- // types cannot be inferred for this op
- return false;
- } else {
- // Input type is undefined but output type is: backward inference
- dtype = (*out_type)[0];
- (*in_type)[0] = dtype;
- MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DTypeX, AccRealX, {
- dtype_param = mshadow::DataType<AccRealX>::kFlag; });
- }
+ if (out_type->size() == 0 || type_is_none((*out_type)[0])) {
+ // Neither the input nor the output are defined,
+ // types cannot be inferred for this op
+ return false;
+ } else {
+ // Input type is undefined but output type is: backward inference
+ dtype = (*out_type)[0];
+ (*in_type)[0] = dtype;
+ MSHADOW_REAL_TYPE_SWITCH_EX(
+ dtype, DTypeX, AccRealX, { dtype_param = mshadow::DataType<AccRealX>::kFlag; });
+ }
} else {
// Input type is defined but output type is not: forward inference
- MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DTypeX, AccRealX, {
- dtype_param = mshadow::DataType<AccRealX>::kFlag; });
+ MSHADOW_REAL_TYPE_SWITCH_EX(
+ dtype, DTypeX, AccRealX, { dtype_param = mshadow::DataType<AccRealX>::kFlag; });
out_type->clear();
out_type->push_back(dtype);
for (size_t i = 1; i < n_out; ++i) {
@@ -435,29 +449,30 @@ static bool BatchNormType(const nnvm::NodeAttrs& attrs,
}
#if MXNET_USE_MKLDNN == 1
-static inline bool SupportMKLDNNBN(const NDArray &input, const BatchNormParam &param) {
- if (mxnet::op::batchnorm::disable_mkl) return false;
+static inline bool SupportMKLDNNBN(const NDArray& input, const BatchNormParam& param) {
+ if (mxnet::op::batchnorm::disable_mkl)
+ return false;
const mxnet::TShape shape = input.shape();
- const int ndim = shape.ndim();
- if (ndim == 0 || shape.Size() == 0) return false;
+ const int ndim = shape.ndim();
+ if (ndim == 0 || shape.Size() == 0)
+ return false;
const int dtype = input.dtype();
- return (dtype == mshadow::kFloat32 ||
- dtype == mshadow::kBfloat16) &&
- SupportStorageMKLDNN(input.storage_type());
+ return (dtype == mshadow::kFloat32 || dtype == mshadow::kBfloat16) &&
+ SupportStorageMKLDNN(input.storage_type());
}
-void BatchNormComputeExCPU(const nnvm::NodeAttrs &attrs,
- const OpContext &ctx,
- const std::vector<NDArray> &inputs,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs) {
+void BatchNormComputeExCPU(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs) {
CHECK_EQ(inputs.size(), 5U);
- const BatchNormParam &param = nnvm::get<BatchNormParam>(attrs.parsed);
- bool fuse_relu = false;
+ const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
+ bool fuse_relu = false;
if (SupportMKLDNNBN(inputs[0], param)) {
MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
MKLDNN_REAL_TYPE_SWITCH(inputs[0].dtype(), DTYPE, {
- MKLDNNBatchNormForward<DTYPE>(attrs, ctx, inputs, req, outputs, fuse_relu);
+ MKLDNNBatchNormForward<DTYPE>(attrs, ctx, inputs, req, outputs, fuse_relu);
});
MKLDNN_OPCHECK_RUN(BatchNormCompute<cpu>, attrs, ctx, inputs, req, outputs);
return;
@@ -465,52 +480,53 @@ void BatchNormComputeExCPU(const nnvm::NodeAttrs &attrs,
FallBackCompute(BatchNormCompute<cpu>, attrs, ctx, inputs, req, outputs);
}
-void BatchNormGradComputeExCPU(const nnvm::NodeAttrs &attrs,
- const OpContext &ctx,
- const std::vector<NDArray> &inputs,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs) {
- const BatchNormParam &param = nnvm::get<BatchNormParam>(attrs.parsed);
- bool fuse_relu = false;
+void BatchNormGradComputeExCPU(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs) {
+ const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
+ bool fuse_relu = false;
if (SupportMKLDNNBN(inputs[0], param)) {
- MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
- MKLDNNBatchNormBackward<float>(attrs, ctx, inputs, req, outputs, fuse_relu);
- MKLDNN_OPCHECK_RUN(BatchNormGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
- return;
+ MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
+ MKLDNNBatchNormBackward<float>(attrs, ctx, inputs, req, outputs, fuse_relu);
+ MKLDNN_OPCHECK_RUN(BatchNormGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
+ return;
}
FallBackCompute(BatchNormGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
}
#endif
-static inline bool BatchNormStorageType(const nnvm::NodeAttrs &attrs,
+static inline bool BatchNormStorageType(const nnvm::NodeAttrs& attrs,
const int dev_mask,
- DispatchMode *dispatch_mode,
- std::vector<int> *in_attrs,
- std::vector<int> *out_attrs) {
- const BatchNormParam &param = nnvm::get<BatchNormParam>(attrs.parsed);
+ DispatchMode* dispatch_mode,
+ std::vector<int>* in_attrs,
+ std::vector<int>* out_attrs) {
+ const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
bool dispatched = false;
#if MXNET_USE_MKLDNN == 1
if (!dispatched) {
- dispatched = MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode,
- in_attrs, out_attrs);
+ dispatched = MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs);
}
if (!MKLDNNEnvSet()) {
*dispatch_mode = DispatchMode::kFComputeFallback;
}
#else
for (int& v : *in_attrs)
- if (v == - 1) v = kDefaultStorage;
+ if (v == -1)
+ v = kDefaultStorage;
if (!dispatched && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)) {
- dispatched = storage_type_assign(out_attrs, kDefaultStorage,
- dispatch_mode, DispatchMode::kFCompute);
+ dispatched =
+ storage_type_assign(out_attrs, kDefaultStorage, dispatch_mode, DispatchMode::kFCompute);
}
if (!dispatched) {
dispatched = dispatch_fallback(out_attrs, dispatch_mode);
}
#endif
if (!common::ContainsOnlyStorage(*in_attrs, kDefaultStorage) && param.fix_gamma) {
- LOG(FATAL) << "fix_gamma=True is not supported for sparse ndarrays. Tracked at #11647";
+ LOG(FATAL) << "fix_gamma=True is not supported for sparse ndarrays. "
+ "Tracked at #11647";
}
return dispatched;
}
@@ -533,10 +549,10 @@ std::vector<nnvm::NodeEntry> BatchNormGrad(const nnvm::ObjectPtr& n,
heads.emplace_back(n->inputs.at(batchnorm::kInMovingVar));
nnvm::ObjectPtr gnode = nnvm::Node::Create();
- gnode->inputs = std::move(heads);
+ gnode->inputs = std::move(heads);
gnode->control_deps.emplace_back(n);
- gnode->attrs = n->attrs;
- gnode->attrs.op = nnvm::Op::Get("_backward_BatchNorm");
+ gnode->attrs = n->attrs;
+ gnode->attrs.op = nnvm::Op::Get("_backward_BatchNorm");
gnode->attrs.name = n->attrs.name + "_backward";
// The input of batchnorm
std::vector<nnvm::NodeEntry> in_grad;
@@ -545,8 +561,8 @@ std::vector<nnvm::NodeEntry> BatchNormGrad(const nnvm::ObjectPtr& n,
in_grad.emplace_back(gnode, i, 0);
// attach no gradient node to forbid gradient on aux_state
nnvm::ObjectPtr ng = nnvm::Node::Create();
- ng->attrs.op = Op::Get("_NoGradient");
- ng->attrs.name = "NoGradient";
+ ng->attrs.op = Op::Get("_NoGradient");
+ ng->attrs.name = "NoGradient";
// the aux state of batchnorm
for (size_t i = 3; i < 5; ++i)
in_grad.emplace_back(ng);
@@ -554,8 +570,8 @@ std::vector<nnvm::NodeEntry> BatchNormGrad(const nnvm::ObjectPtr& n,
}
NNVM_REGISTER_OP(BatchNorm)
-.add_alias("_npx_batch_norm")
-.describe(R"code(Batch normalization.
+ .add_alias("_npx_batch_norm")
+ .describe(R"code(Batch normalization.
Normalizes a data batch by mean and variance, and applies a scale ``gamma`` as
well as offset ``beta``.
@@ -605,75 +621,82 @@ then set ``gamma`` to 1 and its gradient to 0.
the sparse tensors will fallback.
)code" ADD_FILELINE)
-.set_num_inputs(5)
-.set_num_outputs(3)
-.set_attr_parser(ParamParser<BatchNormParam>)
-.set_attr<nnvm::FListInputNames>("FListInputNames",
- [](const NodeAttrs& attrs) {
- return std::vector<std::string>{"data", "gamma", "beta", "moving_mean", "moving_var"};
-})
-.set_attr<nnvm::FListOutputNames>("FListOutputNames",
- [](const NodeAttrs& attrs) {
- return std::vector<std::string>{"output", "mean", "var"};
-})
-.set_attr<nnvm::FNumVisibleOutputs>("FNumVisibleOutputs",
- [](const NodeAttrs& attrs) {
- const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
- return param.output_mean_var ? 3 : 1;
-})
-.set_attr<nnvm::FMutateInputs>("FMutateInputs", [](const nnvm::NodeAttrs& attrs) {
- return std::vector<uint32_t>{3, 4};
-})
-.set_attr<mxnet::FInferShape>("FInferShape", BatchNormShape)
-.set_attr<nnvm::FInferType>("FInferType", BatchNormType)
-.set_attr<FInferStorageType>("FInferStorageType", BatchNormStorageType)
-.set_attr<FCompute>("FCompute<cpu>", BatchNormCompute<cpu>)
+ .set_num_inputs(5)
+ .set_num_outputs(3)
+ .set_attr_parser(ParamParser<BatchNormParam>)
+ .set_attr<nnvm::FListInputNames>(
+ "FListInputNames",
+ [](const NodeAttrs& attrs) {
+ return std::vector<std::string>{"data", "gamma", "beta", "moving_mean", "moving_var"};
+ })
+ .set_attr<nnvm::FListOutputNames>("FListOutputNames",
+ [](const NodeAttrs& attrs) {
+ return std::vector<std::string>{"output", "mean", "var"};
+ })
+ .set_attr<nnvm::FNumVisibleOutputs>("FNumVisibleOutputs",
+ [](const NodeAttrs& attrs) {
+ const BatchNormParam& param =
+ nnvm::get<BatchNormParam>(attrs.parsed);
+ return param.output_mean_var ? 3 : 1;
+ })
+ .set_attr<nnvm::FMutateInputs>("FMutateInputs",
+ [](const nnvm::NodeAttrs& attrs) {
+ return std::vector<uint32_t>{3, 4};
+ })
+ .set_attr<mxnet::FInferShape>("FInferShape", BatchNormShape)
+ .set_attr<nnvm::FInferType>("FInferType", BatchNormType)
+ .set_attr<FInferStorageType>("FInferStorageType", BatchNormStorageType)
+ .set_attr<FCompute>("FCompute<cpu>", BatchNormCompute<cpu>)
#if MXNET_USE_MKLDNN == 1
-.set_attr<FComputeEx>("FComputeEx<cpu>", BatchNormComputeExCPU)
+ .set_attr<FComputeEx>("FComputeEx<cpu>", BatchNormComputeExCPU)
#endif
-.set_attr<nnvm::FGradient>("FGradient", BatchNormGrad)
+ .set_attr<nnvm::FGradient>("FGradient", BatchNormGrad)
#if MXNET_USE_MKLDNN == 1
-.set_attr<bool>("TIsMKLDNN", true)
-.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
- return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-})
+ .set_attr<bool>("TIsMKLDNN", true)
+ .set_attr<FResourceRequest>("FResourceRequest",
+ [](const NodeAttrs& n) {
+ return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+ })
#endif
-.add_argument("data", "NDArray-or-Symbol", "Input data to batch normalization")
-.add_argument("gamma", "NDArray-or-Symbol", "gamma array")
-.add_argument("beta", "NDArray-or-Symbol", "beta array")
-.add_argument("moving_mean", "NDArray-or-Symbol", "running mean of input")
-.add_argument("moving_var", "NDArray-or-Symbol", "running variance of input")
-.add_arguments(BatchNormParam::__FIELDS__())
-.set_attr<nnvm::FSetInputVarAttrOnCompose>(
- "FSetInputVarAttrOnCompose",
- [](const nnvm::NodeAttrs& attrs, nnvm::ObjectPtr var, const int index) {
- if (var->attrs.dict.find("__init__") != var->attrs.dict.end()) return;
- if (index == 3) {
- var->attrs.dict["__init__"] = "[\"zero\", {}]";
- } else if (index == 4) {
- var->attrs.dict["__init__"] = "[\"one\", {}]";
- }
- });
+ .add_argument("data", "NDArray-or-Symbol", "Input data to batch normalization")
+ .add_argument("gamma", "NDArray-or-Symbol", "gamma array")
+ .add_argument("beta", "NDArray-or-Symbol", "beta array")
+ .add_argument("moving_mean", "NDArray-or-Symbol", "running mean of input")
+ .add_argument("moving_var", "NDArray-or-Symbol", "running variance of input")
+ .add_arguments(BatchNormParam::__FIELDS__())
+ .set_attr<nnvm::FSetInputVarAttrOnCompose>(
+ "FSetInputVarAttrOnCompose",
+ [](const nnvm::NodeAttrs& attrs, nnvm::ObjectPtr var, const int index) {
+ if (var->attrs.dict.find("__init__") != var->attrs.dict.end())
+ return;
+ if (index == 3) {
+ var->attrs.dict["__init__"] = "[\"zero\", {}]";
+ } else if (index == 4) {
+ var->attrs.dict["__init__"] = "[\"one\", {}]";
+ }
+ });
NNVM_REGISTER_OP(_backward_BatchNorm)
-.set_num_inputs(8)
-.set_num_outputs(3)
-.set_attr<nnvm::FMutateInputs>("FMutateInputs", [](const nnvm::NodeAttrs& attrs) {
- return std::vector<uint32_t>{6, 7}; // moving_mean, moving_var
-})
-.set_attr<nnvm::TIsBackward>("TIsBackward", true)
-.set_attr<FInferStorageType>("FInferStorageType", BatchNormStorageType)
+ .set_num_inputs(8)
+ .set_num_outputs(3)
+ .set_attr<nnvm::FMutateInputs>("FMutateInputs",
+ [](const nnvm::NodeAttrs& attrs) {
+ return std::vector<uint32_t>{6, 7}; // moving_mean, moving_var
+ })
+ .set_attr<nnvm::TIsBackward>("TIsBackward", true)
+ .set_attr<FInferStorageType>("FInferStorageType", BatchNormStorageType)
#if MXNET_USE_MKLDNN == 1
-.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
- return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-})
+ .set_attr<FResourceRequest>("FResourceRequest",
+ [](const NodeAttrs& n) {
+ return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+ })
#endif
-.set_attr_parser(ParamParser<BatchNormParam>)
+ .set_attr_parser(ParamParser<BatchNormParam>)
#if MXNET_USE_MKLDNN == 1
-.set_attr<bool>("TIsMKLDNN", true)
-.set_attr<FComputeEx>("FComputeEx<cpu>", BatchNormGradComputeExCPU)
+ .set_attr<bool>("TIsMKLDNN", true)
+ .set_attr<FComputeEx>("FComputeEx<cpu>", BatchNormGradComputeExCPU)
#endif
-.set_attr<FCompute>("FCompute<cpu>", BatchNormGradCompute<cpu>);
+ .set_attr<FCompute>("FCompute<cpu>", BatchNormGradCompute<cpu>);
} // namespace op
} // namespace mxnet
diff --git a/src/operator/nn/batch_norm.cu b/src/operator/nn/batch_norm.cu
index 40d677a..0c70f85 100644
--- a/src/operator/nn/batch_norm.cu
+++ b/src/operator/nn/batch_norm.cu
@@ -47,10 +47,6 @@
using namespace mxnet;
-/*! \brief inverse standard deviation <-> variance */
-#define VARIANCE_TO_INVSTD(__var$, __eps$) (1.0/sqrt((__var$) + DType(__eps$)))
-#define INVSTD_TO_VARIANCE(__invstd$, __eps$) ((1.0 / ((__invstd$) * (__invstd$))) - (__eps$))
-
namespace mxnet {
namespace op {
namespace batchnorm {
diff --git a/src/operator/nn/mkldnn/mkldnn_act-inl.h b/src/operator/nn/mkldnn/mkldnn_act-inl.h
index 70bf16a..ad5d70e 100644
--- a/src/operator/nn/mkldnn/mkldnn_act-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_act-inl.h
@@ -22,17 +22,17 @@
* \file mkldnn_act-inl.h
* \brief MKLDNN Activation operator
 * \author Zhiyuan Huang
-*/
+ */
#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_ACT_INL_H_
#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_ACT_INL_H_
-
#if MXNET_USE_MKLDNN == 1
-#include <vector>
#include <utility>
-#include "../activation-inl.h"
+#include <vector>
+
#include "../../leaky_relu-inl.h"
+#include "../activation-inl.h"
namespace mxnet {
namespace op {
@@ -42,53 +42,56 @@ struct MKLDNNActParam {
float slope = 0.f;
bool operator==(const MKLDNNActParam& other) const {
- return this->alg == other.alg &&
- this->slope == other.slope;
+ return this->alg == other.alg && this->slope == other.slope;
}
};
mkldnn::algorithm GetMKLDNNActAlgo(const ActivationParam& param);
mkldnn::algorithm GetMKLDNNActAlgo(const LeakyReLUParam& param);
-mkldnn::eltwise_forward::primitive_desc GetActFwdDescImpl(
- const MKLDNNActParam& param, bool is_train,
- const mkldnn::memory &input_mem);
+mkldnn::eltwise_forward::primitive_desc GetActFwdDescImpl(const MKLDNNActParam& param,
+ bool is_train,
+ const mkldnn::memory& input_mem);
class MKLDNNActForward {
public:
const mkldnn::eltwise_forward::primitive_desc fwd_pd;
- MKLDNNActForward(const MKLDNNActParam& param, bool is_train,
- const NDArray &data, const mkldnn::memory &mem): fwd_pd(
- GetActFwdDescImpl(param, is_train, mem)) {
+ MKLDNNActForward(const MKLDNNActParam& param,
+ bool is_train,
+ const NDArray& data,
+ const mkldnn::memory& mem)
+ : fwd_pd(GetActFwdDescImpl(param, is_train, mem)) {
fwd_ = std::make_shared<mkldnn::eltwise_forward>(fwd_pd);
}
- const inline mkldnn::eltwise_forward &GetFwd() const;
+ const inline mkldnn::eltwise_forward& GetFwd() const;
private:
std::shared_ptr<mkldnn::eltwise_forward> fwd_;
};
typedef ParamOpSign<MKLDNNActParam> MKLDNNActSignature;
-MKLDNNActForward &GetActForward(const MKLDNNActParam& param,
- const OpContext &ctx, const NDArray &in_data,
- const mkldnn::memory &in_mem);
+MKLDNNActForward& GetActForward(const MKLDNNActParam& param,
+ const OpContext& ctx,
+ const NDArray& in_data,
+ const mkldnn::memory& in_mem);
-mkldnn::eltwise_backward::primitive_desc GetActBwdDescImpl(
- const MKLDNNActParam &param, const mkldnn::memory &input_mem,
- const mkldnn::memory &diff_dst_memory);
+mkldnn::eltwise_backward::primitive_desc GetActBwdDescImpl(const MKLDNNActParam& param,
+ const mkldnn::memory& input_mem,
+ const mkldnn::memory& diff_dst_memory);
class MKLDNNActBackward {
public:
const mkldnn::eltwise_backward::primitive_desc bwd_pd;
- explicit MKLDNNActBackward(const MKLDNNActParam &param, const NDArray &data,
- const mkldnn::memory &mem,
- const mkldnn::memory &diff_dst_memory): bwd_pd(
- GetActBwdDescImpl(param, mem, diff_dst_memory)) {
+ explicit MKLDNNActBackward(const MKLDNNActParam& param,
+ const NDArray& data,
+ const mkldnn::memory& mem,
+ const mkldnn::memory& diff_dst_memory)
+ : bwd_pd(GetActBwdDescImpl(param, mem, diff_dst_memory)) {
bwd_prim_ = std::make_shared<mkldnn::eltwise_backward>(bwd_pd);
}
- const inline mkldnn::eltwise_backward &GetBwd() const;
+ const inline mkldnn::eltwise_backward& GetBwd() const;
private:
std::shared_ptr<mkldnn::eltwise_backward> bwd_prim_;
@@ -97,12 +100,12 @@ class MKLDNNActBackward {
} // namespace mxnet
namespace std {
-template<>
+template <>
struct hash<mxnet::op::MKLDNNActParam> {
size_t operator()(const mxnet::op::MKLDNNActParam& val) {
size_t ret = 0;
- ret = dmlc::HashCombine(ret, static_cast<size_t>(val.alg));
- ret = dmlc::HashCombine(ret, val.slope);
+ ret = dmlc::HashCombine(ret, static_cast<size_t>(val.alg));
+ ret = dmlc::HashCombine(ret, val.slope);
return ret;
}
};
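GetActForward and GetActBackward, declared above, hand back primitives that are cached per thread and keyed by an MKLDNNActSignature derived from the activation parameters (the unordered_map lookups appear in mkldnn_act.cc just below). A minimal sketch of that lookup-or-create pattern, with toy key and primitive types standing in for the MKLDNN ones:

// Signature-keyed, thread-local cache: build a primitive once per unique key,
// reuse it afterwards. Toy types only; the real code hashes MKLDNNActParam
// (and the input array) into the key.
#include <cstdio>
#include <string>
#include <unordered_map>

struct Primitive {
  explicit Primitive(int alg) : alg_(alg) { std::printf("built primitive %d\n", alg); }
  int alg_;
};

Primitive& GetCachedPrimitive(const std::string& key, int alg) {
  static thread_local std::unordered_map<std::string, Primitive> cache;
  auto it = cache.find(key);
  if (it == cache.end())
    it = cache.emplace(key, Primitive(alg)).first;  // construct once per unique key
  return it->second;
}

int main() {
  GetCachedPrimitive("relu/f32/1x3x224x224", 0);
  GetCachedPrimitive("relu/f32/1x3x224x224", 0);  // cache hit: nothing is rebuilt
  return 0;
}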
diff --git a/src/operator/nn/mkldnn/mkldnn_act.cc b/src/operator/nn/mkldnn/mkldnn_act.cc
index 29ff8d9..6f4ac3d 100644
--- a/src/operator/nn/mkldnn/mkldnn_act.cc
+++ b/src/operator/nn/mkldnn/mkldnn_act.cc
@@ -21,57 +21,54 @@
* \file mkldnn_act.cc
* \brief
* \author Da Zheng
-*/
+ */
#if MXNET_USE_MKLDNN == 1
#include <dmlc/logging.h>
#include <dmlc/parameter.h>
#include <mxnet/operator.h>
+
#include <algorithm>
#include <map>
-#include <vector>
#include <string>
#include <utility>
+#include <vector>
+
#include "../../operator_common.h"
-#include "mkldnn_act-inl.h"
#include "./mkldnn_base-inl.h"
+#include "mkldnn_act-inl.h"
namespace mxnet {
namespace op {
bool SupportMKLDNNAct(const ActivationParam& param) {
- return param.act_type == activation::kReLU
- || param.act_type == activation::kSigmoid
- || param.act_type == activation::kSoftReLU
- || param.act_type == activation::kTanh;
+ return param.act_type == activation::kReLU || param.act_type == activation::kSigmoid ||
+ param.act_type == activation::kSoftReLU || param.act_type == activation::kTanh;
}
-bool SupportMKLDNNAct(const ActivationParam& param, const NDArray &input) {
+bool SupportMKLDNNAct(const ActivationParam& param, const NDArray& input) {
// MKL-DNN Activation supports 1d, 2d, 3d, 4d and 5d data layout
- if ((input.shape().ndim() < 1) ||
- (input.shape().ndim() > 5) ||
+ if ((input.shape().ndim() < 1) || (input.shape().ndim() > 5) ||
!(input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16))
return false;
return SupportMKLDNNAct(param);
}
bool SupportMKLDNNLeakyRelu(const LeakyReLUParam& param) {
- return param.act_type == leakyrelu::kLeakyReLU
- || param.act_type == leakyrelu::kELU
- || param.act_type == leakyrelu::kGELU;
+ return param.act_type == leakyrelu::kLeakyReLU || param.act_type == leakyrelu::kELU ||
+ param.act_type == leakyrelu::kGELU;
}
-bool SupportMKLDNNLeakyRelu(const LeakyReLUParam& param, const NDArray &input) {
+bool SupportMKLDNNLeakyRelu(const LeakyReLUParam& param, const NDArray& input) {
// MKL-DNN Activation supports 1d, 2d, 3d, 4d and 5d data layout
- if ((input.shape().ndim() < 1) ||
- (input.shape().ndim() > 5) ||
+ if ((input.shape().ndim() < 1) || (input.shape().ndim() > 5) ||
!(input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16))
return false;
return SupportMKLDNNLeakyRelu(param);
}
-bool SupportQuantizedMKLDNNAct(const ActivationParam &param) {
+bool SupportQuantizedMKLDNNAct(const ActivationParam& param) {
// TODO(zhennan): Add more activation type when mkldnn supports.
// Remove this when it's identity to SupportMKLDNNAct.
return param.act_type == activation::kReLU;
@@ -107,26 +104,26 @@ mkldnn::algorithm GetMKLDNNActAlgo(const LeakyReLUParam& param) {
}
}
-mkldnn::eltwise_forward::primitive_desc GetActFwdDescImpl(
- const MKLDNNActParam& param, bool is_train,
- const mkldnn::memory &input_mem) {
+mkldnn::eltwise_forward::primitive_desc GetActFwdDescImpl(const MKLDNNActParam& param,
+ bool is_train,
+ const mkldnn::memory& input_mem) {
mkldnn::memory::desc data_md = input_mem.get_desc();
- auto cpu_engine = CpuEngine::Get()->get_engine();
- auto alg = param.alg;
+ auto cpu_engine = CpuEngine::Get()->get_engine();
+ auto alg = param.alg;
- auto prop = is_train ? mkldnn::prop_kind::forward_training :
- mkldnn::prop_kind::forward_scoring;
+ auto prop = is_train ? mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring;
auto desc = mkldnn::eltwise_forward::desc(prop, alg, data_md, param.slope);
return mkldnn::eltwise_forward::primitive_desc(desc, cpu_engine);
}
-const inline mkldnn::eltwise_forward &MKLDNNActForward::GetFwd() const {
+const inline mkldnn::eltwise_forward& MKLDNNActForward::GetFwd() const {
return *fwd_;
}
-MKLDNNActForward &GetActForward(const MKLDNNActParam& param,
- const OpContext &ctx, const NDArray &in_data,
- const mkldnn::memory &in_mem) {
+MKLDNNActForward& GetActForward(const MKLDNNActParam& param,
+ const OpContext& ctx,
+ const NDArray& in_data,
+ const mkldnn::memory& in_mem) {
#if DMLC_CXX11_THREAD_LOCAL
static thread_local std::unordered_map<MKLDNNActSignature, MKLDNNActForward, OpHash> fwds;
#else
@@ -145,72 +142,75 @@ MKLDNNActForward &GetActForward(const MKLDNNActParam& param,
return it->second;
}
-void MKLDNNActivationForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
- const NDArray &in_data, const OpReqType &req,
- const NDArray &out_data) {
+void MKLDNNActivationForward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const NDArray& in_data,
+ const OpReqType& req,
+ const NDArray& out_data) {
const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
MKLDNNActParam param_;
- param_.alg = GetMKLDNNActAlgo(param);
+ param_.alg = GetMKLDNNActAlgo(param);
const NDArray& in_buffer = in_data;
- MKLDNNStream *stream = MKLDNNStream::Get();
- auto input_mem = in_buffer.GetMKLDNNData();
- MKLDNNActForward &fwd = GetActForward(param_, ctx, in_buffer, *input_mem);
- auto out_mem_t = CreateMKLDNNMem(out_data, fwd.fwd_pd.dst_desc(), req, &in_buffer);
+ MKLDNNStream* stream = MKLDNNStream::Get();
+ auto input_mem = in_buffer.GetMKLDNNData();
+ MKLDNNActForward& fwd = GetActForward(param_, ctx, in_buffer, *input_mem);
+ auto out_mem_t = CreateMKLDNNMem(out_data, fwd.fwd_pd.dst_desc(), req, &in_buffer);
stream->RegisterPrimArgs(fwd.GetFwd(),
- {{ MKLDNN_ARG_SRC, *input_mem}, { MKLDNN_ARG_DST, *out_mem_t.second}});
+ {{MKLDNN_ARG_SRC, *input_mem}, {MKLDNN_ARG_DST, *out_mem_t.second}});
CommitOutput(out_data, out_mem_t);
stream->Submit();
}
-void MKLDNNLeakyReluForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
- const NDArray &in_data, const OpReqType &req,
- const NDArray &out_data) {
+void MKLDNNLeakyReluForward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const NDArray& in_data,
+ const OpReqType& req,
+ const NDArray& out_data) {
const LeakyReLUParam& param = nnvm::get<LeakyReLUParam>(attrs.parsed);
MKLDNNActParam param_;
- param_.alg = GetMKLDNNActAlgo(param);
+ param_.alg = GetMKLDNNActAlgo(param);
param_.slope = param.slope;
- NDArray in_buffer = in_data;
- MKLDNNStream *stream = MKLDNNStream::Get();
+ NDArray in_buffer = in_data;
+ MKLDNNStream* stream = MKLDNNStream::Get();
if (in_data.IsView() && in_data.IsMKLDNNData())
in_buffer = in_data.Reorder2Default();
- auto input_mem = in_buffer.GetMKLDNNData();
- MKLDNNActForward &fwd = GetActForward(param_, ctx, in_buffer, *input_mem);
- auto out_mem_t = CreateMKLDNNMem(out_data, fwd.fwd_pd.dst_desc(), req, &in_buffer);
+ auto input_mem = in_buffer.GetMKLDNNData();
+ MKLDNNActForward& fwd = GetActForward(param_, ctx, in_buffer, *input_mem);
+ auto out_mem_t = CreateMKLDNNMem(out_data, fwd.fwd_pd.dst_desc(), req, &in_buffer);
stream->RegisterPrimArgs(fwd.GetFwd(),
- {{ MKLDNN_ARG_SRC, *input_mem}, { MKLDNN_ARG_DST, *out_mem_t.second}});
+ {{MKLDNN_ARG_SRC, *input_mem}, {MKLDNN_ARG_DST, *out_mem_t.second}});
CommitOutput(out_data, out_mem_t);
stream->Submit();
}
-mkldnn::eltwise_backward::primitive_desc GetActBwdDescImpl(
- const MKLDNNActParam &param, const mkldnn::memory &input_mem,
- const mkldnn::memory &diff_dst_memory) {
+mkldnn::eltwise_backward::primitive_desc GetActBwdDescImpl(const MKLDNNActParam& param,
+ const mkldnn::memory& input_mem,
+ const mkldnn::memory& diff_dst_memory) {
mkldnn::memory::desc data_md = input_mem.get_desc();
mkldnn::memory::desc diff_md = diff_dst_memory.get_desc();
- auto cpu_engine = CpuEngine::Get()->get_engine();
- auto alg = param.alg;
+ auto cpu_engine = CpuEngine::Get()->get_engine();
+ auto alg = param.alg;
- mkldnn::eltwise_forward::desc fw_desc(mkldnn::prop_kind::forward_training,
- alg, data_md, param.slope);
+ mkldnn::eltwise_forward::desc fw_desc(
+ mkldnn::prop_kind::forward_training, alg, data_md, param.slope);
mkldnn::eltwise_forward::primitive_desc fw_pdesc(fw_desc, cpu_engine);
mkldnn::eltwise_backward::desc bw_desc(alg, diff_md, data_md, param.slope);
- mkldnn::eltwise_backward::primitive_desc bw_pdesc(bw_desc, cpu_engine,
- fw_pdesc);
+ mkldnn::eltwise_backward::primitive_desc bw_pdesc(bw_desc, cpu_engine, fw_pdesc);
return bw_pdesc;
}
-const inline mkldnn::eltwise_backward &MKLDNNActBackward::GetBwd() const {
+const inline mkldnn::eltwise_backward& MKLDNNActBackward::GetBwd() const {
return *bwd_prim_;
}
-static inline MKLDNNActBackward &GetActBackward(const MKLDNNActParam &param,
- const OpContext &ctx,
- const NDArray &in_data,
- const NDArray &out_grad,
- const mkldnn::memory &in_mem) {
+static inline MKLDNNActBackward& GetActBackward(const MKLDNNActParam& param,
+ const OpContext& ctx,
+ const NDArray& in_data,
+ const NDArray& out_grad,
+ const mkldnn::memory& in_mem) {
#if DMLC_CXX11_THREAD_LOCAL
static thread_local std::unordered_map<MKLDNNActSignature, MKLDNNActBackward, OpHash> bwds;
#else
@@ -228,38 +228,38 @@ static inline MKLDNNActBackward &GetActBackward(const MKLDNNActParam &param,
return it->second;
}
-// For backward relu activation, it's okay to pass "out_data" as "in_data" to this
-// function, since the computation only involves non-zeros.
-void MKLDNNActivationBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
- const std::vector<NDArray> &inputs, const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs) {
+// For backward relu activation, it's okay to pass "out_data" as "in_data" to
+// this function, since the computation only involves non-zeros.
+void MKLDNNActivationBackward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs) {
if (req[0] == kNullOp) {
return;
}
const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
// XXX: for y = relu(x), y is passed as "in_data" to Backward()
- const bool relu = param.act_type == activation::kReLU;
- const NDArray &out_buffer = inputs[0];
- const NDArray &in_buffer = relu ? inputs[1] : inputs[2];
- const NDArray &in_grad = outputs[0];
+ const bool relu = param.act_type == activation::kReLU;
+ const NDArray& out_buffer = inputs[0];
+ const NDArray& in_buffer = relu ? inputs[1] : inputs[2];
+ const NDArray& in_grad = outputs[0];
MKLDNNActParam param_;
param_.alg = GetMKLDNNActAlgo(param);
TmpMemMgr::Get()->Init(ctx.requested[activation::kTempSpace]);
auto diff_dst_memory = out_buffer.GetMKLDNNData();
- auto input_mem = in_buffer.GetMKLDNNData();
+ auto input_mem = in_buffer.GetMKLDNNData();
 // We need to make sure the two inputs to eltwise_backward have the same memory
// descriptor. Otherwise, the perf will suffer.
if (input_mem->get_desc() != diff_dst_memory->get_desc())
input_mem = in_buffer.GetMKLDNNDataReorder(diff_dst_memory->get_desc());
- MKLDNNActBackward &bwd =
- GetActBackward(param_, ctx, in_buffer, out_buffer, *input_mem);
- MKLDNNStream *stream = MKLDNNStream::Get();
- mkldnn_output_t diff_src_memory =
- CreateMKLDNNMem(in_grad, bwd.bwd_pd.diff_src_desc(), req[0]);
- mkldnn_args_map_t args = {
- { MKLDNN_ARG_SRC, *input_mem },
- { MKLDNN_ARG_DIFF_DST, *diff_dst_memory },
- { MKLDNN_ARG_DIFF_SRC, *diff_src_memory.second },
+ MKLDNNActBackward& bwd = GetActBackward(param_, ctx, in_buffer, out_buffer, *input_mem);
+ MKLDNNStream* stream = MKLDNNStream::Get();
+ mkldnn_output_t diff_src_memory = CreateMKLDNNMem(in_grad, bwd.bwd_pd.diff_src_desc(), req[0]);
+ mkldnn_args_map_t args = {
+ {MKLDNN_ARG_SRC, *input_mem},
+ {MKLDNN_ARG_DIFF_DST, *diff_dst_memory},
+ {MKLDNN_ARG_DIFF_SRC, *diff_src_memory.second},
};
stream->RegisterPrimArgs(bwd.GetBwd(), args);
CommitOutput(in_grad, diff_src_memory);
@@ -267,40 +267,38 @@ void MKLDNNActivationBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx
}
void MKLDNNLeakyReluBackward(const nnvm::NodeAttrs& attrs,
- const OpContext &ctx,
+ const OpContext& ctx,
const std::vector<NDArray>& inputs,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs) {
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs) {
if (req[0] == kNullOp) {
return;
}
CHECK_EQ(inputs.size(), 2U);
CHECK_EQ(outputs.size(), 1U);
const NDArray& out_buffer = inputs[0];
- const NDArray& in_buffer = inputs[1];
- const NDArray &output = outputs[0];
+ const NDArray& in_buffer = inputs[1];
+ const NDArray& output = outputs[0];
const LeakyReLUParam& param = nnvm::get<LeakyReLUParam>(attrs.parsed);
MKLDNNActParam param_;
- param_.alg = GetMKLDNNActAlgo(param);
+ param_.alg = GetMKLDNNActAlgo(param);
param_.slope = param.slope;
TmpMemMgr::Get()->Init(ctx.requested[leakyrelu::kRandom]);
auto diff_dst_memory = out_buffer.GetMKLDNNData();
- auto input_mem = in_buffer.GetMKLDNNData();
+ auto input_mem = in_buffer.GetMKLDNNData();
 // We need to make sure the two inputs to eltwise_backward have the same memory
// descriptor. Otherwise, the perf will suffer.
if (input_mem->get_desc() != diff_dst_memory->get_desc())
input_mem = in_buffer.GetMKLDNNDataReorder(diff_dst_memory->get_desc());
- MKLDNNActBackward &bwd =
- GetActBackward(param_, ctx, in_buffer, out_buffer, *input_mem);
- MKLDNNStream *stream = MKLDNNStream::Get();
- mkldnn_output_t diff_src_memory =
- CreateMKLDNNMem(output, bwd.bwd_pd.diff_src_desc(), req[0]);
- mkldnn_args_map_t args = {
- { MKLDNN_ARG_SRC, *input_mem },
- { MKLDNN_ARG_DIFF_DST, *diff_dst_memory },
- { MKLDNN_ARG_DIFF_SRC, *diff_src_memory.second },
+ MKLDNNActBackward& bwd = GetActBackward(param_, ctx, in_buffer, out_buffer, *input_mem);
+ MKLDNNStream* stream = MKLDNNStream::Get();
+ mkldnn_output_t diff_src_memory = CreateMKLDNNMem(output, bwd.bwd_pd.diff_src_desc(), req[0]);
+ mkldnn_args_map_t args = {
+ {MKLDNN_ARG_SRC, *input_mem},
+ {MKLDNN_ARG_DIFF_DST, *diff_dst_memory},
+ {MKLDNN_ARG_DIFF_SRC, *diff_src_memory.second},
};
stream->RegisterPrimArgs(bwd.GetBwd(), args);
CommitOutput(output, diff_src_memory);
diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h
index cb30b0b..48c7445 100644
--- a/src/operator/nn/mkldnn/mkldnn_base-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h
@@ -18,30 +18,30 @@
*/
/*******************************************************************************
-* Copyright 2016-2017 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkldnn_base-inl.h
-* \brief
-* \author young.jin.kim@intel.com
-* ashok.emani@intel.com
-* deepthi.karkada@intel.com
-* louis.feng@intel.com
-* adam.d.straw@intel.com
-* zhengda1936@gmail.com
-*
-*******************************************************************************/
+ * Copyright 2016-2017 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * \file mkldnn_base-inl.h
+ * \brief
+ * \author young.jin.kim@intel.com
+ * ashok.emani@intel.com
+ * deepthi.karkada@intel.com
+ * louis.feng@intel.com
+ * adam.d.straw@intel.com
+ * zhengda1936@gmail.com
+ *
+ *******************************************************************************/
#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BASE_INL_H_
#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BASE_INL_H_
@@ -54,28 +54,25 @@
#include <unordered_map>
#include <utility>
#include <vector>
+
#include "mkldnn.hpp"
#include "mxnet/graph_attr_types.h"
#include "mxnet/ndarray.h"
#include "mxnet/op_attr_types.h"
#include "mxnet/resource.h"
-#define MKLDNN_REAL_TYPE_SWITCH(type, DType, ...) \
- switch (type) { \
- case mshadow::kFloat32: \
- { \
- typedef float DType; \
- {__VA_ARGS__} \
- } \
- break; \
- case mshadow::kBfloat16: \
- { \
- typedef mshadow::bfloat::bf16_t DType; \
- {__VA_ARGS__} \
- } \
- break; \
- default: \
- LOG(FATAL) << "Unknown type enum " << type; \
+#define MKLDNN_REAL_TYPE_SWITCH(type, DType, ...) \
+ switch (type) { \
+ case mshadow::kFloat32: { \
+ typedef float DType; \
+ { __VA_ARGS__ } \
+ } break; \
+ case mshadow::kBfloat16: { \
+ typedef mshadow::bfloat::bf16_t DType; \
+ { __VA_ARGS__ } \
+ } break; \
+ default: \
+ LOG(FATAL) << "Unknown type enum " << type; \
}
namespace mxnet {
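Note: MKLDNN_REAL_TYPE_SWITCH expands the given block once per supported real type (fp32 and bf16), typedef-ing DType inside each case. A hedged sketch of a typical call site (hypothetical function, not in this patch):

    // Hypothetical call site: initialize a raw buffer whose element type is
    // only known at run time; the macro binds DType in each dtype branch.
    void FillOnes(void* ptr, size_t n, int dtype) {
      MKLDNN_REAL_TYPE_SWITCH(dtype, DType, {
        DType* data = static_cast<DType*>(ptr);
        for (size_t i = 0; i < n; ++i)
          data[i] = static_cast<DType>(1.0f);
      });
    }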
@@ -84,18 +81,20 @@ namespace mxnet {
// cpu_engine singleton
class CpuEngine {
public:
- static CpuEngine *Get() {
+ static CpuEngine* Get() {
    // It's thread-safe in C++11.
// ensure same mkldnn engine is used across threads
static CpuEngine myInstance;
return &myInstance;
}
- CpuEngine(CpuEngine const &) = delete; // Copy construct
- CpuEngine(CpuEngine &&) = delete; // Move construct
- CpuEngine &operator=(CpuEngine const &) = delete; // Copy assign
- CpuEngine &operator=(CpuEngine &&) = delete; // Move assign
+ CpuEngine(CpuEngine const&) = delete; // Copy construct
+ CpuEngine(CpuEngine&&) = delete; // Move construct
+ CpuEngine& operator=(CpuEngine const&) = delete; // Copy assign
+ CpuEngine& operator=(CpuEngine&&) = delete; // Move assign
- mkldnn::engine &get_engine() { return _cpu_engine; }
+ mkldnn::engine& get_engine() {
+ return _cpu_engine;
+ }
protected:
CpuEngine() : _cpu_engine(mkldnn::engine::kind::cpu, 0) {}
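Note: CpuEngine is a Meyers singleton (the function-local static is initialized exactly once, thread-safely under C++11) with copy and move disabled, so every oneDNN object in the library shares one CPU engine. A one-line usage sketch (hypothetical helper):

    // Hypothetical helper: wrap an existing buffer in an mkldnn::memory bound
    // to the process-wide CPU engine.
    mkldnn::memory WrapBuffer(const mkldnn::memory::desc& md, void* handle) {
      return mkldnn::memory(md, CpuEngine::Get()->get_engine(), handle);
    }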
@@ -134,10 +133,10 @@ struct data_type_enum<uint8_t> {
enum { type = static_cast<unsigned int>(mkldnn::memory::data_type::u8) };
};
-static inline bool SupportMKLDNNArray(int dtype, const mxnet::TShape &shape) {
- int ndim = shape.ndim();
+static inline bool SupportMKLDNNArray(int dtype, const mxnet::TShape& shape) {
+ int ndim = shape.ndim();
bool support = ndim == 1 || ndim == 2 || ndim == 4;
- support = support &&
+ support = support &&
(dtype == mshadow::kFloat32 || dtype == mshadow::kInt32 || dtype == mshadow::kInt8 ||
dtype == mshadow::kUint8 || dtype == mshadow::kBfloat16);
return support;
@@ -147,24 +146,23 @@ static inline bool SupportStorageMKLDNN(int stype) {
return stype == kDefaultStorage;
}
-static inline bool SupportMKLDNN(int dtype, const mxnet::TShape &shape) {
+static inline bool SupportMKLDNN(int dtype, const mxnet::TShape& shape) {
int ndim = shape.ndim();
if (ndim == 0 || shape.Size() == 0) {
// MKLDNN currently does not support 0-dim Tensor and 0-size Tensor
return false;
}
return (dtype == mshadow::kFloat32 || dtype == mshadow::kBfloat16) &&
- (ndim == 1 || ndim == 2 || ndim == 4);
+ (ndim == 1 || ndim == 2 || ndim == 4);
}
static inline bool SupportMKLDNNQuantize(int dtype) {
- return dtype == mshadow::kFloat32 || dtype == mshadow::kInt8 ||
- dtype == mshadow::kUint8 || dtype == mshadow::kBfloat16;
+ return dtype == mshadow::kFloat32 || dtype == mshadow::kInt8 || dtype == mshadow::kUint8 ||
+ dtype == mshadow::kBfloat16;
}
-static inline bool SupportMKLDNN(const NDArray &input) {
- return SupportMKLDNN(input.dtype(), input.shape())
- && SupportStorageMKLDNN(input.storage_type());
+static inline bool SupportMKLDNN(const NDArray& input) {
+ return SupportMKLDNN(input.dtype(), input.shape()) && SupportStorageMKLDNN(input.storage_type());
}
static inline bool MKLDNNEnvSet() {
@@ -177,10 +175,12 @@ static inline int GetMKLDNNCacheSize() {
return mkldnn_cache_size;
}
-// TODO(alex): (MXNET-1075) Will remove env variable and calculate cache size during runtime
-template<typename S, typename I, typename H>
-static typename std::unordered_map<S, I, H>::iterator AddToCache(
- std::unordered_map<S, I, H>* cache, const S &key, const I &item) {
+// TODO(alex): (MXNET-1075) Will remove env variable and calculate cache size
+// during runtime
+template <typename S, typename I, typename H>
+static typename std::unordered_map<S, I, H>::iterator AddToCache(std::unordered_map<S, I, H>* cache,
+ const S& key,
+ const I& item) {
int mkldnn_cache_size = GetMKLDNNCacheSize();
if (mkldnn_cache_size != -1 && static_cast<int>(cache->size()) > mkldnn_cache_size)
cache->erase(cache->begin());
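Note: AddToCache puts a very coarse bound on the per-thread primitive caches: once the map grows past GetMKLDNNCacheSize() entries, an arbitrary entry (cache->begin()) is evicted before the new item is inserted, and -1 means unbounded. A hedged sketch of the usual lookup-or-create pattern built on top of it (generic names, not from this patch):

    // Illustrative lookup-or-create helper over a signature-keyed cache.
    // Sig/Prim/Hash stand in for the concrete signature, primitive-wrapper and
    // hash types used by the individual operators.
    template <typename Sig, typename Prim, typename Hash, typename Create>
    Prim& LookupOrCreate(std::unordered_map<Sig, Prim, Hash>* cache,
                         const Sig& key,
                         Create create) {
      auto it = cache->find(key);
      if (it == cache->end())
        it = AddToCache(cache, key, create());  // may evict one entry first
      return it->second;
    }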
@@ -192,7 +192,7 @@ static typename std::unordered_map<S, I, H>::iterator AddToCache(
/*
* This is to align address to a certain alignment.
*/
-void *AlignMem(void *mem, size_t size, size_t alignment, size_t *space);
+void* AlignMem(void* mem, size_t size, size_t alignment, size_t* space);
namespace op {
struct ActivationParam;
@@ -204,29 +204,28 @@ struct SoftmaxOutputParam;
struct TransposeParam;
struct ReshapeParam;
bool SupportMKLDNNAct(const ActivationParam& param);
-bool SupportMKLDNNAct(const ActivationParam& param, const NDArray &input);
+bool SupportMKLDNNAct(const ActivationParam& param, const NDArray& input);
bool SupportMKLDNNLeakyRelu(const LeakyReLUParam& param);
-bool SupportMKLDNNLeakyRelu(const LeakyReLUParam& param, const NDArray &input);
-bool SupportQuantizedMKLDNNAct(const ActivationParam ¶m);
-bool SupportMKLDNNConv(const ConvolutionParam ¶ms, const NDArray &input);
-bool SupportMKLDNNDeconv(const DeconvolutionParam& params, const NDArray &input);
-bool SupportMKLDNNSoftmax(const SoftmaxParam& param, const NDArray &input, const NDArray &output);
-bool SupportMKLDNNLogSoftmax(const SoftmaxParam& param, const NDArray &input,
- const NDArray &output);
-bool SupportMKLDNNSoftmaxOutput(const SoftmaxOutputParam ¶m);
-bool SupportMKLDNNTranspose(const TransposeParam& param, const NDArray &data);
-bool SupportMKLDNNBatchDot(const std::vector<NDArray> &inputs, const NDArray &output);
+bool SupportMKLDNNLeakyRelu(const LeakyReLUParam& param, const NDArray& input);
+bool SupportQuantizedMKLDNNAct(const ActivationParam& param);
+bool SupportMKLDNNConv(const ConvolutionParam& params, const NDArray& input);
+bool SupportMKLDNNDeconv(const DeconvolutionParam& params, const NDArray& input);
+bool SupportMKLDNNSoftmax(const SoftmaxParam& param, const NDArray& input, const NDArray& output);
+bool SupportMKLDNNLogSoftmax(const SoftmaxParam& param,
+ const NDArray& input,
+ const NDArray& output);
+bool SupportMKLDNNSoftmaxOutput(const SoftmaxOutputParam& param);
+bool SupportMKLDNNTranspose(const TransposeParam& param, const NDArray& data);
+bool SupportMKLDNNBatchDot(const std::vector<NDArray>& inputs, const NDArray& output);
} // namespace op
static int GetTypeSize(int dtype) {
int size = -1;
- MSHADOW_TYPE_SWITCH(dtype, DType, {
- size = sizeof(DType);
- });
+ MSHADOW_TYPE_SWITCH(dtype, DType, { size = sizeof(DType); });
return size;
}
-static inline size_t GetArraySize(const NDArray &arr) {
+static inline size_t GetArraySize(const NDArray& arr) {
if (arr.IsMKLDNNData()) {
return arr.GetMKLDNNData()->get_desc().get_size();
}
@@ -251,7 +250,7 @@ static inline mkldnn::memory::data_type get_mkldnn_type(int dtype) {
}
}
-template<typename T>
+template <typename T>
static inline mkldnn::memory::data_type get_mkldnn_type() {
return static_cast<mkldnn::memory::data_type>(data_type_enum<T>::type);
}
@@ -260,12 +259,11 @@ static inline mkldnn_data_type_t get_mkldnn_type_t(int dtype) {
return static_cast<mkldnn_data_type_t>(get_mkldnn_type(dtype));
}
-template<typename T>
+template <typename T>
static inline mkldnn_data_type_t get_mkldnn_type_t() {
return static_cast<mkldnn_data_type_t>(data_type_enum<T>::type);
}
-
static inline int get_mxnet_type(mkldnn_data_type_t dtype) {
auto mkldnn_dtype = static_cast<mkldnn::memory::data_type>(dtype);
switch (mkldnn_dtype) {
@@ -285,8 +283,9 @@ static inline int get_mxnet_type(mkldnn_data_type_t dtype) {
}
}
-static inline size_t GetMemDescSize(const mkldnn::memory::desc &md) {
- if (md.data.ndims == 0) return 0;
+static inline size_t GetMemDescSize(const mkldnn::memory::desc& md) {
+ if (md.data.ndims == 0)
+ return 0;
size_t ret = 1;
for (int i = 0; i < md.data.ndims; i++) {
@@ -297,19 +296,21 @@ static inline size_t GetMemDescSize(const mkldnn::memory::desc &md) {
return ret;
}
-inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr, int dtype = -1) {
+inline static mkldnn::memory::desc GetMemDesc(const NDArray& arr, int dtype = -1) {
int ndim = arr.shape().ndim();
mkldnn::memory::dims dims(ndim);
dtype = (dtype == -1) ? arr.dtype() : dtype;
- for (size_t i = 0; i < dims.size(); i++) dims[i] = arr.shape()[i];
+ for (size_t i = 0; i < dims.size(); i++)
+ dims[i] = arr.shape()[i];
return mkldnn::memory::desc{dims, get_mkldnn_type(dtype), mkldnn::memory::format_tag::any};
}
-inline static mkldnn::memory::desc GetFCWeightDesc(const NDArray &arr, int dtype = -1) {
+inline static mkldnn::memory::desc GetFCWeightDesc(const NDArray& arr, int dtype = -1) {
int ndim = arr.shape().ndim();
mkldnn::memory::dims dims(ndim);
dtype = (dtype == -1) ? arr.dtype() : dtype;
- for (size_t i = 0; i < dims.size(); i++) dims[i] = arr.shape()[i];
+ for (size_t i = 0; i < dims.size(); i++)
+ dims[i] = arr.shape()[i];
auto format = mkldnn::memory::format_tag::any;
// for batch 256 alexnet benchmark test
if (dims.size() == 2) {
@@ -319,7 +320,7 @@ inline static mkldnn::memory::desc GetFCWeightDesc(const NDArray &arr, int dtype
return mkldnn::memory::desc{dims, get_mkldnn_type(dtype), format};
}
-inline static mkldnn::memory::desc GetWeightDesc(const NDArray &arr,
+inline static mkldnn::memory::desc GetWeightDesc(const NDArray& arr,
int num_groups,
bool quantized = false) {
int dtype = quantized ? mshadow::kInt8 : arr.dtype();
@@ -340,25 +341,29 @@ inline static mkldnn::memory::desc GetWeightDesc(const NDArray &arr,
switch (ndim) {
case 3:
tz = mkldnn::memory::dims{
- num_groups, arr.shape()[N] / num_groups,
- arr.shape()[C], arr.shape()[H]};
+ num_groups, arr.shape()[N] / num_groups, arr.shape()[C], arr.shape()[H]};
break;
case 4:
- tz = mkldnn::memory::dims{
- num_groups, arr.shape()[N] / num_groups,
- arr.shape()[C], arr.shape()[H], arr.shape()[W]};
+ tz = mkldnn::memory::dims{num_groups,
+ arr.shape()[N] / num_groups,
+ arr.shape()[C],
+ arr.shape()[H],
+ arr.shape()[W]};
break;
case 5:
- tz = mkldnn::memory::dims{
- num_groups, arr.shape()[N] / num_groups,
- arr.shape()[C], arr.shape()[D], arr.shape()[H], arr.shape()[W]};
+ tz = mkldnn::memory::dims{num_groups,
+ arr.shape()[N] / num_groups,
+ arr.shape()[C],
+ arr.shape()[D],
+ arr.shape()[H],
+ arr.shape()[W]};
}
return mkldnn::memory::desc{tz, get_mkldnn_type(dtype), mkldnn::memory::format_tag::any};
}
}
-inline static bool CheckMKLDNNInputArrayIsView(const std::vector<NDArray> &inputs) {
- for (const auto &in : inputs) {
+inline static bool CheckMKLDNNInputArrayIsView(const std::vector<NDArray>& inputs) {
+ for (const auto& in : inputs) {
if (in.IsView() && in.IsMKLDNNData()) {
return true;
}
@@ -381,7 +386,7 @@ typedef std::shared_ptr<const mkldnn::memory> mkldnn_mem_const_ptr;
*/
class TmpMemMgr {
// This points to the memory buffer where we can allocate temp memory.
- char *curr_mem;
+ char* curr_mem;
// The total size of the temp memory.
size_t mem_size;
// This contains the current available memory size.
@@ -391,7 +396,7 @@ class TmpMemMgr {
const size_t alignment = kMKLDNNAlign;
public:
- static TmpMemMgr *Get() {
+ static TmpMemMgr* Get() {
#if DMLC_CXX11_THREAD_LOCAL
static thread_local TmpMemMgr mgr;
#else
@@ -407,44 +412,43 @@ class TmpMemMgr {
}
void Reset() {
- curr_mem = nullptr;
+ curr_mem = nullptr;
curr_size = 0;
// We don't reset est_size and mem_size because est_size contains the
// estimated temp memory size from the last run and mem_size contains the
    // memory size allocated in the last run.
}
- void Init(const Resource &r) {
+ void Init(const Resource& r) {
    // If we estimated last time that we need more memory, we should use the
    // larger memory size.
mem_size = std::max(mem_size, est_size);
if (mem_size > 0) {
- // Let's allocate some extra memory. If we don't use some of them all the time,
- // the OS won't physically allocate pages for them any way.
+ // Let's allocate some extra memory. If we don't use some of them all the
+ // time, the OS won't physically allocate pages for them any way.
this->curr_size = mem_size * 2;
- this->curr_mem = static_cast<char *>(r.get_host_space_internal(this->curr_size));
+ this->curr_mem = static_cast<char*>(r.get_host_space_internal(this->curr_size));
}
// reset est_size, so we can start to estimate the temp memory size.
this->est_size = 0;
}
- mkldnn::memory *Alloc(const mkldnn::memory::desc &md);
+ mkldnn::memory* Alloc(const mkldnn::memory::desc& md);
};
typedef std::unordered_map<int, mkldnn::memory> mkldnn_args_map_t;
class MKLDNNStream {
- std::vector<std::pair<mkldnn::primitive, mkldnn_args_map_t> > net_prim_args;
+ std::vector<std::pair<mkldnn::primitive, mkldnn_args_map_t>> net_prim_args;
// Here we hold all memory related to the operators in the stream.
- std::vector<std::shared_ptr<const mkldnn::memory> > mem_holder;
+ std::vector<std::shared_ptr<const mkldnn::memory>> mem_holder;
mkldnn::stream s;
public:
- static MKLDNNStream *Get();
+ static MKLDNNStream* Get();
- MKLDNNStream(): s(CpuEngine::Get()->get_engine()) {}
+ MKLDNNStream() : s(CpuEngine::Get()->get_engine()) {}
- void RegisterPrimArgs(const mkldnn::primitive &prim,
- const mkldnn_args_map_t &args) {
+ void RegisterPrimArgs(const mkldnn::primitive& prim, const mkldnn_args_map_t& args) {
net_prim_args.emplace_back(prim, args);
}
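Note: the stream collects (primitive, args) pairs via RegisterPrimArgs and executes them all in Submit() below; memory objects registered through RegisterMem are kept alive until the stream is submitted. A minimal sketch of the register-then-submit flow (hypothetical function, not in this patch):

    // Hypothetical flow: queue one reorder on the per-thread stream and then
    // execute everything that has been registered so far.
    void CopyThroughStream(const mkldnn::memory& src, const mkldnn::memory& dst) {
      mkldnn_args_map_t args = {{MKLDNN_ARG_FROM, src}, {MKLDNN_ARG_TO, dst}};
      MKLDNNStream* stream = MKLDNNStream::Get();
      stream->RegisterPrimArgs(mkldnn::reorder(src, dst), args);
      stream->Submit();
    }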
@@ -463,7 +467,7 @@ class MKLDNNStream {
*/
void Submit(bool cleanup = true) {
if (!net_prim_args.empty()) {
- for (auto &v : net_prim_args) {
+ for (auto& v : net_prim_args) {
v.first.execute(s, v.second);
}
net_prim_args.clear();
@@ -484,22 +488,22 @@ enum OutDataOp {
AddBack,
};
-typedef std::pair<OutDataOp, mkldnn::memory *> mkldnn_output_t;
-void MKLDNNMemoryCopy(const mkldnn::memory &mem, const mkldnn::memory* this_mem);
+typedef std::pair<OutDataOp, mkldnn::memory*> mkldnn_output_t;
+void MKLDNNMemoryCopy(const mkldnn::memory& mem, const mkldnn::memory* this_mem);
/*
* Here we want to get MKLDNN memory whose desc is exactly the same as
* the given one. operator== can't guarantee that. == can return true even if
* the formats are different. I need to double check its format.
*/
-static inline mkldnn::memory *GetMKLDNNExact(
- const mkldnn::memory *mem, const mkldnn::memory::desc &desc) {
+static inline mkldnn::memory* GetMKLDNNExact(const mkldnn::memory* mem,
+ const mkldnn::memory::desc& desc) {
mkldnn::memory::desc src_desc = mem->get_desc();
if (desc == src_desc) {
- return const_cast<mkldnn::memory *>(mem);
+ return const_cast<mkldnn::memory*>(mem);
} else {
- std::shared_ptr<mkldnn::memory> ret(new mkldnn::memory(
- desc, CpuEngine::Get()->get_engine(), mem->get_data_handle()));
+ std::shared_ptr<mkldnn::memory> ret(
+ new mkldnn::memory(desc, CpuEngine::Get()->get_engine(), mem->get_data_handle()));
MKLDNNStream::Get()->RegisterMem(ret);
return ret.get();
}
@@ -516,27 +520,29 @@ static inline mkldnn::memory *GetMKLDNNExact(
* If these two functions are used, we have to call CommitOutput to write
* the output back to the output NDArray.
*/
-mkldnn_output_t CreateMKLDNNMem(const NDArray &out_arr,
- const mkldnn::memory::desc &desc,
- OpReqType req, const NDArray* in_arr = nullptr);
-mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &out_arr,
- const mkldnn::memory::desc &desc,
+mkldnn_output_t CreateMKLDNNMem(const NDArray& out_arr,
+ const mkldnn::memory::desc& desc,
+ OpReqType req,
+ const NDArray* in_arr = nullptr);
+mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray& out_arr,
+ const mkldnn::memory::desc& desc,
OpReqType req);
/* This function has to be used with one of the functions above. */
-void CommitOutput(const NDArray &arr, const mkldnn_output_t &res);
+void CommitOutput(const NDArray& arr, const mkldnn_output_t& res);
-static inline void InvalidateOutputs(const std::vector<NDArray> &arrs,
- const std::vector<OpReqType> &reqs) {
+static inline void InvalidateOutputs(const std::vector<NDArray>& arrs,
+ const std::vector<OpReqType>& reqs) {
for (size_t i = 0; i < arrs.size(); i++) {
if (reqs[i] == kWriteTo || reqs[i] == kNullOp) {
- const_cast<NDArray &>(arrs[i]).InvalidateMKLDNNData();
+ const_cast<NDArray&>(arrs[i]).InvalidateMKLDNNData();
}
}
}
-// TODO(alexzai): (MXNET-856) Remove helper function after subgraph feature added
-static inline void CreateDefaultInputs(const std::vector<NDArray> &arrs,
- std::vector<NDArray> *out_arrs) {
+// TODO(alexzai): (MXNET-856) Remove helper function after subgraph feature
+// added
+static inline void CreateDefaultInputs(const std::vector<NDArray>& arrs,
+ std::vector<NDArray>* out_arrs) {
out_arrs->clear();
for (size_t i = 0; i < arrs.size(); ++i) {
if (arrs[i].IsMKLDNNData())
@@ -546,20 +552,20 @@ static inline void CreateDefaultInputs(const std::vector<NDArray> &arrs,
}
}
-const mkldnn::memory *GetWeights(const NDArray &arr, int num_groups);
+const mkldnn::memory* GetWeights(const NDArray& arr, int num_groups);
-const mkldnn::memory *GetWeights(const NDArray &arr,
- const mkldnn::memory::desc &target_md,
+const mkldnn::memory* GetWeights(const NDArray& arr,
+ const mkldnn::memory::desc& target_md,
int num_groups);
-bool IsDefaultFormat(const mkldnn::memory::desc &desc);
-bool IsMKLDNN(const mkldnn::memory::desc &desc);
+bool IsDefaultFormat(const mkldnn::memory::desc& desc);
+bool IsMKLDNN(const mkldnn::memory::desc& desc);
-mkldnn_format_tag_t GetDefaultFormat(const mkldnn::memory::desc &md);
+mkldnn_format_tag_t GetDefaultFormat(const mkldnn::memory::desc& md);
mkldnn_format_tag_t GetDefaultFormat(int num_dims);
-mkldnn::memory::desc GetDesc(const mkldnn::memory::desc &md, const mkldnn_format_tag_t &format);
+mkldnn::memory::desc GetDesc(const mkldnn::memory::desc& md, const mkldnn_format_tag_t& format);
-inline bool same_shape(const mxnet::TShape &shape, const mkldnn_dims_t dims, int ndims) {
+inline bool same_shape(const mxnet::TShape& shape, const mkldnn_dims_t dims, int ndims) {
if (shape.ndim() != ndims)
return false;
for (int i = 0; i < ndims; i++)
@@ -568,8 +574,7 @@ inline bool same_shape(const mxnet::TShape &shape, const mkldnn_dims_t dims, int
return true;
}
-inline bool same_shape(const mkldnn::memory::desc &desc1,
- const mkldnn::memory::desc &desc2) {
+inline bool same_shape(const mkldnn::memory::desc& desc1, const mkldnn::memory::desc& desc2) {
if (desc1.data.ndims != desc2.data.ndims)
return false;
for (int i = 0; i < desc1.data.ndims; i++)
@@ -578,10 +583,9 @@ inline bool same_shape(const mkldnn::memory::desc &desc1,
return true;
}
-inline bool same_shape(const mxnet::TShape &shape, int dtype,
- const mkldnn::memory::desc &desc) {
- return same_shape(shape, desc.data.dims, desc.data.ndims)
- && get_mkldnn_type(dtype) == desc.data.data_type;
+inline bool same_shape(const mxnet::TShape& shape, int dtype, const mkldnn::memory::desc& desc) {
+ return same_shape(shape, desc.data.dims, desc.data.ndims) &&
+ get_mkldnn_type(dtype) == desc.data.data_type;
}
/*
@@ -592,25 +596,24 @@ inline bool same_shape(const mxnet::TShape &shape, int dtype,
class MKLDNNMemory {
std::shared_ptr<mkldnn::memory> mem;
mkldnn::memory::desc desc;
- size_t size; // The number of bytes.
+ size_t size; // The number of bytes.
public:
- MKLDNNMemory(mkldnn::memory::desc md, void *addr): desc(md) {
+ MKLDNNMemory(mkldnn::memory::desc md, void* addr) : desc(md) {
mem.reset(new mkldnn::memory(md, CpuEngine::Get()->get_engine(), addr));
size = desc.get_size();
}
- explicit MKLDNNMemory(std::shared_ptr<mkldnn::memory> mem): desc(
- mem->get_desc()) {
+ explicit MKLDNNMemory(std::shared_ptr<mkldnn::memory> mem) : desc(mem->get_desc()) {
this->mem = mem;
- size = desc.get_size();
+ size = desc.get_size();
}
- void SetDataHandle(void *handle) {
+ void SetDataHandle(void* handle) {
mem->set_data_handle(handle);
}
- void *GetDataHandle() const {
+ void* GetDataHandle() const {
return mem->get_data_handle();
}
@@ -618,7 +621,7 @@ class MKLDNNMemory {
return mem;
}
- mkldnn::memory *GetRaw() const {
+ mkldnn::memory* GetRaw() const {
return mem.get();
}
@@ -630,13 +633,15 @@ class MKLDNNMemory {
return mem->get_desc();
}
- mkldnn::memory::desc GetDesc(mkldnn_format_tag_t format,
- mkldnn::memory::data_type data_type = mkldnn::memory::data_type::undef) const {
+ mkldnn::memory::desc GetDesc(
+ mkldnn_format_tag_t format,
+ mkldnn::memory::data_type data_type = mkldnn::memory::data_type::undef) const {
mkldnn::memory::dims dims(desc.data.dims, desc.data.dims + desc.data.ndims);
- mkldnn::memory::data_type cpp_type = (data_type == mkldnn::memory::data_type::undef)
- ? static_cast<mkldnn::memory::data_type>(desc.data.data_type) : data_type;
- mkldnn::memory::desc data_md(dims, cpp_type,
- static_cast<mkldnn::memory::format_tag>(format));
+ mkldnn::memory::data_type cpp_type =
+ (data_type == mkldnn::memory::data_type::undef)
+ ? static_cast<mkldnn::memory::data_type>(desc.data.data_type)
+ : data_type;
+ mkldnn::memory::desc data_md(dims, cpp_type, static_cast<mkldnn::memory::format_tag>(format));
return data_md;
}
@@ -652,25 +657,26 @@ class MKLDNNMemory {
return mem->get_desc() == md;
}
- bool SameFormat(const mxnet::TShape &shape, int dtype) const {
+ bool SameFormat(const mxnet::TShape& shape, int dtype) const {
return same_shape(shape, dtype, desc);
}
- void ReorderTo(mkldnn::memory *other) const {
+ void ReorderTo(mkldnn::memory* other) const {
mkldnn::stream s(CpuEngine::Get()->get_engine());
mkldnn::reorder(*mem, *other).execute(s, *mem, *other);
}
};
// reorder mkldnn src to dst format dtype
-void ReorderTo(const mkldnn::memory *src, const mkldnn::memory *dst);
+void ReorderTo(const mkldnn::memory* src, const mkldnn::memory* dst);
template <typename Compute, typename AttrState>
-void FallBackCompute(Compute fn, const AttrState &attrs,
- const OpContext &ctx,
- const std::vector<NDArray> &inputs,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs);
+void FallBackCompute(Compute fn,
+ const AttrState& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs);
/*
* This class is used to check the correctness of MKLDNN operators.
@@ -683,66 +689,69 @@ class OpCheck {
public:
OpCheck(bool backward, size_t num_checks) {
- this->backward = backward;
+ this->backward = backward;
this->num_checks = num_checks;
}
- void Init(const std::vector<mxnet::NDArray> &inputs_,
- const std::vector<mxnet::NDArray> &outputs_);
+ void Init(const std::vector<mxnet::NDArray>& inputs_,
+ const std::vector<mxnet::NDArray>& outputs_);
- void Run(mxnet::FCompute fn, const nnvm::NodeAttrs &attrs,
- const mxnet::OpContext &ctx,
- const std::vector<mxnet::NDArray> &inputs_,
- const std::vector<mxnet::OpReqType> &req,
- const std::vector<mxnet::NDArray> &outputs_);
+ void Run(mxnet::FCompute fn,
+ const nnvm::NodeAttrs& attrs,
+ const mxnet::OpContext& ctx,
+ const std::vector<mxnet::NDArray>& inputs_,
+ const std::vector<mxnet::OpReqType>& req,
+ const std::vector<mxnet::NDArray>& outputs_);
- void CopyResult(const std::vector<mxnet::NDArray> &outputs_,
- const std::vector<size_t>& indice);
+ void CopyResult(const std::vector<mxnet::NDArray>& outputs_, const std::vector<size_t>& indice);
};
-bool MKLDNNStorageType(const nnvm::NodeAttrs &attrs,
+bool MKLDNNStorageType(const nnvm::NodeAttrs& attrs,
const int dev_mask,
bool support_mkldnn,
- DispatchMode *dispatch_mode,
- std::vector<int> *in_attrs,
- std::vector<int> *out_attrs);
-
-#define MKLDNN_OPCHECK_INIT(backward, num_checks, inputs, outputs) \
- static bool debug = dmlc::GetEnv("MXNET_MKLDNN_DEBUG", false); \
- OpCheck check(backward, num_checks); \
- if (debug) check.Init(inputs, outputs);
-
-#define MKLDNN_OPCHECK_RUN(fn, attrs, ctx, inputs, req, outputs) \
- if (debug) check.Run(fn, attrs, ctx, inputs, req, outputs);
+ DispatchMode* dispatch_mode,
+ std::vector<int>* in_attrs,
+ std::vector<int>* out_attrs);
+
+#define MKLDNN_OPCHECK_INIT(backward, num_checks, inputs, outputs) \
+ static bool debug = dmlc::GetEnv("MXNET_MKLDNN_DEBUG", false); \
+ OpCheck check(backward, num_checks); \
+ if (debug) \
+ check.Init(inputs, outputs);
+
+#define MKLDNN_OPCHECK_RUN(fn, attrs, ctx, inputs, req, outputs) \
+ if (debug) \
+ check.Run(fn, attrs, ctx, inputs, req, outputs);
#define MKLDNN_OPCHECK_COPY_RESULT(outputs, indice) \
- if (debug) check.CopyResult(outputs, indice);
+ if (debug) \
+ check.CopyResult(outputs, indice);
struct MKLDNNPostEltwiseParam {
mkldnn::algorithm alg = mkldnn::algorithm::undef;
- float scale = 1.f;
- float alpha = 0.f;
- float beta = 1.f;
+ float scale = 1.f;
+ float alpha = 0.f;
+ float beta = 1.f;
};
void MKLDNNRun(mxnet::FComputeEx fn,
- const nnvm::NodeAttrs &attrs,
- const mxnet::OpContext &ctx,
- const std::vector<mxnet::NDArray> &inputs_,
- const std::vector<mxnet::OpReqType> &req,
- const std::vector<mxnet::NDArray> &outputs_);
-
-using FComputeExUnary = std::function<void (const nnvm::NodeAttrs& attrs,
- const OpContext& ctx,
- const NDArray& input,
- const OpReqType& req,
- const NDArray& output)>;
+ const nnvm::NodeAttrs& attrs,
+ const mxnet::OpContext& ctx,
+ const std::vector<mxnet::NDArray>& inputs_,
+ const std::vector<mxnet::OpReqType>& req,
+ const std::vector<mxnet::NDArray>& outputs_);
+
+using FComputeExUnary = std::function<void(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const NDArray& input,
+ const OpReqType& req,
+ const NDArray& output)>;
void MKLDNNRun(FComputeExUnary fn,
- const nnvm::NodeAttrs &attrs,
- const mxnet::OpContext &ctx,
- const mxnet::NDArray &inputs_,
- const mxnet::OpReqType &req,
- const mxnet::NDArray &outputs_);
+ const nnvm::NodeAttrs& attrs,
+ const mxnet::OpContext& ctx,
+ const mxnet::NDArray& inputs_,
+ const mxnet::OpReqType& req,
+ const mxnet::NDArray& outputs_);
} // namespace mxnet
#endif
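Note: most MKLDNN operators in this tree combine the helpers declared above in one pattern: initialize TmpMemMgr from the op's temp-space resource, pick an output memory with CreateMKLDNNMem (honoring req: write-to, in-place, or add-to), register the primitive on the stream, then CommitOutput and Submit. A condensed, hypothetical FComputeEx body, illustrative only and not part of this patch:

    // Hypothetical copy-like operator showing the usual helper flow.
    void SketchComputeEx(const nnvm::NodeAttrs& attrs,
                         const OpContext& ctx,
                         const std::vector<NDArray>& inputs,
                         const std::vector<OpReqType>& req,
                         const std::vector<NDArray>& outputs) {
      TmpMemMgr::Get()->Init(ctx.requested[0]);  // scratch space for reorders
      const mkldnn::memory* in_mem = inputs[0].GetMKLDNNData();
      // Destination is the output NDArray itself, an in-place alias, or a
      // temporary buffer; CommitOutput copies or accumulates back as required.
      mkldnn_output_t out = CreateMKLDNNMem(outputs[0], in_mem->get_desc(), req[0], &inputs[0]);
      mkldnn_args_map_t args = {{MKLDNN_ARG_FROM, *in_mem}, {MKLDNN_ARG_TO, *out.second}};
      MKLDNNStream::Get()->RegisterPrimArgs(mkldnn::reorder(*in_mem, *out.second), args);
      CommitOutput(outputs[0], out);
      MKLDNNStream::Get()->Submit();
    }

When MXNET_MKLDNN_DEBUG is set, the MKLDNN_OPCHECK_* macros above additionally replay the reference FCompute implementation on copies of the inputs and compare the results against the MKLDNN outputs.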
diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/mkldnn/mkldnn_base.cc
index 0cea4ef..5a65c94 100644
--- a/src/operator/nn/mkldnn/mkldnn_base.cc
+++ b/src/operator/nn/mkldnn/mkldnn_base.cc
@@ -20,14 +20,15 @@
#if MXNET_USE_MKLDNN == 1
#include <atomic>
-#include "./mkldnn_base-inl.h"
-#include "./mkldnn_ops-inl.h"
+
#include "../../../common/exec_utils.h"
#include "../../operator_common.h"
+#include "./mkldnn_base-inl.h"
+#include "./mkldnn_ops-inl.h"
namespace mxnet {
-MKLDNNStream *MKLDNNStream::Get() {
+MKLDNNStream* MKLDNNStream::Get() {
#if DMLC_CXX11_THREAD_LOCAL
static thread_local MKLDNNStream stream;
#else
@@ -36,7 +37,7 @@ MKLDNNStream *MKLDNNStream::Get() {
return &stream;
}
-void *AlignMem(void *mem, size_t size, size_t alignment, size_t *space) {
+void* AlignMem(void* mem, size_t size, size_t alignment, size_t* space) {
if (size > *space)
return nullptr;
intptr_t addr = reinterpret_cast<intptr_t>(mem);
@@ -51,13 +52,13 @@ void *AlignMem(void *mem, size_t size, size_t alignment, size_t *space) {
addr += padding;
*space -= padding;
CHECK_EQ(addr % alignment, 0);
- return reinterpret_cast<void *>(addr);
+ return reinterpret_cast<void*>(addr);
}
-mkldnn::memory *TmpMemMgr::Alloc(const mkldnn::memory::desc &md) {
+mkldnn::memory* TmpMemMgr::Alloc(const mkldnn::memory::desc& md) {
// We need to include the size of the memory used for alignment.
this->est_size += md.get_size() + alignment;
- void *mem = AlignMem(this->curr_mem, md.get_size(), alignment, &this->curr_size);
+ void* mem = AlignMem(this->curr_mem, md.get_size(), alignment, &this->curr_size);
if (mem) {
// The memory is allocated from the temporary memory space in the
// operator. It'll only become invalid after we exit from the operator.
@@ -65,19 +66,20 @@ mkldnn::memory *TmpMemMgr::Alloc(const mkldnn::memory::desc &md) {
MKLDNNStream::Get()->RegisterMem(ret);
CHECK_EQ(mem, mem);
this->curr_size -= md.get_size();
- this->curr_mem = static_cast<char *>(mem) + md.get_size();
+ this->curr_mem = static_cast<char*>(mem) + md.get_size();
return ret.get();
} else {
- // If curr_mem has been initialized and we still reach here, it means the current
- // allocated memory isn't enough. But it doesn't matter for multiple invokes of a
- // operator, as the TmpMemMgr could estimate the space at the first iteration and
- // then re-requests abundant space from MXNet resource. MKL-DNN could allocate
- // the space by itself. Thus, we just let it continue for estimating the maximum
- // required space size. It will be allocated at next call.
+ // If curr_mem has been initialized and we still reach here, it means the
+ // current allocated memory isn't enough. But it doesn't matter for multiple
+    // invokes of an operator, as the TmpMemMgr could estimate the space at the
+ // first iteration and then re-requests abundant space from MXNet resource.
+ // MKL-DNN could allocate the space by itself. Thus, we just let it continue
+ // for estimating the maximum required space size. It will be allocated at
+ // next call.
if (this->curr_mem && dmlc::GetEnv("MXNET_MKLDNN_DEBUG", false)) {
LOG(WARNING) << "mkl-dnn debug message: The rest of the temporary space is not "
- << "adequate for allocating " << md.get_size() << " bytes. Thus, mkl-dnn "
- << "allocate the space by itself.";
+ << "adequate for allocating " << md.get_size() << " bytes. Thus, mkl-dnn "
+ << "allocate the space by itself.";
}
mkldnn_mem_ptr ret(new mkldnn::memory(md, CpuEngine::Get()->get_engine()));
MKLDNNStream::Get()->RegisterMem(ret);
@@ -85,97 +87,93 @@ mkldnn::memory *TmpMemMgr::Alloc(const mkldnn::memory::desc &md) {
}
}
-void MKLDNNMemoryCopy(const mkldnn::memory &mem, const mkldnn::memory* this_mem) {
- MKLDNNStream *stream = MKLDNNStream::Get();
- mkldnn::memory::desc from_desc = mem.get_desc();
- mkldnn::memory::desc this_desc = this_mem->get_desc();
+void MKLDNNMemoryCopy(const mkldnn::memory& mem, const mkldnn::memory* this_mem) {
+ MKLDNNStream* stream = MKLDNNStream::Get();
+ mkldnn::memory::desc from_desc = mem.get_desc();
+ mkldnn::memory::desc this_desc = this_mem->get_desc();
mkldnn_format_tag_t from_def_format = GetDefaultFormat(from_desc);
mkldnn_format_tag_t this_def_format = GetDefaultFormat(this_desc);
if (!same_shape(this_desc, from_desc) && IsDefaultFormat(from_desc)) {
// In this case, we can simply create a new MKLDNN memory for the required
// shape.
- mkldnn::memory::dims dims(this_desc.data.dims,
- this_desc.data.dims + this_desc.data.ndims);
+ mkldnn::memory::dims dims(this_desc.data.dims, this_desc.data.dims + this_desc.data.ndims);
auto this_dtype = static_cast<mkldnn::memory::data_type>(this_desc.data.data_type);
- mkldnn::memory::desc data_md(dims, this_dtype,
- static_cast<mkldnn::memory::format_tag>(this_def_format));
+ mkldnn::memory::desc data_md(
+ dims, this_dtype, static_cast<mkldnn::memory::format_tag>(this_def_format));
mkldnn_mem_ptr tmp_mem(new mkldnn::memory(data_md, mem.get_engine(), mem.get_data_handle()));
stream->RegisterMem(tmp_mem);
- std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, *tmp_mem},
- {MKLDNN_ARG_TO, *this_mem}});
+ std::unordered_map<int, mkldnn::memory> args(
+ {{MKLDNN_ARG_FROM, *tmp_mem}, {MKLDNN_ARG_TO, *this_mem}});
stream->RegisterPrimArgs(mkldnn::reorder(*tmp_mem, *this_mem), args);
} else if (!same_shape(this_desc, from_desc)) {
// In this case, the source memory stores data in a customized layout. We
// need to reorganize the data in memory before we can reshape.
mkldnn::memory::desc def_desc = GetDesc(from_desc, from_def_format);
- mkldnn::memory *def_mem = TmpMemMgr::Get()->Alloc(def_desc);
- std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, mem},
- {MKLDNN_ARG_TO, *def_mem}});
+ mkldnn::memory* def_mem = TmpMemMgr::Get()->Alloc(def_desc);
+ std::unordered_map<int, mkldnn::memory> args(
+ {{MKLDNN_ARG_FROM, mem}, {MKLDNN_ARG_TO, *def_mem}});
stream->RegisterPrimArgs(mkldnn::reorder(mem, *def_mem), args);
// Now we can reshape it
- mkldnn_mem_ptr tmp_mem(new mkldnn::memory(this_desc,
- mem.get_engine(), def_mem->get_data_handle()));
+ mkldnn_mem_ptr tmp_mem(
+ new mkldnn::memory(this_desc, mem.get_engine(), def_mem->get_data_handle()));
stream->RegisterMem(tmp_mem);
args = {{MKLDNN_ARG_FROM, *tmp_mem}, {MKLDNN_ARG_TO, *this_mem}};
stream->RegisterPrimArgs(mkldnn::reorder(*tmp_mem, *this_mem), args);
-} else if (this_desc == from_desc) {
- std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, mem},
- {MKLDNN_ARG_TO, *this_mem}});
+ } else if (this_desc == from_desc) {
+ std::unordered_map<int, mkldnn::memory> args(
+ {{MKLDNN_ARG_FROM, mem}, {MKLDNN_ARG_TO, *this_mem}});
// If the layout is the same, we can just copy data.
stream->RegisterPrimArgs(mkldnn::reorder(mem, *this_mem), args);
-} else {
+ } else {
// If both are not using the default layouts. There isn't much we can do,
// other than reorder data layout directly.
if (!IsDefaultFormat(this_desc) && !IsDefaultFormat(from_desc)) {
- std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, mem},
- {MKLDNN_ARG_TO, *this_mem}});
+ std::unordered_map<int, mkldnn::memory> args(
+ {{MKLDNN_ARG_FROM, mem}, {MKLDNN_ARG_TO, *this_mem}});
stream->RegisterPrimArgs(mkldnn::reorder(mem, *this_mem), args);
} else if (IsDefaultFormat(this_desc)) {
// If the dest mem uses the default memory layout, we can simply use
// the default format of the source memory to improve perf of reorder.
mkldnn::memory::desc desc = GetDesc(from_desc, from_def_format);
- mkldnn_mem_ptr tmp_mem(new mkldnn::memory(desc,
- mem.get_engine(), this_mem->get_data_handle()));
+ mkldnn_mem_ptr tmp_mem(
+ new mkldnn::memory(desc, mem.get_engine(), this_mem->get_data_handle()));
stream->RegisterMem(tmp_mem);
- std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, mem},
- {MKLDNN_ARG_TO, *tmp_mem}});
+ std::unordered_map<int, mkldnn::memory> args(
+ {{MKLDNN_ARG_FROM, mem}, {MKLDNN_ARG_TO, *tmp_mem}});
stream->RegisterPrimArgs(mkldnn::reorder(mem, *tmp_mem), args);
} else {
// If the src mem uses the default memory layout, we can use
// the default format of the source memory to improve perf.
mkldnn::memory::desc desc = GetDesc(this_desc, this_def_format);
- mkldnn_mem_ptr tmp_mem(new mkldnn::memory(desc,
- this_mem->get_engine(), mem.get_data_handle()));
+ mkldnn_mem_ptr tmp_mem(
+ new mkldnn::memory(desc, this_mem->get_engine(), mem.get_data_handle()));
stream->RegisterMem(tmp_mem);
- std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, *tmp_mem},
- {MKLDNN_ARG_TO, *this_mem}});
+ std::unordered_map<int, mkldnn::memory> args(
+ {{MKLDNN_ARG_FROM, *tmp_mem}, {MKLDNN_ARG_TO, *this_mem}});
stream->RegisterPrimArgs(mkldnn::reorder(*tmp_mem, *this_mem), args);
}
}
}
-bool CanWriteTo(const NDArray &out_arr,
- const NDArray &in_arr,
- const mkldnn::memory::desc &desc) {
- auto in_mem = in_arr.GetMKLDNNData();
- bool add_same = in_mem->get_data_handle() == out_arr.GetMKLDNNData()->get_data_handle();
- bool pdesc_same = out_arr.GetMKLDNNData()->get_desc() == desc &&
- in_mem->get_desc() == desc;
+bool CanWriteTo(const NDArray& out_arr, const NDArray& in_arr, const mkldnn::memory::desc& desc) {
+ auto in_mem = in_arr.GetMKLDNNData();
+ bool add_same = in_mem->get_data_handle() == out_arr.GetMKLDNNData()->get_data_handle();
+ bool pdesc_same = out_arr.GetMKLDNNData()->get_desc() == desc && in_mem->get_desc() == desc;
return add_same && pdesc_same;
}
-mkldnn_output_t CreateMKLDNNMem(const NDArray &out_arr,
- const mkldnn::memory::desc &desc,
+mkldnn_output_t CreateMKLDNNMem(const NDArray& out_arr,
+ const mkldnn::memory::desc& desc,
OpReqType req,
const NDArray* in_arr) {
if (kAddTo == req) {
auto tmp = TmpMemMgr::Get()->Alloc(desc);
return mkldnn_output_t(OutDataOp::AddBack, tmp);
} else if (kWriteInplace == req && in_arr != nullptr && CanWriteTo(out_arr, *in_arr, desc)) {
- mkldnn::memory *mem = const_cast<NDArray &>(out_arr).CreateMKLDNNData(desc);
+ mkldnn::memory* mem = const_cast<NDArray&>(out_arr).CreateMKLDNNData(desc);
// mem is nullptr if out_arr is view and desc is MKLDNN format.
// need to Reorder2Default before calling CreateMKLDNNMem
CHECK(mem != nullptr);
@@ -184,7 +182,7 @@ mkldnn_output_t CreateMKLDNNMem(const NDArray &out_arr,
auto tmp = TmpMemMgr::Get()->Alloc(desc);
return mkldnn_output_t(OutDataOp::CopyBack, tmp);
} else if (kWriteTo == req) {
- mkldnn::memory *mem = const_cast<NDArray &>(out_arr).CreateMKLDNNData(desc);
+ mkldnn::memory* mem = const_cast<NDArray&>(out_arr).CreateMKLDNNData(desc);
if (nullptr == mem) {
auto tmp = TmpMemMgr::Get()->Alloc(desc);
return mkldnn_output_t(OutDataOp::CopyBack, tmp);
@@ -195,8 +193,8 @@ mkldnn_output_t CreateMKLDNNMem(const NDArray &out_arr,
return mkldnn_output_t(OutDataOp::Noop, tmp);
}
-mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &out_arr,
- const mkldnn::memory::desc &desc,
+mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray& out_arr,
+ const mkldnn::memory::desc& desc,
OpReqType req) {
if (kAddTo == req) {
auto tmp = TmpMemMgr::Get()->Alloc(desc);
@@ -205,9 +203,9 @@ mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &out_arr,
auto tmp = TmpMemMgr::Get()->Alloc(desc);
return mkldnn_output_t(OutDataOp::CopyBack, tmp);
} else {
- mkldnn::memory *mem = nullptr;
+ mkldnn::memory* mem = nullptr;
if (IsDefaultFormat(desc)) {
- mem = const_cast<NDArray &>(out_arr).CreateMKLDNNData(desc);
+ mem = const_cast<NDArray&>(out_arr).CreateMKLDNNData(desc);
}
if (mem == nullptr) {
auto tmp = TmpMemMgr::Get()->Alloc(desc);
@@ -218,29 +216,29 @@ mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &out_arr,
}
}
-void CommitOutput(const NDArray &arr, const mkldnn_output_t &res) {
+void CommitOutput(const NDArray& arr, const mkldnn_output_t& res) {
if (res.first == CopyBack) {
- const_cast<NDArray &>(arr).CopyFrom(*res.second);
+ const_cast<NDArray&>(arr).CopyFrom(*res.second);
} else if (res.first == AddBack) {
auto res_memory = res.second;
- auto target_pd = arr.GetMKLDNNData()->get_desc();
- auto mem = arr.GetMKLDNNData(res.second->get_desc());
+ auto target_pd = arr.GetMKLDNNData()->get_desc();
+ auto mem = arr.GetMKLDNNData(res.second->get_desc());
if (mem == nullptr) {
auto tmp_memory = TmpMemMgr::Get()->Alloc(target_pd);
MKLDNNMemoryCopy(*res_memory, tmp_memory);
res_memory = tmp_memory;
- mem = arr.GetMKLDNNData();
+ mem = arr.GetMKLDNNData();
}
op::MKLDNNSum(*mem, *res_memory, *mem);
}
}
-const mkldnn::memory *GetWeights(const NDArray &arr, int num_groups) {
+const mkldnn::memory* GetWeights(const NDArray& arr, int num_groups) {
const auto type = get_mkldnn_type(arr.dtype());
- auto tz = mkldnn::memory::dims{0};
+ auto tz = mkldnn::memory::dims{0};
auto format_tag = mkldnn::memory::format_tag::undef;
- auto engine = CpuEngine::Get()->get_engine();
- const int ndim = arr.shape().ndim();
+ auto engine = CpuEngine::Get()->get_engine();
+ const int ndim = arr.shape().ndim();
int O = 0, I = 1, H = 2, W = 3;
int D = -1;
if (ndim == 5) {
@@ -249,35 +247,38 @@ const mkldnn::memory *GetWeights(const NDArray &arr, int num_groups) {
W = 4;
}
if (ndim == 2) {
- tz = mkldnn::memory::dims{arr.shape()[O], arr.shape()[I]};
+ tz = mkldnn::memory::dims{arr.shape()[O], arr.shape()[I]};
format_tag = mkldnn::memory::format_tag::oi;
} else if (ndim == 3) {
- tz = num_groups > 1
- ? mkldnn::memory::dims{num_groups, arr.shape()[O] / num_groups,
- arr.shape()[I], arr.shape()[H]}
- : mkldnn::memory::dims{arr.shape()[O],
- arr.shape()[I], arr.shape()[H]};
- format_tag = num_groups > 1 ? mkldnn::memory::format_tag::goiw
- : mkldnn::memory::format_tag::oiw;
+ tz = num_groups > 1 ? mkldnn::memory::dims{num_groups,
+ arr.shape()[O] / num_groups,
+ arr.shape()[I],
+ arr.shape()[H]}
+ : mkldnn::memory::dims{arr.shape()[O], arr.shape()[I], arr.shape()[H]};
+ format_tag =
+ num_groups > 1 ? mkldnn::memory::format_tag::goiw : mkldnn::memory::format_tag::oiw;
} else if (ndim == 4) {
tz = num_groups > 1
- ? mkldnn::memory::dims{num_groups, arr.shape()[O] / num_groups,
- arr.shape()[I], arr.shape()[H],
+ ? mkldnn::memory::dims{num_groups,
+ arr.shape()[O] / num_groups,
+ arr.shape()[I],
+ arr.shape()[H],
arr.shape()[W]}
- : mkldnn::memory::dims{
- arr.shape()[O], arr.shape()[I], arr.shape()[H], arr.shape()[W]};
- format_tag = num_groups > 1 ? mkldnn::memory::format_tag::goihw
- : mkldnn::memory::format_tag::oihw;
+ : mkldnn::memory::dims{arr.shape()[O], arr.shape()[I], arr.shape()[H], arr.shape()[W]};
+ format_tag =
+ num_groups > 1 ? mkldnn::memory::format_tag::goihw : mkldnn::memory::format_tag::oihw;
} else if (ndim == 5) {
tz = num_groups > 1
- ? mkldnn::memory::dims{num_groups, arr.shape()[O] / num_groups,
- arr.shape()[I], arr.shape()[D],
- arr.shape()[H], arr.shape()[W]}
+ ? mkldnn::memory::dims{num_groups,
+ arr.shape()[O] / num_groups,
+ arr.shape()[I],
+ arr.shape()[D],
+ arr.shape()[H],
+ arr.shape()[W]}
: mkldnn::memory::dims{
- arr.shape()[O], arr.shape()[I], arr.shape()[D],
- arr.shape()[H], arr.shape()[W]};
- format_tag = num_groups > 1 ? mkldnn::memory::format_tag::goidhw
- : mkldnn::memory::format_tag::oidhw;
+ arr.shape()[O], arr.shape()[I], arr.shape()[D], arr.shape()[H], arr.shape()[W]};
+ format_tag =
+ num_groups > 1 ? mkldnn::memory::format_tag::goidhw : mkldnn::memory::format_tag::oidhw;
} else {
LOG(FATAL) << "The weight array has an unsupported number of dimensions";
}
@@ -285,37 +286,41 @@ const mkldnn::memory *GetWeights(const NDArray &arr, int num_groups) {
return arr.GetMKLDNNData(md);
}
-const mkldnn::memory *GetWeights(const NDArray &arr,
- const mkldnn::memory::desc &target_desc, int num_groups) {
- const mkldnn::memory *mem = arr.GetMKLDNNData(target_desc);
- // If the weight array already uses the target layout, simply return it directly.
- if (mem) return mem;
+const mkldnn::memory* GetWeights(const NDArray& arr,
+ const mkldnn::memory::desc& target_desc,
+ int num_groups) {
+ const mkldnn::memory* mem = arr.GetMKLDNNData(target_desc);
+ // If the weight array already uses the target layout, simply return it
+ // directly.
+ if (mem)
+ return mem;
mem = GetWeights(arr, num_groups);
- if (mem == nullptr) mem = arr.GetMKLDNNDataReorder(target_desc);
- if (mem->get_desc() == target_desc) return mem;
+ if (mem == nullptr)
+ mem = arr.GetMKLDNNDataReorder(target_desc);
+ if (mem->get_desc() == target_desc)
+ return mem;
auto ret = TmpMemMgr::Get()->Alloc(target_desc);
- std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, *mem},
- {MKLDNN_ARG_TO, *ret}});
+ std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, *mem}, {MKLDNN_ARG_TO, *ret}});
MKLDNNStream::Get()->RegisterPrimArgs(mkldnn::reorder(*mem, *ret), args);
return ret;
}
-
// default: block and dims' stride increase monotonically
-// mkldnn: 1.winograd 2.rnn packed 3. block and dims'stride is not increase monotonically
-bool IsMKLDNN(const mkldnn::memory::desc &desc) {
+// mkldnn: 1.winograd 2.rnn packed 3. block and dims'stride is not increase
+// monotonically
+bool IsMKLDNN(const mkldnn::memory::desc& desc) {
bool rslt = true;
if (desc.data.format_kind == mkldnn_blocked) {
if (desc.data.format_desc.blocking.inner_nblks == 0) {
int i = 0;
- for (i = 0; i < desc.data.ndims-1; i++) {
- if (desc.data.format_desc.blocking.strides[i]
- < desc.data.format_desc.blocking.strides[i + 1]) {
+ for (i = 0; i < desc.data.ndims - 1; i++) {
+ if (desc.data.format_desc.blocking.strides[i] <
+ desc.data.format_desc.blocking.strides[i + 1]) {
break;
}
}
- if (i == desc.data.ndims-1) {
+ if (i == desc.data.ndims - 1) {
rslt = false;
}
}
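Note: per the comment above, a "default" (plain) layout has no inner blocking and its strides never increase from the outermost to the innermost dimension; blocked, winograd and rnn-packed layouts are the ones IsMKLDNN()/IsDefaultFormat() are meant to tell apart. A small hedged illustration of the same stride check over explicit strides:

    // For a plain row-major NCHW tensor of shape (n, c, h, w) the strides are
    // (c*h*w, h*w, w, 1): non-increasing from left to right, so the layout is
    // "default". Blocked layouts (inner_nblks != 0, e.g. nChw16c) are excluded
    // earlier; this stride check additionally rejects permuted plain layouts.
    bool StridesAreDefault(const std::vector<size_t>& strides) {
      for (size_t i = 0; i + 1 < strides.size(); ++i)
        if (strides[i] < strides[i + 1])
          return false;  // an inner dimension strides farther than an outer one
      return true;
    }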
@@ -325,34 +330,40 @@ bool IsMKLDNN(const mkldnn::memory::desc &desc) {
mkldnn_format_tag_t GetDefaultFormat(int num_dims) {
switch (num_dims) {
- case 1: return mkldnn_a;
- case 2: return mkldnn_ab;
- case 3: return mkldnn_abc;
- case 4: return mkldnn_abcd;
- case 5: return mkldnn_abcde;
- case 6: return mkldnn_abcdef;
+ case 1:
+ return mkldnn_a;
+ case 2:
+ return mkldnn_ab;
+ case 3:
+ return mkldnn_abc;
+ case 4:
+ return mkldnn_abcd;
+ case 5:
+ return mkldnn_abcde;
+ case 6:
+ return mkldnn_abcdef;
default:
LOG(FATAL) << "Not implemented dimension (" << num_dims << ") for MKLDNN";
return mkldnn_format_tag_undef;
}
}
-mkldnn_format_tag_t GetDefaultFormat(const mkldnn::memory::desc &desc) {
+mkldnn_format_tag_t GetDefaultFormat(const mkldnn::memory::desc& desc) {
return GetDefaultFormat(desc.data.ndims);
}
-bool IsDefaultFormat(const mkldnn::memory::desc &desc) {
+bool IsDefaultFormat(const mkldnn::memory::desc& desc) {
bool rslt = false;
if (desc.data.format_kind == mkldnn_blocked) {
if (desc.data.format_desc.blocking.inner_nblks == 0) {
int i = 0;
- for (i = 0; i < desc.data.ndims-1; i++) {
- if (desc.data.format_desc.blocking.strides[i]
- < desc.data.format_desc.blocking.strides[i + 1]) {
+ for (i = 0; i < desc.data.ndims - 1; i++) {
+ if (desc.data.format_desc.blocking.strides[i] <
+ desc.data.format_desc.blocking.strides[i + 1]) {
break;
}
}
- if (i == desc.data.ndims-1) {
+ if (i == desc.data.ndims - 1) {
rslt = true;
}
}
@@ -360,20 +371,18 @@ bool IsDefaultFormat(const mkldnn::memory::desc &desc) {
return rslt;
}
-mkldnn::memory::desc GetDesc(const mkldnn::memory::desc &desc,
- const mkldnn_format_tag_t &format) {
+mkldnn::memory::desc GetDesc(const mkldnn::memory::desc& desc, const mkldnn_format_tag_t& format) {
mkldnn::memory::dims dims(desc.data.ndims);
for (size_t i = 0; i < dims.size(); i++)
dims[i] = desc.data.dims[i];
mkldnn::memory::format_tag cpp_format = static_cast<mkldnn::memory::format_tag>(format);
- mkldnn::memory::data_type cpp_type = static_cast<mkldnn::memory::data_type>(
- desc.data.data_type);
+ mkldnn::memory::data_type cpp_type = static_cast<mkldnn::memory::data_type>(desc.data.data_type);
mkldnn::memory::desc data_md(dims, cpp_type, cpp_format);
return mkldnn::memory::desc(dims, cpp_type, cpp_format);
}
// reorder mkldnn src to dst format dtype
-void ReorderTo(const mkldnn::memory *src, const mkldnn::memory *dst) {
+void ReorderTo(const mkldnn::memory* src, const mkldnn::memory* dst) {
mkldnn::stream s(CpuEngine::Get()->get_engine());
auto new_src = *src;
auto new_dst = *dst;
@@ -381,11 +390,12 @@ void ReorderTo(const mkldnn::memory *src, const mkldnn::memory *dst) {
}
template <typename Compute, typename AttrState>
-void FallBackCompute(Compute fn, const AttrState &attrs_states,
- const OpContext &ctx,
- const std::vector<NDArray> &inputs,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs) {
+void FallBackCompute(Compute fn,
+ const AttrState& attrs_states,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs) {
std::vector<TBlob> in_blobs(inputs.size());
std::vector<NDArray> in_bufs;
std::vector<OpReqType> new_req = req;
@@ -427,7 +437,7 @@ void FallBackCompute(Compute fn, const AttrState &attrs_states,
// ensure output does not use mkldnn mem.
// for inplace, we already converted & copied input above.
if ((req[i] == kWriteTo) || (req[i] == kWriteInplace)) {
- const_cast<NDArray &>(output).InvalidateMKLDNNData();
+ const_cast<NDArray&>(output).InvalidateMKLDNNData();
if (req[i] == kWriteInplace) {
new_req[i] = kWriteTo;
}
@@ -454,18 +464,20 @@ void FallBackCompute(Compute fn, const AttrState &attrs_states,
}
}
-template<typename DType>
-void print_diff(const mxnet::NDArray &arr1, const mxnet::NDArray &arr2) {
- DType *data1 = reinterpret_cast<DType *>(arr1.data().dptr_);
- DType *data2 = reinterpret_cast<DType *>(arr2.data().dptr_);
+template <typename DType>
+void print_diff(const mxnet::NDArray& arr1, const mxnet::NDArray& arr2) {
+ DType* data1 = reinterpret_cast<DType*>(arr1.data().dptr_);
+ DType* data2 = reinterpret_cast<DType*>(arr2.data().dptr_);
for (size_t i = 0; i < arr1.shape().Size(); i++)
std::cout << data1[i] - data2[i] << ", ";
std::cout << std::endl;
}
-template<typename DType>
-static bool SimilarArray(const mxnet::NDArray &arr1, const mxnet::NDArray &arr2,
- DType rtol, DType atol) {
+template <typename DType>
+static bool SimilarArray(const mxnet::NDArray& arr1,
+ const mxnet::NDArray& arr2,
+ DType rtol,
+ DType atol) {
if (arr1.shape().Size() != arr2.shape().Size())
return false;
@@ -476,21 +488,21 @@ static bool SimilarArray(const mxnet::NDArray &arr1, const mxnet::NDArray &arr2,
// But we shouldn't reorder data in the original array.
NDArray buf1, buf2;
if (arr1.IsMKLDNNData()) {
- buf1 = NDArray(arr1.shape(), arr1.ctx(), false, arr1.dtype());
+ buf1 = NDArray(arr1.shape(), arr1.ctx(), false, arr1.dtype());
auto mem = arr1.GetMKLDNNData();
buf1.CopyFrom(*mem);
}
if (arr2.IsMKLDNNData()) {
- buf2 = NDArray(arr2.shape(), arr2.ctx(), false, arr2.dtype());
+ buf2 = NDArray(arr2.shape(), arr2.ctx(), false, arr2.dtype());
auto mem = arr2.GetMKLDNNData();
buf2.CopyFrom(*mem);
}
MKLDNNStream::Get()->Submit();
- DType *data1 = reinterpret_cast<DType *>(
- arr1.IsMKLDNNData() ? buf1.data().dptr_: arr1.data().dptr_);
- DType *data2 = reinterpret_cast<DType *>(
- arr2.IsMKLDNNData() ? buf2.data().dptr_: arr2.data().dptr_);
+ DType* data1 =
+ reinterpret_cast<DType*>(arr1.IsMKLDNNData() ? buf1.data().dptr_ : arr1.data().dptr_);
+ DType* data2 =
+ reinterpret_cast<DType*>(arr2.IsMKLDNNData() ? buf2.data().dptr_ : arr2.data().dptr_);
std::atomic<bool> success(true);
#pragma omp parallel for
#ifdef _MSC_VER
@@ -505,39 +517,42 @@ static bool SimilarArray(const mxnet::NDArray &arr1, const mxnet::NDArray &arr2,
return success.load();
}
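Note: the per-element comparison itself falls outside this hunk; it conventionally follows the numpy-style allclose form, sketched below under that assumption (helper name and use of <cmath> are illustrative):

    // Assumed allclose-style predicate for one element pair:
    //   |a - b| <= atol + rtol * |b|
    template <typename DType>
    bool ElementClose(DType a, DType b, DType rtol, DType atol) {
      const double diff = std::abs(static_cast<double>(a) - static_cast<double>(b));
      return diff <= static_cast<double>(atol) +
                     static_cast<double>(rtol) * std::abs(static_cast<double>(b));
    }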
-template void FallBackCompute(void (*)(nnvm::NodeAttrs const &, OpContext const &,
- std::vector<TBlob, std::allocator<TBlob> > const &,
- std::vector<OpReqType, std::allocator<OpReqType> > const &,
- std::vector<TBlob, std::allocator<TBlob> > const &),
- nnvm::NodeAttrs const &, OpContext const &,
- std::vector<NDArray, std::allocator<NDArray> > const &,
- std::vector<OpReqType, std::allocator<OpReqType> > const &,
- std::vector<NDArray, std::allocator<NDArray> > const &);
-
-template void FallBackCompute(void (*)(OpStatePtr const &, OpContext const &,
- std::vector<TBlob, std::allocator<TBlob> > const &,
- std::vector<OpReqType, std::allocator<OpReqType> > const &,
- std::vector<TBlob, std::allocator<TBlob> > const &),
- OpStatePtr const &, OpContext const &,
- std::vector<NDArray, std::allocator<NDArray> > const &,
- std::vector<OpReqType, std::allocator<OpReqType> > const &,
- std::vector<NDArray, std::allocator<NDArray> > const &);
-
-void OpCheck::Init(const std::vector<mxnet::NDArray> &inputs_,
- const std::vector<mxnet::NDArray> &outputs_) {
+template void FallBackCompute(void (*)(nnvm::NodeAttrs const&,
+ OpContext const&,
+ std::vector<TBlob, std::allocator<TBlob>> const&,
+ std::vector<OpReqType, std::allocator<OpReqType>> const&,
+ std::vector<TBlob, std::allocator<TBlob>> const&),
+ nnvm::NodeAttrs const&,
+ OpContext const&,
+ std::vector<NDArray, std::allocator<NDArray>> const&,
+ std::vector<OpReqType, std::allocator<OpReqType>> const&,
+ std::vector<NDArray, std::allocator<NDArray>> const&);
+
+template void FallBackCompute(void (*)(OpStatePtr const&,
+ OpContext const&,
+ std::vector<TBlob, std::allocator<TBlob>> const&,
+ std::vector<OpReqType, std::allocator<OpReqType>> const&,
+ std::vector<TBlob, std::allocator<TBlob>> const&),
+ OpStatePtr const&,
+ OpContext const&,
+ std::vector<NDArray, std::allocator<NDArray>> const&,
+ std::vector<OpReqType, std::allocator<OpReqType>> const&,
+ std::vector<NDArray, std::allocator<NDArray>> const&);
+
+void OpCheck::Init(const std::vector<mxnet::NDArray>& inputs_,
+ const std::vector<mxnet::NDArray>& outputs_) {
auto ctx = inputs_[0].ctx();
CHECK(!MKLDNNStream::Get()->HasOps());
for (size_t i = 0; i < inputs_.size(); i++) {
NDArray data = inputs_[i];
inputs.emplace_back(data.shape(), ctx, false, data.dtype());
if (data.IsMKLDNNData() && data.IsView())
- data = data.Reorder2Default();
+ data = data.Reorder2Default();
auto mem = data.GetMKLDNNData();
inputs[i].CopyFrom(*mem);
}
for (size_t i = 0; i < outputs_.size(); i++) {
- outputs.emplace_back(outputs_[i].shape(), ctx,
- false, outputs_[i].dtype());
+ outputs.emplace_back(outputs_[i].shape(), ctx, false, outputs_[i].dtype());
if (backward) {
auto mem = outputs_[i].GetMKLDNNData();
outputs[i].CopyFrom(*mem);
@@ -546,18 +561,20 @@ void OpCheck::Init(const std::vector<mxnet::NDArray> &inputs_,
MKLDNNStream::Get()->Submit();
}
-void OpCheck::Run(mxnet::FCompute fn, const nnvm::NodeAttrs &attrs,
- const mxnet::OpContext &ctx,
- const std::vector<mxnet::NDArray> &inputs_,
- const std::vector<mxnet::OpReqType> &req,
- const std::vector<mxnet::NDArray> &outputs_) {
+void OpCheck::Run(mxnet::FCompute fn,
+ const nnvm::NodeAttrs& attrs,
+ const mxnet::OpContext& ctx,
+ const std::vector<mxnet::NDArray>& inputs_,
+ const std::vector<mxnet::OpReqType>& req,
+ const std::vector<mxnet::NDArray>& outputs_) {
static auto& is_excluded = Op::GetAttr<bool>("TExcludeMKLDNNDebug");
if (is_excluded.get(attrs.op, false)) {
LOG(WARNING) << attrs.op->name << " not checked. TExcludeMKLDNNDebug flag present";
return;
}
std::vector<mxnet::TBlob> in_blobs(inputs.size());
- for (size_t i = 0; i < in_blobs.size(); i++) in_blobs[i] = inputs[i].data();
+ for (size_t i = 0; i < in_blobs.size(); i++)
+ in_blobs[i] = inputs[i].data();
std::vector<mxnet::TBlob> out_blobs(outputs.size());
for (size_t i = 0; i < out_blobs.size(); i++)
out_blobs[i] = outputs[i].data();
@@ -565,7 +582,7 @@ void OpCheck::Run(mxnet::FCompute fn, const nnvm::NodeAttrs &attrs,
if (dmlc::GetEnv("MXNET_MKLDNN_DEBUG", false))
LOG(INFO) << "test " << attrs.op->name;
size_t num = std::min(outputs.size(), outputs_.size());
- num = std::min(num_checks, num);
+ num = std::min(num_checks, num);
for (size_t i = 0; i < num; i++) {
// We don't need to compare if it doesn't need to output data.
if (req[i] == kNullOp)
@@ -580,10 +597,10 @@ void OpCheck::Run(mxnet::FCompute fn, const nnvm::NodeAttrs &attrs,
}
}
-void OpCheck::CopyResult(const std::vector<mxnet::NDArray> &outputs_,
- const std::vector<size_t> &indice) {
+void OpCheck::CopyResult(const std::vector<mxnet::NDArray>& outputs_,
+ const std::vector<size_t>& indice) {
CHECK(!MKLDNNStream::Get()->HasOps());
- auto non_const_outputs_ = const_cast<std::vector<mxnet::NDArray> &>(outputs_);
+ auto non_const_outputs_ = const_cast<std::vector<mxnet::NDArray>&>(outputs_);
for (auto i = indice.begin(); i != indice.end(); ++i) {
auto mem = outputs[*i].GetMKLDNNData();
non_const_outputs_[*i].CopyFrom(*mem);
@@ -591,14 +608,15 @@ void OpCheck::CopyResult(const std::vector<mxnet::NDArray> &outputs_,
MKLDNNStream::Get()->Submit();
}
-bool MKLDNNStorageType(const nnvm::NodeAttrs &attrs,
+bool MKLDNNStorageType(const nnvm::NodeAttrs& attrs,
const int dev_mask,
bool support_mkldnn,
- DispatchMode *dispatch_mode,
- std::vector<int> *in_attrs,
- std::vector<int> *out_attrs) {
+ DispatchMode* dispatch_mode,
+ std::vector<int>* in_attrs,
+ std::vector<int>* out_attrs) {
for (int& v : *in_attrs)
- if (v == - 1) v = kDefaultStorage;
+ if (v == -1)
+ v = kDefaultStorage;
DispatchMode wanted_mode;
#if MXNET_USE_MKLDNN == 1
@@ -612,8 +630,8 @@ bool MKLDNNStorageType(const nnvm::NodeAttrs &attrs,
bool dispatched = false;
if (!dispatched && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)) {
- dispatched = op::storage_type_assign(out_attrs, mxnet::kDefaultStorage,
- dispatch_mode, wanted_mode);
+ dispatched =
+ op::storage_type_assign(out_attrs, mxnet::kDefaultStorage, dispatch_mode, wanted_mode);
}
if (!dispatched) {
dispatched = op::dispatch_fallback(out_attrs, dispatch_mode);
@@ -621,10 +639,10 @@ bool MKLDNNStorageType(const nnvm::NodeAttrs &attrs,
return dispatched;
}
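Note: operators typically plug this into their storage-type inference through a thin wrapper that fixes the support_mkldnn flag; a hedged sketch with an illustrative name, not part of this patch:

    // Illustrative per-operator wrapper around MKLDNNStorageType; such a
    // function is what gets registered as the op's FInferStorageType attribute.
    static bool SketchStorageType(const nnvm::NodeAttrs& attrs,
                                  const int dev_mask,
                                  DispatchMode* dispatch_mode,
                                  std::vector<int>* in_attrs,
                                  std::vector<int>* out_attrs) {
      return MKLDNNStorageType(attrs, dev_mask, /*support_mkldnn=*/true,
                               dispatch_mode, in_attrs, out_attrs);
    }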
-inline static const std::vector<NDArray> GetMKLDNNInputArray(const std::vector<NDArray> &inputs) {
+inline static const std::vector<NDArray> GetMKLDNNInputArray(const std::vector<NDArray>& inputs) {
std::vector<NDArray> ret;
ret.reserve(inputs.size());
- for (const auto &in : inputs) {
+ for (const auto& in : inputs) {
if (in.IsView() && in.IsMKLDNNData()) {
ret.push_back(in.Reorder2Default());
} else {
@@ -635,11 +653,11 @@ inline static const std::vector<NDArray> GetMKLDNNInputArray(const std::vector<N
}
void MKLDNNRun(mxnet::FComputeEx fn,
- const nnvm::NodeAttrs &attrs,
- const mxnet::OpContext &ctx,
- const std::vector<mxnet::NDArray> &inputs,
- const std::vector<mxnet::OpReqType> &req,
- const std::vector<mxnet::NDArray> &outputs) {
+ const nnvm::NodeAttrs& attrs,
+ const mxnet::OpContext& ctx,
+ const std::vector<mxnet::NDArray>& inputs,
+ const std::vector<mxnet::OpReqType>& req,
+ const std::vector<mxnet::NDArray>& outputs) {
if (CheckMKLDNNInputArrayIsView(inputs)) {
const auto mkldnn_inputs = GetMKLDNNInputArray(inputs);
fn(attrs, ctx, mkldnn_inputs, req, outputs);
@@ -649,11 +667,11 @@ void MKLDNNRun(mxnet::FComputeEx fn,
}
void MKLDNNRun(FComputeExUnary fn,
- const nnvm::NodeAttrs &attrs,
- const mxnet::OpContext &ctx,
- const mxnet::NDArray &input,
- const mxnet::OpReqType &req,
- const mxnet::NDArray &output) {
+ const nnvm::NodeAttrs& attrs,
+ const mxnet::OpContext& ctx,
+ const mxnet::NDArray& input,
+ const mxnet::OpReqType& req,
+ const mxnet::NDArray& output) {
auto mkldnn_input = input;
if (input.IsView() && input.IsMKLDNNData()) {
mkldnn_input = input.Reorder2Default();
diff --git a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h
index 75c7c4d..67d7841 100644
--- a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h
@@ -11,7 +11,7 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY92
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
@@ -21,56 +21,55 @@
* \file mkldnn_batch_norm.cc
* \brief
* \author Tao Lv
-*/
+ */
#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_NORM_INL_H_
#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_NORM_INL_H_
#if MXNET_USE_MKLDNN == 1
-#include <vector>
-#include <utility>
#include <mkldnn.hpp>
+#include <utility>
+#include <vector>
+
#include "../batch_norm-inl.h"
-#include "./mkldnn_ops-inl.h"
#include "./mkldnn_base-inl.h"
+#include "./mkldnn_ops-inl.h"
-#define VARIANCE_TO_INVSTD(__var$, __eps$) (1.0/std::sqrt((__var$) + DType(__eps$)))
-#define INVSTD_TO_VARIANCE(__invstd$, __eps$) ((1.0 / ((__invstd$) * (__invstd$))) - (__eps$))
namespace mxnet {
namespace op {
-typedef mkldnn::batch_normalization_forward::primitive_desc t_bn_f_pdesc;
-typedef mkldnn::batch_normalization_forward::desc t_bn_f_desc;
-typedef mkldnn::batch_normalization_backward::primitive_desc t_bn_b_pdesc;
-typedef mkldnn::batch_normalization_backward::desc t_bn_b_desc;
+typedef mkldnn::batch_normalization_forward::primitive_desc t_bn_f_pdesc;
+typedef mkldnn::batch_normalization_forward::desc t_bn_f_desc;
+typedef mkldnn::batch_normalization_backward::primitive_desc t_bn_b_pdesc;
+typedef mkldnn::batch_normalization_backward::desc t_bn_b_desc;
-inline static mkldnn::normalization_flags _GetFlags(const std::vector<NDArray> &in_data,
- const std::vector<NDArray> &aux_states,
+inline static mkldnn::normalization_flags _GetFlags(const std::vector<NDArray>& in_data,
+ const std::vector<NDArray>& aux_states,
bool is_train_and_not_global_stats,
bool fuse_relu) {
mkldnn::normalization_flags flags = static_cast<mkldnn::normalization_flags>(0U);
if (in_data.size() == 3U) {
- flags |= mkldnn::normalization_flags::use_scale_shift;
+ flags |= mkldnn::normalization_flags::use_scale_shift;
}
// aux_states[0]: inMean
// aux_states[1]: inVariance
if (aux_states.size() == 2U && !is_train_and_not_global_stats) {
- flags |= mkldnn::normalization_flags::use_global_stats;
+ flags |= mkldnn::normalization_flags::use_global_stats;
}
if (fuse_relu) {
- flags |= mkldnn::normalization_flags::fuse_norm_relu;
+ flags |= mkldnn::normalization_flags::fuse_norm_relu;
}
return flags;
}
-inline static t_bn_f_pdesc _GetFwd(const mkldnn::memory &data_mem,
+inline static t_bn_f_pdesc _GetFwd(const mkldnn::memory& data_mem,
bool is_train,
float eps,
mkldnn::normalization_flags flags) {
- auto data_md = data_mem.get_desc();
- auto engine = CpuEngine::Get()->get_engine();
+ auto data_md = data_mem.get_desc();
+ auto engine = CpuEngine::Get()->get_engine();
if (is_train) {
t_bn_f_desc bnFwd_desc(mkldnn::prop_kind::forward_training, data_md, eps, flags);
@@ -81,15 +80,15 @@ inline static t_bn_f_pdesc _GetFwd(const mkldnn::memory &data_mem,
}
}
-inline static t_bn_b_pdesc _GetBwd(const mkldnn::memory &data_mem,
- const mkldnn::memory &diff_mem,
+inline static t_bn_b_pdesc _GetBwd(const mkldnn::memory& data_mem,
+ const mkldnn::memory& diff_mem,
float eps,
mkldnn::normalization_flags flags) {
- auto data_md = data_mem.get_desc();
- auto diff_md = diff_mem.get_desc();
- auto engine = CpuEngine::Get()->get_engine();
+ auto data_md = data_mem.get_desc();
+ auto diff_md = diff_mem.get_desc();
+ auto engine = CpuEngine::Get()->get_engine();
- t_bn_b_desc bnBwd_desc(mkldnn::prop_kind::backward, diff_md, data_md, eps, flags);
+ t_bn_b_desc bnBwd_desc(mkldnn::prop_kind::backward, diff_md, data_md, eps, flags);
return t_bn_b_pdesc(bnBwd_desc, engine, _GetFwd(data_mem, true, eps, flags));
}
@@ -102,28 +101,29 @@ class MKLDNNBNForward {
t_bn_f_pdesc pd;
public:
- MKLDNNBNForward(const t_bn_f_pdesc &_pd, bool is_train_and_not_global_stats): pd(_pd) {
+ MKLDNNBNForward(const t_bn_f_pdesc& _pd, bool is_train_and_not_global_stats) : pd(_pd) {
weight_m.reset(new mkldnn::memory(pd.weights_desc(), CpuEngine::Get()->get_engine()));
fwd.reset(new mkldnn::batch_normalization_forward(pd));
this->is_train_and_not_global_stats = is_train_and_not_global_stats;
}
- const mkldnn::memory &GetWeight() const {
+ const mkldnn::memory& GetWeight() const {
return *weight_m;
}
- const t_bn_f_pdesc &GetPd() const {
+ const t_bn_f_pdesc& GetPd() const {
return pd;
}
- const mkldnn::batch_normalization_forward &GetFwd() const {
+ const mkldnn::batch_normalization_forward& GetFwd() const {
return *fwd;
}
};
-template<typename DType>
-static MKLDNNBNForward &GetBNForward(const BatchNormParam& param,
- const OpContext &ctx, const mkldnn::memory *data_mem,
+template <typename DType>
+static MKLDNNBNForward& GetBNForward(const BatchNormParam& param,
+ const OpContext& ctx,
+ const mkldnn::memory* data_mem,
mkldnn::normalization_flags flags) {
#if DMLC_CXX11_THREAD_LOCAL
static thread_local std::unordered_map<MKLDNNBNSignature, MKLDNNBNForward, OpHash> fwds;
@@ -137,8 +137,7 @@ static MKLDNNBNForward &GetBNForward(const BatchNormParam& param,
auto it = fwds.find(key);
if (it == fwds.end()) {
- auto fwd_pd = _GetFwd(*data_mem, ctx.is_train,
- param.eps, flags);
+ auto fwd_pd = _GetFwd(*data_mem, ctx.is_train, param.eps, flags);
MKLDNNBNForward fwd(fwd_pd, ctx.is_train && !param.use_global_stats);
it = AddToCache(&fwds, key, fwd);
}
@@ -146,10 +145,13 @@ static MKLDNNBNForward &GetBNForward(const BatchNormParam& param,
}
template <typename DType>
-void MKLDNNBatchNormForward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
- const std::vector<NDArray> &inputs, const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs, bool fuse_relu) {
- const BatchNormParam ¶m = nnvm::get<BatchNormParam>(attrs.parsed);
+void MKLDNNBatchNormForward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs,
+ bool fuse_relu) {
+ const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
std::vector<NDArray> in_data(inputs.begin(), inputs.begin() + batchnorm::kInMovingMean);
mxnet::TShape shape = inputs[batchnorm::kData].shape();
@@ -159,96 +161,92 @@ void MKLDNNBatchNormForward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
if (param.axis != 1 || shape.ndim() != 4) {
// reshape to (N, C, 1, D)
mxnet::TShape new_shape{
- static_cast<dim_t>(shape.ProdShape(0, real_axis)),
- shape[real_axis],
- 1,
- static_cast<dim_t>(shape.ProdShape(real_axis + 1,
- static_cast<int>(shape.ndim())))
- };
+ static_cast<dim_t>(shape.ProdShape(0, real_axis)),
+ shape[real_axis],
+ 1,
+ static_cast<dim_t>(shape.ProdShape(real_axis + 1, static_cast<int>(shape.ndim())))};
in_data[batchnorm::kData] = in_data[batchnorm::kData].Reshape(new_shape);
- out = out.Reshape(new_shape);
+ out = out.Reshape(new_shape);
}
const std::vector<NDArray> aux_states(inputs.begin() + batchnorm::kInMovingMean, inputs.end());
TmpMemMgr::Get()->Init(ctx.requested[batchnorm::kTempSpace]);
- mkldnn::normalization_flags flags = _GetFlags(in_data,
- aux_states,
- ctx.is_train && !param.use_global_stats,
- fuse_relu);
- NDArray &data = in_data[batchnorm::kData];
+ mkldnn::normalization_flags flags =
+ _GetFlags(in_data, aux_states, ctx.is_train && !param.use_global_stats, fuse_relu);
+ NDArray& data = in_data[batchnorm::kData];
if (data.IsMKLDNNData() && data.IsView())
data = data.Reorder2Default();
auto data_mem = data.GetMKLDNNData();
- auto &fwd = GetBNForward<DType>(param, ctx, data_mem, flags);
+ auto& fwd = GetBNForward<DType>(param, ctx, data_mem, flags);
// for output memory
- auto out_mem = const_cast<NDArray &>(out).CreateMKLDNNData(fwd.GetPd().dst_desc());
+ auto out_mem = const_cast<NDArray&>(out).CreateMKLDNNData(fwd.GetPd().dst_desc());
// mxnet will always use scale shift.
// But if fix_gamma is true, then all scale elements will be set to 1.0f
if (static_cast<int>(flags) & static_cast<int>(mkldnn::normalization_flags::use_scale_shift)) {
- const NDArray &gamma = in_data[batchnorm::kGamma];
- const NDArray &beta = in_data[batchnorm::kBeta];
+ const NDArray& gamma = in_data[batchnorm::kGamma];
+ const NDArray& beta = in_data[batchnorm::kBeta];
CHECK_EQ(gamma.storage_type(), mxnet::kDefaultStorage);
CHECK_EQ(beta.storage_type(), mxnet::kDefaultStorage);
- const mkldnn::memory &weight_mem = fwd.GetWeight();
- float* weight_buf = reinterpret_cast<float *>(weight_mem.get_data_handle());
+ const mkldnn::memory& weight_mem = fwd.GetWeight();
+ float* weight_buf = reinterpret_cast<float*>(weight_mem.get_data_handle());
nnvm::dim_t channels_ = data.shape()[1];
CHECK(weight_mem.get_desc().get_size() == channels_ * sizeof(float) * 2);
- float* weight_ptr = gamma.data().dptr<float>();
- float* bias_ptr = beta.data().dptr<float>();
+ float* weight_ptr = gamma.data().dptr<float>();
+ float* bias_ptr = beta.data().dptr<float>();
const size_t copy_size = sizeof(weight_buf[0]) * channels_;
if (!param.fix_gamma) {
memcpy(weight_buf, weight_ptr, copy_size);
memcpy(&weight_buf[channels_], bias_ptr, copy_size);
} else if (IsBNWriting(req[batchnorm::kGamma])) {
for (int i = 0; i < channels_; i++) {
- weight_buf[i] = 1.0f;
- weight_ptr[i] = 1.0f;
+ weight_buf[i] = 1.0f;
+ weight_ptr[i] = 1.0f;
weight_buf[channels_ + i] = bias_ptr[i]; // bias
}
} else {
for (int i = 0; i < channels_; i++) {
- weight_buf[i] = 1.0f;
+ weight_buf[i] = 1.0f;
weight_buf[channels_ + i] = bias_ptr[i]; // bias
}
}
mkldnn_args_map_t net_args;
- net_args[MKLDNN_ARG_SRC] = *data_mem;
+ net_args[MKLDNN_ARG_SRC] = *data_mem;
net_args[MKLDNN_ARG_SCALE_SHIFT] = weight_mem;
- net_args[MKLDNN_ARG_DST] = *out_mem;
+ net_args[MKLDNN_ARG_DST] = *out_mem;
if (fuse_relu) {
- const NDArray *workspace = nullptr;
- workspace = &outputs[3];
- auto engine = CpuEngine::Get()->get_engine();
+ const NDArray* workspace = nullptr;
+ workspace = &outputs[3];
+ auto engine = CpuEngine::Get()->get_engine();
if (workspace == nullptr) {
- LOG(FATAL) << "MKLDNN BatchNorm: incorrect workspace input";
+ LOG(FATAL) << "MKLDNN BatchNorm: incorrect workspace input";
}
- auto ws = std::make_shared<mkldnn::memory>(fwd.GetPd().workspace_desc(),
- engine, workspace->GetMKLDNNData()->get_data_handle());
+ auto ws = std::make_shared<mkldnn::memory>(
+ fwd.GetPd().workspace_desc(), engine, workspace->GetMKLDNNData()->get_data_handle());
net_args[MKLDNN_ARG_WORKSPACE] = *ws;
}
if (!ctx.is_train || param.use_global_stats) {
- float* omean = outputs[batchnorm::kMean].data().dptr<float>();
- float* ovar = outputs[batchnorm::kVar].data().dptr<float>();
- float* inmean = aux_states[batchnorm::kMovingMean].data().dptr<float>();
- float* invar = aux_states[batchnorm::kMovingVar].data().dptr<float>();
+ float* omean = outputs[batchnorm::kMean].data().dptr<float>();
+ float* ovar = outputs[batchnorm::kVar].data().dptr<float>();
+ float* inmean = aux_states[batchnorm::kMovingMean].data().dptr<float>();
+ float* invar = aux_states[batchnorm::kMovingVar].data().dptr<float>();
// to align with origin implmentation: batch_norm.cc: L164
for (int i = 0; i < channels_; i++) {
omean[i] = inmean[i];
- ovar[i] = VARIANCE_TO_INVSTD(invar[i], param.eps);
+ ovar[i] = VARIANCE_TO_INVSTD(invar[i], param.eps);
}
- net_args[MKLDNN_ARG_MEAN] = *(aux_states[batchnorm::kMovingMean].GetMKLDNNData());
+ net_args[MKLDNN_ARG_MEAN] = *(aux_states[batchnorm::kMovingMean].GetMKLDNNData());
net_args[MKLDNN_ARG_VARIANCE] = *(aux_states[batchnorm::kMovingVar].GetMKLDNNData());
MKLDNNStream::Get()->RegisterPrimArgs(fwd.GetFwd(), net_args);
MKLDNNStream::Get()->Submit();
} else { // training
- const NDArray &outMean = outputs[batchnorm::kMean];
- const NDArray &outVar = outputs[batchnorm::kVar];
- net_args[MKLDNN_ARG_MEAN] = *(outMean.GetMKLDNNData());
+ const NDArray& outMean = outputs[batchnorm::kMean];
+ const NDArray& outVar = outputs[batchnorm::kVar];
+ net_args[MKLDNN_ARG_MEAN] = *(outMean.GetMKLDNNData());
net_args[MKLDNN_ARG_VARIANCE] = *(outVar.GetMKLDNNData());
MKLDNNStream::Get()->RegisterPrimArgs(fwd.GetFwd(), net_args);
MKLDNNStream::Get()->Submit();
@@ -271,25 +269,34 @@ class MKLDNNBNBackward {
public:
const t_bn_b_pdesc pd;
- explicit MKLDNNBNBackward(const t_bn_b_pdesc &_pd)
+ explicit MKLDNNBNBackward(const t_bn_b_pdesc& _pd)
: weight_m(new mkldnn::memory(_pd.weights_desc(), CpuEngine::Get()->get_engine())),
gradw_m(new mkldnn::memory(_pd.diff_weights_desc(), CpuEngine::Get()->get_engine())),
pd(_pd) {
bwd.reset(new mkldnn::batch_normalization_backward(pd));
}
- const mkldnn::memory &GetWeight() const { return *weight_m; }
+ const mkldnn::memory& GetWeight() const {
+ return *weight_m;
+ }
- const mkldnn::memory &GetGradw() const { return *gradw_m; }
+ const mkldnn::memory& GetGradw() const {
+ return *gradw_m;
+ }
- const mkldnn::batch_normalization_backward &GetBwd() const { return *bwd; }
+ const mkldnn::batch_normalization_backward& GetBwd() const {
+ return *bwd;
+ }
};
template <typename DType>
-static MKLDNNBNBackward &GetBNBackward(
- const BatchNormParam ¶m, const OpContext &ctx, const NDArray &in_data,
- const mkldnn::memory &in_mem, const NDArray &diff_data,
- const mkldnn::memory &diff_mem, mkldnn::normalization_flags flags) {
+static MKLDNNBNBackward& GetBNBackward(const BatchNormParam& param,
+ const OpContext& ctx,
+ const NDArray& in_data,
+ const mkldnn::memory& in_mem,
+ const NDArray& diff_data,
+ const mkldnn::memory& diff_mem,
+ mkldnn::normalization_flags flags) {
#if DMLC_CXX11_THREAD_LOCAL
static thread_local std::unordered_map<MKLDNNBNSignature, MKLDNNBNBackward, OpHash> bwds;
#else
@@ -310,41 +317,42 @@ static MKLDNNBNBackward &GetBNBackward(
}
template <typename DType>
-void MKLDNNBatchNormBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
- const std::vector<NDArray> &inputs, const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs, bool fuse_relu) {
+void MKLDNNBatchNormBackward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs,
+ bool fuse_relu) {
if (fuse_relu) {
CHECK_EQ(inputs.size(), 9U);
} else {
CHECK_EQ(inputs.size(), 8U);
}
- const BatchNormParam ¶m = nnvm::get<BatchNormParam>(attrs.parsed);
+ const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
std::vector<NDArray> out_grad(1);
std::vector<NDArray> out_data(3);
std::vector<NDArray> in_data(3);
std::vector<NDArray> aux_states(2);
- out_grad[0] = inputs[0];
- out_data[batchnorm::kMean] = inputs[1];
- out_data[batchnorm::kVar] = inputs[2];
- in_data[batchnorm::kData] = inputs[3];
- in_data[batchnorm::kGamma] = inputs[4];
- in_data[batchnorm::kBeta] = inputs[5];
- aux_states[batchnorm::kMovingMean] = inputs[6];
- aux_states[batchnorm::kMovingVar] = inputs[7];
- const std::vector<NDArray> &in_grad = outputs;
+ out_grad[0] = inputs[0];
+ out_data[batchnorm::kMean] = inputs[1];
+ out_data[batchnorm::kVar] = inputs[2];
+ in_data[batchnorm::kData] = inputs[3];
+ in_data[batchnorm::kGamma] = inputs[4];
+ in_data[batchnorm::kBeta] = inputs[5];
+ aux_states[batchnorm::kMovingMean] = inputs[6];
+ aux_states[batchnorm::kMovingVar] = inputs[7];
+ const std::vector<NDArray>& in_grad = outputs;
TmpMemMgr::Get()->Init(ctx.requested[batchnorm::kTempSpace]);
- mkldnn::normalization_flags flags = _GetFlags(in_data,
- aux_states,
- ctx.is_train && !param.use_global_stats,
- fuse_relu);
-
- NDArray data = in_data[batchnorm::kData];
- NDArray diff = out_grad[batchnorm::kOut];
- NDArray gradIn = in_grad[batchnorm::kData];
- const NDArray &moving_mean = aux_states[batchnorm::kMovingMean];
- const NDArray &moving_var = aux_states[batchnorm::kMovingVar];
- const NDArray &out_mean = out_data[batchnorm::kMean];
- const NDArray &out_var = out_data[batchnorm::kVar];
+ mkldnn::normalization_flags flags =
+ _GetFlags(in_data, aux_states, ctx.is_train && !param.use_global_stats, fuse_relu);
+
+ NDArray data = in_data[batchnorm::kData];
+ NDArray diff = out_grad[batchnorm::kOut];
+ NDArray gradIn = in_grad[batchnorm::kData];
+ const NDArray& moving_mean = aux_states[batchnorm::kMovingMean];
+ const NDArray& moving_var = aux_states[batchnorm::kMovingVar];
+ const NDArray& out_mean = out_data[batchnorm::kMean];
+ const NDArray& out_var = out_data[batchnorm::kVar];
CHECK(out_mean.IsDefaultData());
CHECK(out_var.IsDefaultData());
@@ -357,36 +365,34 @@ void MKLDNNBatchNormBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
if (param.axis != 1 || shape.ndim() != 4) {
// reshape to (N, C, 1, D)
mxnet::TShape new_shape{
- static_cast<dim_t>(shape.ProdShape(0, real_axis)),
- shape[real_axis],
- 1,
- static_cast<dim_t>(shape.ProdShape(real_axis + 1,
- static_cast<int>(shape.ndim())))
- };
- data = data.Reshape(new_shape);
- diff = diff.Reshape(new_shape);
+ static_cast<dim_t>(shape.ProdShape(0, real_axis)),
+ shape[real_axis],
+ 1,
+ static_cast<dim_t>(shape.ProdShape(real_axis + 1, static_cast<int>(shape.ndim())))};
+ data = data.Reshape(new_shape);
+ diff = diff.Reshape(new_shape);
gradIn = gradIn.Reshape(new_shape);
}
- auto data_mem = data.GetMKLDNNData();
- auto diff_mem = diff.GetMKLDNNData();
+ auto data_mem = data.GetMKLDNNData();
+ auto diff_mem = diff.GetMKLDNNData();
// MKLDNN batchnorm should run on special layouts. If one of them isn't, we
// should reorder them.
if (data.IsDefaultData())
data_mem = data.GetMKLDNNDataReorder(diff_mem->get_desc());
else if (diff.IsDefaultData())
diff_mem = diff.GetMKLDNNDataReorder(data_mem->get_desc());
- auto &bwd = GetBNBackward<DType>(param, ctx, data, *data_mem, diff, *diff_mem, flags);
- auto gradi_mem = CreateMKLDNNMem(const_cast<NDArray &>(gradIn),
- bwd.pd.diff_src_desc(), req[batchnorm::kData]);
+ auto& bwd = GetBNBackward<DType>(param, ctx, data, *data_mem, diff, *diff_mem, flags);
+ auto gradi_mem =
+ CreateMKLDNNMem(const_cast<NDArray&>(gradIn), bwd.pd.diff_src_desc(), req[batchnorm::kData]);
if (static_cast<int>(flags) & static_cast<int>(mkldnn::normalization_flags::use_scale_shift)) {
- const NDArray &gamma = in_data[batchnorm::kGamma];
- const NDArray &beta = in_data[batchnorm::kBeta];
- DType *weight_buf = reinterpret_cast<DType *>(bwd.GetWeight().get_data_handle());
- nnvm::dim_t channels_ = data.shape()[1];
- DType *weight_ptr = gamma.data().dptr<DType>();
- DType* bias_ptr = beta.data().dptr<DType>();
+ const NDArray& gamma = in_data[batchnorm::kGamma];
+ const NDArray& beta = in_data[batchnorm::kBeta];
+ DType* weight_buf = reinterpret_cast<DType*>(bwd.GetWeight().get_data_handle());
+ nnvm::dim_t channels_ = data.shape()[1];
+ DType* weight_ptr = gamma.data().dptr<DType>();
+ DType* bias_ptr = beta.data().dptr<DType>();
const size_t copy_size = sizeof(DType) * channels_;
if (!param.fix_gamma) {
memcpy(weight_buf, weight_ptr, copy_size);
@@ -398,15 +404,15 @@ void MKLDNNBatchNormBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
memcpy(&weight_buf[channels_], bias_ptr, copy_size);
}
mkldnn_args_map_t net_args;
- net_args[MKLDNN_ARG_SRC] = *data_mem;
- net_args[MKLDNN_ARG_DIFF_SRC] = *gradi_mem.second;
- net_args[MKLDNN_ARG_SCALE_SHIFT] = bwd.GetWeight();
+ net_args[MKLDNN_ARG_SRC] = *data_mem;
+ net_args[MKLDNN_ARG_DIFF_SRC] = *gradi_mem.second;
+ net_args[MKLDNN_ARG_SCALE_SHIFT] = bwd.GetWeight();
net_args[MKLDNN_ARG_DIFF_SCALE_SHIFT] = bwd.GetGradw();
- net_args[MKLDNN_ARG_DIFF_DST] = *diff_mem;
+ net_args[MKLDNN_ARG_DIFF_DST] = *diff_mem;
if (fuse_relu) {
- const NDArray *workspace = nullptr;
- workspace = &inputs[8];
+ const NDArray* workspace = nullptr;
+ workspace = &inputs[8];
if (workspace != nullptr) {
net_args[MKLDNN_ARG_WORKSPACE] = *(workspace->GetMKLDNNData());
}
@@ -414,26 +420,24 @@ void MKLDNNBatchNormBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
// training but no input mean and variance
if (ctx.is_train && !param.use_global_stats) {
- DType* moving_mean_ptr = moving_mean.data().dptr<DType>();
- DType* moving_var_ptr = moving_var.data().dptr<DType>();
- DType* out_mean_ptr = out_mean.data().dptr<DType>();
- DType* out_var_ptr = out_var.data().dptr<DType>();
+ DType* moving_mean_ptr = moving_mean.data().dptr<DType>();
+ DType* moving_var_ptr = moving_var.data().dptr<DType>();
+ DType* out_mean_ptr = out_mean.data().dptr<DType>();
+ DType* out_var_ptr = out_var.data().dptr<DType>();
mkldnn::memory var_mem(bwd.pd.variance_desc(), CpuEngine::Get()->get_engine());
- DType *tmp_var_ptr = reinterpret_cast<DType *>(var_mem.get_data_handle());
+ DType* tmp_var_ptr = reinterpret_cast<DType*>(var_mem.get_data_handle());
DType minus_mom = (1.0f - param.momentum);
for (int i = 0; i < channels_; i++) {
- moving_mean_ptr[i] = moving_mean_ptr[i] * param.momentum +
- out_mean_ptr[i] * minus_mom;
- float variance = INVSTD_TO_VARIANCE(out_var_ptr[i], param.eps);
- tmp_var_ptr[i] = variance;
- moving_var_ptr[i] = moving_var_ptr[i] * param.momentum +
- variance * minus_mom;
+ moving_mean_ptr[i] = moving_mean_ptr[i] * param.momentum + out_mean_ptr[i] * minus_mom;
+ float variance = INVSTD_TO_VARIANCE(out_var_ptr[i], param.eps);
+ tmp_var_ptr[i] = variance;
+ moving_var_ptr[i] = moving_var_ptr[i] * param.momentum + variance * minus_mom;
}
- net_args[MKLDNN_ARG_MEAN] = *(out_mean.GetMKLDNNData());
+ net_args[MKLDNN_ARG_MEAN] = *(out_mean.GetMKLDNNData());
net_args[MKLDNN_ARG_VARIANCE] = var_mem;
} else {
- net_args[MKLDNN_ARG_MEAN] = *(moving_mean.GetMKLDNNData());
+ net_args[MKLDNN_ARG_MEAN] = *(moving_mean.GetMKLDNNData());
net_args[MKLDNN_ARG_VARIANCE] = *(moving_var.GetMKLDNNData());
}
MKLDNNStream::Get()->RegisterPrimArgs(bwd.GetBwd(), net_args);
@@ -441,9 +445,9 @@ void MKLDNNBatchNormBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
MKLDNNStream::Get()->Submit();
// copy data from gradw_mem to in_grad[1] and in_grad[2]
- DType *gw_buf = reinterpret_cast<DType *>(bwd.GetGradw().get_data_handle());
- DType *w_grad_1 = in_grad[batchnorm::kGamma].data().dptr<DType>();
- DType *w_grad_2 = in_grad[batchnorm::kBeta].data().dptr<DType>();
+ DType* gw_buf = reinterpret_cast<DType*>(bwd.GetGradw().get_data_handle());
+ DType* w_grad_1 = in_grad[batchnorm::kGamma].data().dptr<DType>();
+ DType* w_grad_2 = in_grad[batchnorm::kBeta].data().dptr<DType>();
// the gradient of gamma
if (!param.fix_gamma) {
@@ -467,7 +471,7 @@ void MKLDNNBatchNormBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
if (req[batchnorm::kBeta] != kAddTo) {
memcpy(w_grad_2, &gw_buf[channels_], copy_size);
} else {
- DType *grad_beta = &gw_buf[channels_];
+ DType* grad_beta = &gw_buf[channels_];
for (int i = 0; i < channels_; i++) {
w_grad_2[i] += grad_beta[i];
}
diff --git a/src/operator/nn/mkldnn/mkldnn_concat-inl.h b/src/operator/nn/mkldnn/mkldnn_concat-inl.h
index 66cb851..14f980a 100644
--- a/src/operator/nn/mkldnn/mkldnn_concat-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_concat-inl.h
@@ -21,17 +21,17 @@
* \file mkldnn_concat-inl.h
* \brief
* \author
-*/
+ */
#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_CONCAT_INL_H_
#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_CONCAT_INL_H_
-
#if MXNET_USE_MKLDNN == 1
-#include <vector>
#include <utility>
+#include <vector>
+
#include "../concat-inl.h"
-#include "./mkldnn_ops-inl.h"
#include "./mkldnn_base-inl.h"
+#include "./mkldnn_ops-inl.h"
namespace mxnet {
namespace op {
@@ -40,17 +40,19 @@ class MKLDNNConcatFwd {
public:
mkldnn::concat::primitive_desc fwd_pd;
- MKLDNNConcatFwd(int concat_dim, const std::vector<mkldnn::memory::desc> &data_md);
+ MKLDNNConcatFwd(int concat_dim, const std::vector<mkldnn::memory::desc>& data_md);
- const mkldnn::concat &GetFwd() const { return *fwd_; }
+ const mkldnn::concat& GetFwd() const {
+ return *fwd_;
+ }
private:
std::shared_ptr<mkldnn::concat> fwd_;
};
-static MKLDNNConcatFwd &GetConcatForward(
- int concat_dim, const std::vector<NDArray> &in_data,
- const std::vector<mkldnn::memory::desc> &data_md) {
+static MKLDNNConcatFwd& GetConcatForward(int concat_dim,
+ const std::vector<NDArray>& in_data,
+ const std::vector<mkldnn::memory::desc>& data_md) {
#if DMLC_CXX11_THREAD_LOCAL
static thread_local std::unordered_map<OpSignature, MKLDNNConcatFwd, OpHash> fwds;
#else
diff --git a/src/operator/nn/mkldnn/mkldnn_concat.cc b/src/operator/nn/mkldnn/mkldnn_concat.cc
index 1dd2dc3..689888a 100644
--- a/src/operator/nn/mkldnn/mkldnn_concat.cc
+++ b/src/operator/nn/mkldnn/mkldnn_concat.cc
@@ -21,7 +21,7 @@
* \file mkldnn_concat.cc
* \brief
* \author
-*/
+ */
#if MXNET_USE_MKLDNN == 1
#include "mkldnn_concat-inl.h"
@@ -29,15 +29,16 @@
namespace mxnet {
namespace op {
-static inline bool IsUsingPadding(const mkldnn::memory::desc &dst_md) {
+static inline bool IsUsingPadding(const mkldnn::memory::desc& dst_md) {
// make sure a blocked format is used (at least one dimension is blocked)
- bool is_blocked_format = dst_md.data.format_kind == mkldnn_blocked &&
- dst_md.data.format_desc.blocking.inner_nblks > 0;
- return is_blocked_format && !std::equal(dst_md.data.dims, dst_md.data.dims + dst_md.data.ndims,
- dst_md.data.padded_dims);
+ bool is_blocked_format =
+ dst_md.data.format_kind == mkldnn_blocked && dst_md.data.format_desc.blocking.inner_nblks > 0;
+ return is_blocked_format &&
+ !std::equal(
+ dst_md.data.dims, dst_md.data.dims + dst_md.data.ndims, dst_md.data.padded_dims);
}
-MKLDNNConcatFwd::MKLDNNConcatFwd(int concat_dim, const std::vector<mkldnn::memory::desc> &data_md)
+MKLDNNConcatFwd::MKLDNNConcatFwd(int concat_dim, const std::vector<mkldnn::memory::desc>& data_md)
: fwd_pd(concat_dim, data_md, CpuEngine::Get()->get_engine()) {
// MKL-DNN introduced padded formats since 0.15 which require more memory
// compared to the actual size of the tensor. Currently, MKL-DNN operators
@@ -45,39 +46,39 @@ MKLDNNConcatFwd::MKLDNNConcatFwd(int concat_dim, const std::vector<mkldnn::memor
// format that has the expected memory size requirements (a plain format)
// When fwd_pd uses padding, impose a plain format
- const auto &dst_md = fwd_pd.dst_desc();
+ const auto& dst_md = fwd_pd.dst_desc();
if (IsUsingPadding(dst_md)) {
- auto plain_dst_tag = static_cast<mkldnn::memory::format_tag>(
- GetDefaultFormat(dst_md.data.ndims));
+ auto plain_dst_tag =
+ static_cast<mkldnn::memory::format_tag>(GetDefaultFormat(dst_md.data.ndims));
auto plain_dst_md = mkldnn::memory::desc(dst_md.dims(), dst_md.data_type(), plain_dst_tag);
- fwd_pd = mkldnn::concat::primitive_desc(plain_dst_md, concat_dim, data_md,
- CpuEngine::Get()->get_engine());
+ fwd_pd = mkldnn::concat::primitive_desc(
+ plain_dst_md, concat_dim, data_md, CpuEngine::Get()->get_engine());
}
fwd_ = std::make_shared<mkldnn::concat>(fwd_pd);
}
-void MKLDNNConcatForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
- const std::vector<NDArray> &in_data,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &out_data) {
+void MKLDNNConcatForward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& in_data,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& out_data) {
TmpMemMgr::Get()->Init(ctx.requested[concat_enum::kTempSpace]);
const ConcatParam& param = nnvm::get<ConcatParam>(attrs.parsed);
- const int num_in_data = param.num_args;
- const int concat_dim = param.dim;
+ const int num_in_data = param.num_args;
+ const int concat_dim = param.dim;
std::vector<mkldnn::memory::desc> data_md;
- std::vector<const mkldnn::memory *> data_mem;
+ std::vector<const mkldnn::memory*> data_mem;
data_md.reserve(num_in_data);
data_mem.reserve(num_in_data);
for (int i = 0; i < num_in_data; i++) {
- const mkldnn::memory *tmp_mem = in_data[i].GetMKLDNNData();
- mkldnn::memory::desc tmp_md = tmp_mem->get_desc();
+ const mkldnn::memory* tmp_mem = in_data[i].GetMKLDNNData();
+ mkldnn::memory::desc tmp_md = tmp_mem->get_desc();
data_md.push_back(tmp_md);
data_mem.push_back(tmp_mem);
}
- MKLDNNConcatFwd &fwd = GetConcatForward(concat_dim, in_data, data_md);
- mxnet::mkldnn_output_t out_mem = CreateMKLDNNMem(out_data[concat_enum::kOut],
- fwd.fwd_pd.dst_desc(),
- req[concat_enum::kOut]);
+ MKLDNNConcatFwd& fwd = GetConcatForward(concat_dim, in_data, data_md);
+ mxnet::mkldnn_output_t out_mem =
+ CreateMKLDNNMem(out_data[concat_enum::kOut], fwd.fwd_pd.dst_desc(), req[concat_enum::kOut]);
std::unordered_map<int, mkldnn::memory> net_args;
net_args.insert({MKLDNN_ARG_DST, *out_mem.second});
for (int i = 0; i < num_in_data; i++) {
@@ -88,35 +89,34 @@ void MKLDNNConcatForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
MKLDNNStream::Get()->Submit();
}
-void MKLDNNConcatBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+void MKLDNNConcatBackward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
const std::vector<NDArray>& inputs,
const std::vector<OpReqType>& req,
const std::vector<NDArray>& outputs) {
TmpMemMgr::Get()->Init(ctx.requested[concat_enum::kTempSpace]);
const ConcatParam& param = nnvm::get<ConcatParam>(attrs.parsed);
- const int num_in_data = param.num_args;
- const int axis = param.dim;
- const auto gradz_mem = inputs[0].GetMKLDNNData();
+ const int num_in_data = param.num_args;
+ const int axis = param.dim;
+ const auto gradz_mem = inputs[0].GetMKLDNNData();
/* init the offset */
mkldnn::memory::dims offsets(outputs[0].shape().ndim());
- for (auto &v : offsets) {
+ for (auto& v : offsets) {
v = 0;
}
for (int i = 0; i < num_in_data; i++) {
mkldnn::memory::dims diff_src_tz(outputs[i].shape().begin(), outputs[i].shape().end());
auto diff_src_md = outputs[i].GetMKLDNNData()->get_desc();
- auto gradi_mem = CreateMKLDNNMem(outputs[i], diff_src_md, req[i]);
+ auto gradi_mem = CreateMKLDNNMem(outputs[i], diff_src_md, req[i]);
auto from_md = gradz_mem->get_desc().submemory_desc(diff_src_tz, offsets);
- auto from_mem = new mkldnn::memory(from_md, gradz_mem->get_engine(),
- gradz_mem->get_data_handle());
+ auto from_mem =
+ new mkldnn::memory(from_md, gradz_mem->get_engine(), gradz_mem->get_data_handle());
offsets[axis] += diff_src_tz[axis];
- std::unordered_map<int, mkldnn::memory> net_args({
- {MKLDNN_ARG_FROM, *gradz_mem},
- {MKLDNN_ARG_TO, *gradi_mem.second}
- });
+ std::unordered_map<int, mkldnn::memory> net_args(
+ {{MKLDNN_ARG_FROM, *gradz_mem}, {MKLDNN_ARG_TO, *gradi_mem.second}});
MKLDNNStream::Get()->RegisterPrimArgs(mkldnn::reorder(*from_mem, *gradi_mem.second), net_args);
CommitOutput(outputs[i], gradi_mem);
}
diff --git a/src/operator/nn/mkldnn/mkldnn_convolution-inl.h b/src/operator/nn/mkldnn/mkldnn_convolution-inl.h
index ac2d316..4292677 100644
--- a/src/operator/nn/mkldnn/mkldnn_convolution-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_convolution-inl.h
@@ -20,18 +20,19 @@
/*!
* \file mkldnn_convolution-inl.h
* \brief
-*/
+ */
#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_CONVOLUTION_INL_H_
#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_CONVOLUTION_INL_H_
#if MXNET_USE_MKLDNN == 1
-#include <vector>
#include <utility>
+#include <vector>
+
#include "../convolution-inl.h"
-#include "./mkldnn_ops-inl.h"
#include "./mkldnn_base-inl.h"
+#include "./mkldnn_ops-inl.h"
namespace mxnet {
namespace op {
@@ -47,26 +48,25 @@ struct MKLDNNConvParam : public dmlc::Parameter<MKLDNNConvParam> {
dmlc::optional<float> max_calib_range; // max float value calculated from calibration dataset
DMLC_DECLARE_PARAMETER(MKLDNNConvParam) {
- DMLC_DECLARE_FIELD(with_bn).set_default(false)
- .describe("Add post batchnorm.");
- DMLC_DECLARE_FIELD(with_act).set_default(false)
- .describe("Add post activation");
- DMLC_DECLARE_FIELD(with_sum).set_default(false)
- .describe("Add post sum");
- DMLC_DECLARE_FIELD(with_postsum_act).set_default(false)
- .describe("Add post activation after sum");
- DMLC_DECLARE_FIELD(quantized).set_default(false)
- .describe("enable quantization");
+ DMLC_DECLARE_FIELD(with_bn).set_default(false).describe("Add post batchnorm.");
+ DMLC_DECLARE_FIELD(with_act).set_default(false).describe("Add post activation");
+ DMLC_DECLARE_FIELD(with_sum).set_default(false).describe("Add post sum");
+ DMLC_DECLARE_FIELD(with_postsum_act)
+ .set_default(false)
+ .describe("Add post activation after sum");
+ DMLC_DECLARE_FIELD(quantized).set_default(false).describe("enable quantization");
DMLC_DECLARE_FIELD(min_calib_range)
- .set_default(dmlc::optional<float>())
- .describe("The minimum scalar value in the form of float32 obtained "
- "through calibration. If present, it will be used to by "
- "quantized convolution op to calculate primitive scale");
+ .set_default(dmlc::optional<float>())
+ .describe(
+ "The minimum scalar value in the form of float32 obtained "
+ "through calibration. If present, it will be used to by "
+ "quantized convolution op to calculate primitive scale");
DMLC_DECLARE_FIELD(max_calib_range)
- .set_default(dmlc::optional<float>())
- .describe("The maximum scalar value in the form of float32 obtained "
- "through calibration. If present, it will be used to by "
- "quantized convolution op to calculate primitive scale");
+ .set_default(dmlc::optional<float>())
+ .describe(
+ "The maximum scalar value in the form of float32 obtained "
+ "through calibration. If present, it will be used to by "
+ "quantized convolution op to calculate primitive scale");
}
};
@@ -80,17 +80,29 @@ struct MKLDNNConvFullParam {
};
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> GetConvFwdImpl(
- const ConvolutionParam ¶m, const bool is_train, const NDArray &data, const NDArray &weight,
- const NDArray *bias, const NDArray &output);
+ const ConvolutionParam& param,
+ const bool is_train,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray* bias,
+ const NDArray& output);
class MKLDNNConvForward {
public:
- MKLDNNConvForward(const MKLDNNConvFullParam ¶m, const bool is_train, const NDArray &data,
- const NDArray &weight, const NDArray *bias, const NDArray &output);
-
- const mkldnn::convolution_forward &GetFwd() const { return *fwd_; }
+ MKLDNNConvForward(const MKLDNNConvFullParam& param,
+ const bool is_train,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray* bias,
+ const NDArray& output);
+
+ const mkldnn::convolution_forward& GetFwd() const {
+ return *fwd_;
+ }
- const mkldnn::convolution_forward::primitive_desc &GetPd() const { return *pd_; }
+ const mkldnn::convolution_forward::primitive_desc& GetPd() const {
+ return *pd_;
+ }
private:
std::shared_ptr<mkldnn::convolution_forward> fwd_;
@@ -99,37 +111,47 @@ class MKLDNNConvForward {
typedef ParamOpSign<ConvolutionParam> MKLDNNConvSignature;
-MKLDNNConvForward &GetConvFwd(const MKLDNNConvFullParam ¶m, const bool is_train,
- const NDArray &data, const NDArray &weight, const NDArray *bias,
- const NDArray &output);
-
-void MKLDNNConvolutionForwardFullFeature(const MKLDNNConvFullParam ¶m,
- const OpContext &ctx,
- MKLDNNConvForward *fwd,
- const std::vector<NDArray> &in_data,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &out_data);
-
-void MKLDNNConvolutionForward(const nnvm::NodeAttrs &attrs,
- const OpContext &ctx,
- const std::vector<NDArray> &in_data,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &out_data);
+MKLDNNConvForward& GetConvFwd(const MKLDNNConvFullParam& param,
+ const bool is_train,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray* bias,
+ const NDArray& output);
+
+void MKLDNNConvolutionForwardFullFeature(const MKLDNNConvFullParam& param,
+ const OpContext& ctx,
+ MKLDNNConvForward* fwd,
+ const std::vector<NDArray>& in_data,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& out_data);
+
+void MKLDNNConvolutionForward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& in_data,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& out_data);
class MKLDNNConvBackward {
public:
- MKLDNNConvBackward(const MKLDNNConvFullParam ¶m, const NDArray &data, const NDArray &weight,
- const NDArray *bias, const NDArray &output);
-
- const mkldnn::convolution_backward_data &GetBwdData() const { return *bwd_data_; }
+ MKLDNNConvBackward(const MKLDNNConvFullParam& param,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray* bias,
+ const NDArray& output);
+
+ const mkldnn::convolution_backward_data& GetBwdData() const {
+ return *bwd_data_;
+ }
- const mkldnn::convolution_backward_weights &GetBwdWeights() const { return *bwd_weight_; }
+ const mkldnn::convolution_backward_weights& GetBwdWeights() const {
+ return *bwd_weight_;
+ }
- const mkldnn::convolution_backward_data::primitive_desc &GetDataPd() const {
+ const mkldnn::convolution_backward_data::primitive_desc& GetDataPd() const {
return *bwd_data_pd_;
}
- const mkldnn::convolution_backward_weights::primitive_desc &GetWeightsPd() const {
+ const mkldnn::convolution_backward_weights::primitive_desc& GetWeightsPd() const {
return *bwd_weight_pd_;
}
diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc
index b042bd2..966ba21 100644
--- a/src/operator/nn/mkldnn/mkldnn_convolution.cc
+++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc
@@ -26,41 +26,38 @@
#if MXNET_USE_MKLDNN == 1
#include "../convolution-inl.h"
-#include "./mkldnn_ops-inl.h"
#include "./mkldnn_base-inl.h"
#include "./mkldnn_convolution-inl.h"
+#include "./mkldnn_ops-inl.h"
namespace mxnet {
namespace op {
DMLC_REGISTER_PARAMETER(MKLDNNConvParam);
-bool SupportMKLDNNConv(const ConvolutionParam& params, const NDArray &input) {
- if ((params.kernel.ndim() != 1) &&
- (params.kernel.ndim() != 2) &&
- (params.kernel.ndim() != 3))
+bool SupportMKLDNNConv(const ConvolutionParam& params, const NDArray& input) {
+ if ((params.kernel.ndim() != 1) && (params.kernel.ndim() != 2) && (params.kernel.ndim() != 3))
return false;
return SupportMKLDNNQuantize(input.dtype()) &&
- ((input.shape().ndim() == 3) ||
- (input.shape().ndim() == 4) ||
+ ((input.shape().ndim() == 3) || (input.shape().ndim() == 4) ||
(input.shape().ndim() == 5));
}
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> GetConvFwdImpl(
- const MKLDNNConvFullParam ¶m,
- const bool is_train,
- const NDArray &data,
- const NDArray &weights,
- const NDArray *bias,
- const NDArray &output) {
+ const MKLDNNConvFullParam& param,
+ const bool is_train,
+ const NDArray& data,
+ const NDArray& weights,
+ const NDArray* bias,
+ const NDArray& output) {
auto prop = is_train ? mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring;
- auto data_md = GetMemDesc(data);
+ auto data_md = GetMemDesc(data);
auto weight_md = GetWeightDesc(weights, param.conv_param.num_group, param.mkldnn_param.quantized);
- auto out_md = GetMemDesc(output);
+ auto out_md = GetMemDesc(output);
auto bias_md =
bias ? (param.mkldnn_param.quantized ? GetMemDesc(*bias, mshadow::kInt32) : GetMemDesc(*bias))
: mkldnn::memory::desc{
- {}, mkldnn::memory::data_type::undef, mkldnn::memory::format_tag::any};
+ {}, mkldnn::memory::data_type::undef, mkldnn::memory::format_tag::any};
auto bias_md_ptr = bias ? &bias_md : nullptr;
mkldnn::memory::dims strides(param.conv_param.kernel.ndim());
@@ -90,20 +87,20 @@ std::shared_ptr<mkldnn::convolution_forward::primitive_desc> GetConvFwdImpl(
padding[1] = param.conv_param.pad[1];
padding[2] = param.conv_param.pad[2];
} else {
- LOG(FATAL) << "Unexpected MKL-DNN Conv kernel size "
- << param.conv_param.kernel.ndim() << ", supporting only 1 or 2 or 3.";
+ LOG(FATAL) << "Unexpected MKL-DNN Conv kernel size " << param.conv_param.kernel.ndim()
+ << ", supporting only 1 or 2 or 3.";
}
mkldnn::primitive_attr attr;
mkldnn::post_ops ops;
if (param.mkldnn_param.with_act) {
- const auto &act_param = param.act_param;
+ const auto& act_param = param.act_param;
ops.append_eltwise(act_param.scale, act_param.alg, act_param.alpha, act_param.beta);
}
if (param.mkldnn_param.with_sum) {
ops.append_sum(param.sum_scale);
}
if (param.mkldnn_param.with_postsum_act) {
- const auto &act_param = param.postsum_act_param;
+ const auto& act_param = param.postsum_act_param;
ops.append_eltwise(act_param.scale, act_param.alg, act_param.alpha, act_param.beta);
}
attr.set_post_ops(ops);
@@ -112,42 +109,56 @@ std::shared_ptr<mkldnn::convolution_forward::primitive_desc> GetConvFwdImpl(
int mask = (param.requantize_scales.size() > 1) ? 2 : 0;
attr.set_output_scales(mask, param.requantize_scales);
}
- auto GetConvFwdPd = [¶m, &data, &weights, &output,
- &attr](const mkldnn::convolution_forward::desc &desc) {
- auto engine = CpuEngine::Get()->get_engine();
- try {
- // MKL-DNN introduced padded formats since 0.15 which require more memory
- // compared to the actual size of the tensor. Currently, MKL-DNN operators
- // still reuse memory from memory planning, so here we need to select a
- // suboptimal kernel for computation that has the expected memory size requirements
- auto conv_pd =
- std::make_shared<mkldnn::convolution_forward::primitive_desc>(desc, attr, engine);
- while (conv_pd->dst_desc().get_size() != GetArraySize(output) ||
- conv_pd->src_desc().get_size() != GetArraySize(data) ||
- (!param.mkldnn_param.quantized &&
- conv_pd->weights_desc().get_size() != GetArraySize(weights))) {
- // next_impl() will visit desc and engine, please make sure they are still alive here.
- CHECK(conv_pd->next_impl()) << "No convolution implementation for this request.";
- }
- return conv_pd;
- } catch (mkldnn::error &e) {
- if (e.status == mkldnn_unimplemented && param.mkldnn_param.quantized) {
- LOG(ERROR) << "AVX512-BW support or Intel(R) MKL dependency is "
- "required for int8 convolution";
- } else {
- LOG(ERROR) << e.message;
- }
- throw;
- }
- };
+ auto GetConvFwdPd =
+ [¶m, &data, &weights, &output, &attr](const mkldnn::convolution_forward::desc& desc) {
+ auto engine = CpuEngine::Get()->get_engine();
+ try {
+ // MKL-DNN introduced padded formats since 0.15 which require more memory
+ // compared to the actual size of the tensor. Currently, MKL-DNN operators
+ // still reuse memory from memory planning, so here we need to select a
+ // suboptimal kernel for computation that has the expected memory size
+ // requirements
+ auto conv_pd =
+ std::make_shared<mkldnn::convolution_forward::primitive_desc>(desc, attr, engine);
+ while (conv_pd->dst_desc().get_size() != GetArraySize(output) ||
+ conv_pd->src_desc().get_size() != GetArraySize(data) ||
+ (!param.mkldnn_param.quantized &&
+ conv_pd->weights_desc().get_size() != GetArraySize(weights))) {
+ // next_impl() will visit desc and engine, please make sure they are
+ // still alive here.
+ CHECK(conv_pd->next_impl()) << "No convolution implementation for this request.";
+ }
+ return conv_pd;
+ } catch (mkldnn::error& e) {
+ if (e.status == mkldnn_unimplemented && param.mkldnn_param.quantized) {
+ LOG(ERROR) << "AVX512-BW support or Intel(R) MKL dependency is "
+ "required for int8 convolution";
+ } else {
+ LOG(ERROR) << e.message;
+ }
+ throw;
+ }
+ };
if (param.conv_param.dilate.ndim() == 0 && bias_md_ptr == nullptr) {
- mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, data_md,
- weight_md, out_md, strides, padding, padding);
+ mkldnn::convolution_forward::desc desc(prop,
+ mkldnn::algorithm::convolution_direct,
+ data_md,
+ weight_md,
+ out_md,
+ strides,
+ padding,
+ padding);
return GetConvFwdPd(desc);
} else if (param.conv_param.dilate.ndim() == 0) {
- mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, data_md,
- weight_md, *bias_md_ptr, out_md, strides, padding,
+ mkldnn::convolution_forward::desc desc(prop,
+ mkldnn::algorithm::convolution_direct,
+ data_md,
+ weight_md,
+ *bias_md_ptr,
+ out_md,
+ strides,
+ padding,
padding);
return GetConvFwdPd(desc);
} else {
@@ -166,25 +177,42 @@ std::shared_ptr<mkldnn::convolution_forward::primitive_desc> GetConvFwdImpl(
<< ", supporting only 1 or 2 or 3.";
}
if (bias_md_ptr == nullptr) {
- mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, data_md,
- weight_md, out_md, strides, dilates, padding, padding);
+ mkldnn::convolution_forward::desc desc(prop,
+ mkldnn::algorithm::convolution_direct,
+ data_md,
+ weight_md,
+ out_md,
+ strides,
+ dilates,
+ padding,
+ padding);
return GetConvFwdPd(desc);
} else {
- mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, data_md,
- weight_md, *bias_md_ptr, out_md, strides, dilates,
- padding, padding);
+ mkldnn::convolution_forward::desc desc(prop,
+ mkldnn::algorithm::convolution_direct,
+ data_md,
+ weight_md,
+ *bias_md_ptr,
+ out_md,
+ strides,
+ dilates,
+ padding,
+ padding);
return GetConvFwdPd(desc);
}
}
}
static std::shared_ptr<mkldnn::convolution_backward_data::primitive_desc> GetConvBwdData(
- const ConvolutionParam ¶m, const NDArray &data, const NDArray &weight,
- const NDArray &output, const mkldnn::convolution_forward::primitive_desc &fwd_pd) {
- auto data_md = GetMemDesc(data);
+ const ConvolutionParam& param,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray& output,
+ const mkldnn::convolution_forward::primitive_desc& fwd_pd) {
+ auto data_md = GetMemDesc(data);
auto weight_md = GetWeightDesc(weight, param.num_group);
- auto out_md = GetMemDesc(output);
- auto engine = CpuEngine::Get()->get_engine();
+ auto out_md = GetMemDesc(output);
+ auto engine = CpuEngine::Get()->get_engine();
mkldnn::memory::dims strides(param.kernel.ndim());
mkldnn::memory::dims padding(param.kernel.ndim());
if (param.kernel.ndim() == 1) {
@@ -216,32 +244,39 @@ static std::shared_ptr<mkldnn::convolution_backward_data::primitive_desc> GetCon
<< ", supporting only 1 or 2 or 3.";
}
- auto GetConvBwdDataPd = [&data, &weight, &output,
- &fwd_pd](const mkldnn::convolution_backward_data::desc &desc) {
+ auto GetConvBwdDataPd = [&data, &weight, &output, &fwd_pd](
+ const mkldnn::convolution_backward_data::desc& desc) {
auto engine = CpuEngine::Get()->get_engine();
try {
// MKL-DNN introduced padded formats since 0.15 which require more memory
// compared to the actual size of the tensor. Currently, MKL-DNN operators
// still reuse memory from memory planning, so here we need to select a
- // suboptimal kernel for computation that has the expected memory size requirements
+ // suboptimal kernel for computation that has the expected memory size
+ // requirements
auto conv_pd =
std::make_shared<mkldnn::convolution_backward_data::primitive_desc>(desc, engine, fwd_pd);
while (conv_pd->diff_dst_desc().get_size() != GetArraySize(output) ||
conv_pd->diff_src_desc().get_size() != GetArraySize(data) ||
conv_pd->weights_desc().get_size() != GetArraySize(weight)) {
- // next_impl() will visit desc and engine, please make sure they are still alive here.
+ // next_impl() will visit desc and engine, please make sure they are
+ // still alive here.
CHECK(conv_pd->next_impl()) << "No convolution backward implementation for this request.";
}
return conv_pd;
- } catch (mkldnn::error &e) {
+ } catch (mkldnn::error& e) {
LOG(ERROR) << e.message;
throw;
}
};
if (param.dilate.ndim() == 0) {
- mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct, data_md,
- weight_md, out_md, strides, padding, padding);
+ mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct,
+ data_md,
+ weight_md,
+ out_md,
+ strides,
+ padding,
+ padding);
return GetConvBwdDataPd(desc);
} else {
mkldnn::memory::dims dilates(param.kernel.ndim());
@@ -255,23 +290,32 @@ static std::shared_ptr<mkldnn::convolution_backward_data::primitive_desc> GetCon
dilates[1] = param.dilate[1] - 1;
dilates[2] = param.dilate[2] - 1;
} else {
- LOG(FATAL) << "Unexpected MKL-DNN Conv dilate size "
- << param.dilate.ndim() << ", supporting only 1 or 2 or 3.";
+ LOG(FATAL) << "Unexpected MKL-DNN Conv dilate size " << param.dilate.ndim()
+ << ", supporting only 1 or 2 or 3.";
}
- mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct, data_md,
- weight_md, out_md, strides, dilates, padding,
+ mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct,
+ data_md,
+ weight_md,
+ out_md,
+ strides,
+ dilates,
+ padding,
padding);
return GetConvBwdDataPd(desc);
}
}
static std::shared_ptr<mkldnn::convolution_backward_weights::primitive_desc> GetConvBwdWeights(
- const ConvolutionParam ¶m, const NDArray &data, const NDArray &weight, const NDArray *bias,
- const NDArray &output, const mkldnn::convolution_forward::primitive_desc &fwd_pd) {
- auto data_md = GetMemDesc(data);
+ const ConvolutionParam& param,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray* bias,
+ const NDArray& output,
+ const mkldnn::convolution_forward::primitive_desc& fwd_pd) {
+ auto data_md = GetMemDesc(data);
auto weight_md = GetWeightDesc(weight, param.num_group);
- auto out_md = GetMemDesc(output);
- auto engine = CpuEngine::Get()->get_engine();
+ auto out_md = GetMemDesc(output);
+ auto engine = CpuEngine::Get()->get_engine();
mkldnn::memory::dims strides(param.kernel.ndim());
mkldnn::memory::dims padding(param.kernel.ndim());
if (param.kernel.ndim() == 1) {
@@ -303,37 +347,49 @@ static std::shared_ptr<mkldnn::convolution_backward_weights::primitive_desc> Get
<< ", supporting only 1 or 2 or 3.";
}
- auto GetConvBwdWeightsPd = [&data, &weight, &output,
- &fwd_pd](const mkldnn::convolution_backward_weights::desc &desc) {
+ auto GetConvBwdWeightsPd = [&data, &weight, &output, &fwd_pd](
+ const mkldnn::convolution_backward_weights::desc& desc) {
auto engine = CpuEngine::Get()->get_engine();
try {
- // MKL-DNN introduced padded formats since 0.15 which require more memory
- // compared to the actual size of the tensor. Currently, MKL-DNN operators
- // still reuse memory from memory planning, so here we need to select a
- // suboptimal kernel for computation that has the expected memory size requirements
+ // MKL-DNN introduced padded formats since 0.15 which require more
+ // memory compared to the actual size of the tensor. Currently,
+ // MKL-DNN operators still reuse memory from memory planning, so here
+ // we need to select a suboptimal kernel for computation that has the
+ // expected memory size requirements
auto conv_pd = std::make_shared<mkldnn::convolution_backward_weights::primitive_desc>(
desc, engine, fwd_pd);
while (conv_pd->diff_dst_desc().get_size() != GetArraySize(output) ||
conv_pd->src_desc().get_size() != GetArraySize(data) ||
conv_pd->diff_weights_desc().get_size() != GetArraySize(weight)) {
- // next_impl() will visit desc and engine, please make sure they are still alive here.
+ // next_impl() will visit desc and engine, please make sure they are
+ // still alive here.
CHECK(conv_pd->next_impl()) << "No convolution backward implementation for this request.";
}
return conv_pd;
- } catch (mkldnn::error &e) {
+ } catch (mkldnn::error& e) {
LOG(ERROR) << e.message;
throw;
}
};
if (param.dilate.ndim() == 0 && bias == nullptr) {
- mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, data_md,
- weight_md, out_md, strides, padding, padding);
+ mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
+ data_md,
+ weight_md,
+ out_md,
+ strides,
+ padding,
+ padding);
return GetConvBwdWeightsPd(desc);
} else if (param.dilate.ndim() == 0) {
auto bias_md = GetMemDesc(*bias);
- mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, data_md,
- weight_md, bias_md, out_md, strides, padding,
+ mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
+ data_md,
+ weight_md,
+ bias_md,
+ out_md,
+ strides,
+ padding,
padding);
return GetConvBwdWeightsPd(desc);
} else {
@@ -348,85 +404,106 @@ static std::shared_ptr<mkldnn::convolution_backward_weights::primitive_desc> Get
dilates[1] = param.dilate[1] - 1;
dilates[2] = param.dilate[2] - 1;
} else {
- LOG(FATAL) << "Unexpected MKL-DNN Conv dilate size "
- << param.dilate.ndim() << ", supporting only 1 or 2 or 3.";
+ LOG(FATAL) << "Unexpected MKL-DNN Conv dilate size " << param.dilate.ndim()
+ << ", supporting only 1 or 2 or 3.";
}
if (bias == nullptr) {
mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
- data_md, weight_md, out_md, strides, dilates,
- padding, padding);
+ data_md,
+ weight_md,
+ out_md,
+ strides,
+ dilates,
+ padding,
+ padding);
return GetConvBwdWeightsPd(desc);
} else {
auto bias_md = GetMemDesc(*bias);
mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
- data_md, weight_md, bias_md, out_md, strides,
- dilates, padding, padding);
+ data_md,
+ weight_md,
+ bias_md,
+ out_md,
+ strides,
+ dilates,
+ padding,
+ padding);
return GetConvBwdWeightsPd(desc);
}
}
}
-MKLDNNConvForward::MKLDNNConvForward(const MKLDNNConvFullParam ¶m, const bool is_train,
- const NDArray &data, const NDArray &weight,
- const NDArray *bias, const NDArray &output)
+MKLDNNConvForward::MKLDNNConvForward(const MKLDNNConvFullParam& param,
+ const bool is_train,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray* bias,
+ const NDArray& output)
: pd_(GetConvFwdImpl(param, is_train, data, weight, bias, output)) {
fwd_ = std::make_shared<mkldnn::convolution_forward>(GetPd());
}
-MKLDNNConvForward &GetConvFwd(const MKLDNNConvFullParam ¶m, const bool is_train,
- const NDArray &data, const NDArray &weight, const NDArray *bias,
- const NDArray &output) {
+MKLDNNConvForward& GetConvFwd(const MKLDNNConvFullParam& param,
+ const bool is_train,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray* bias,
+ const NDArray& output) {
using conv_fwd_map = std::unordered_map<MKLDNNConvSignature, MKLDNNConvForward, OpHash>;
#if DMLC_CXX11_THREAD_LOCAL
static thread_local conv_fwd_map fwds;
#else
static MX_THREAD_LOCAL conv_fwd_map fwds;
#endif
- // TODO(zhennan): Hash conv_param for now, need to hash full param if we want to enable cache for
- // fused conv
+ // TODO(zhennan): Hash conv_param for now, need to hash full param if we want
+ // to enable cache for fused conv
MKLDNNConvSignature key(param.conv_param);
key.AddSign(is_train);
- // Here we can sign the conv op with NDArray because conv primitive will decide the right layout
- // for the, so we only need to get the shape and the data type of the arrays.
+ // Here we can sign the conv op with NDArray because conv primitive will
+ // decide the right layout for the, so we only need to get the shape and the
+ // data type of the arrays.
key.AddSign(data);
key.AddSign(weight);
key.AddSign(output);
- if (bias) key.AddSign(*bias);
+ if (bias)
+ key.AddSign(*bias);
auto it = fwds.find(key);
if (it == fwds.end()) {
auto fwd = MKLDNNConvForward(param, is_train, data, weight, bias, output);
- it = AddToCache(&fwds, key, fwd);
+ it = AddToCache(&fwds, key, fwd);
}
return it->second;
}
-void MKLDNNConvolutionForwardFullFeature(const MKLDNNConvFullParam ¶m, const OpContext &ctx,
- MKLDNNConvForward *fwd,
- const std::vector<NDArray> &in_data,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &out_data) {
+void MKLDNNConvolutionForwardFullFeature(const MKLDNNConvFullParam& param,
+ const OpContext& ctx,
+ MKLDNNConvForward* fwd,
+ const std::vector<NDArray>& in_data,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& out_data) {
TmpMemMgr::Get()->Init(ctx.requested[conv::kTempSpace]);
- auto &data = in_data[conv::kData];
- auto &weight = in_data[conv::kWeight];
+ auto& data = in_data[conv::kData];
+ auto& weight = in_data[conv::kWeight];
bool no_bias = param.conv_param.no_bias && !param.mkldnn_param.with_bn;
auto data_mem = data.GetMKLDNNDataReorder(fwd->GetPd().src_desc());
- const mkldnn::memory *weight_mem;
+ const mkldnn::memory* weight_mem;
if (ctx.is_train) {
- // TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it to the default format
- // for now.
+ // TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it
+ // to the default format for now.
if (weight.IsMKLDNNData())
- // This asks the engine to change the layout of the weight array after it's used.
+ // This asks the engine to change the layout of the weight array after
+ // it's used.
weight.Reorder2DefaultAsync();
weight_mem = GetWeights(weight, fwd->GetPd().weights_desc(), param.conv_param.num_group);
} else {
- // For inference, we want to reorder the weight array so we don't need to reorder data every
- // time.
+ // For inference, we want to reorder the weight array so we don't need to
+ // reorder data every time.
if (weight.IsDefaultData()) {
- // We also need to modify the layout on the original weight array. The data conversion happens
- // after the weight array is used.
+ // We also need to modify the layout on the original weight array. The
+ // data conversion happens after the weight array is used.
weight.MKLDNNDataReorderAsync(fwd->GetPd().weights_desc());
weight_mem = GetWeights(weight, fwd->GetPd().weights_desc(), param.conv_param.num_group);
} else {
@@ -436,14 +513,14 @@ void MKLDNNConvolutionForwardFullFeature(const MKLDNNConvFullParam &param, const
mkldnn_output_t out_mem;
if (param.mkldnn_param.with_sum) {
out_mem = mkldnn_output_t(OutDataOp::Noop,
- const_cast<mkldnn::memory *>(out_data[conv::kOut].GetMKLDNNData()));
+ const_cast<mkldnn::memory*>(out_data[conv::kOut].GetMKLDNNData()));
} else {
out_mem = CreateMKLDNNMem(out_data[conv::kOut], fwd->GetPd().dst_desc(), req[conv::kOut]);
}
mkldnn_args_map_t net_args;
if (!no_bias) {
- const mkldnn::memory *bias_mem = in_data[conv::kBias].GetMKLDNNData();
+ const mkldnn::memory* bias_mem = in_data[conv::kBias].GetMKLDNNData();
net_args.insert({MKLDNN_ARG_BIAS, *bias_mem});
}
@@ -455,80 +532,91 @@ void MKLDNNConvolutionForwardFullFeature(const MKLDNNConvFullParam &param, const
MKLDNNStream::Get()->Submit();
}
-void MKLDNNConvolutionForward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
- const std::vector<NDArray> &in_data,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &out_data) {
+void MKLDNNConvolutionForward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& in_data,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& out_data) {
MKLDNNConvFullParam param;
param.conv_param = nnvm::get<ConvolutionParam>(attrs.parsed);
param.mkldnn_param.Init(std::unordered_map<std::string, std::string>());
- auto &fwd =
- GetConvFwd(param, ctx.is_train, in_data[conv::kData], in_data[conv::kWeight],
- param.conv_param.no_bias ? nullptr : &in_data[conv::kBias], out_data[conv::kOut]);
+ auto& fwd = GetConvFwd(param,
+ ctx.is_train,
+ in_data[conv::kData],
+ in_data[conv::kWeight],
+ param.conv_param.no_bias ? nullptr : &in_data[conv::kBias],
+ out_data[conv::kOut]);
MKLDNNConvolutionForwardFullFeature(param, ctx, &fwd, in_data, req, out_data);
}
-MKLDNNConvBackward::MKLDNNConvBackward(const MKLDNNConvFullParam &param, const NDArray &data,
- const NDArray &weight, const NDArray *bias,
- const NDArray &output) {
+MKLDNNConvBackward::MKLDNNConvBackward(const MKLDNNConvFullParam& param,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray* bias,
+ const NDArray& output) {
const auto fwd_pd = GetConvFwdImpl(param, true, data, weight, bias, output);
- bwd_data_pd_ = GetConvBwdData(param.conv_param, data, weight, output, *fwd_pd);
- bwd_weight_pd_ = GetConvBwdWeights(param.conv_param, data, weight, bias, output, *fwd_pd);
- bwd_data_ = std::make_shared<mkldnn::convolution_backward_data>(GetDataPd());
- bwd_weight_ = std::make_shared<mkldnn::convolution_backward_weights>(GetWeightsPd());
+ bwd_data_pd_ = GetConvBwdData(param.conv_param, data, weight, output, *fwd_pd);
+ bwd_weight_pd_ = GetConvBwdWeights(param.conv_param, data, weight, bias, output, *fwd_pd);
+ bwd_data_ = std::make_shared<mkldnn::convolution_backward_data>(GetDataPd());
+ bwd_weight_ = std::make_shared<mkldnn::convolution_backward_weights>(GetWeightsPd());
}
-static inline MKLDNNConvBackward &GetConvBwd(const MKLDNNConvFullParam &param, const NDArray &data,
- const NDArray &weight, const NDArray *bias,
- const NDArray &output) {
+static inline MKLDNNConvBackward& GetConvBwd(const MKLDNNConvFullParam& param,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray* bias,
+ const NDArray& output) {
using mkldnn_conv_bwd_map = std::unordered_map<MKLDNNConvSignature, MKLDNNConvBackward, OpHash>;
#if DMLC_CXX11_THREAD_LOCAL
static thread_local mkldnn_conv_bwd_map bwds;
#else
static MX_THREAD_LOCAL mkldnn_conv_bwd_map bwds;
#endif
- // TODO(zhennan): Hash conv_param for now, need to hash full param if we want to enable cache for
- // fused conv
+ // TODO(zhennan): Hash conv_param for now, need to hash full param if we want
+ // to enable cache for fused conv
MKLDNNConvSignature key(param.conv_param);
- // Here we can sign the conv op with NDArray because conv primitive will decide the right layout
- // for the, so we only need to get the shape and the data type of the arrays.
+ // Here we can sign the conv op with NDArray because the conv primitive will
+ // decide the right layout for them, so we only need to get the shape and the
+ // data type of the arrays.
key.AddSign(data);
key.AddSign(weight);
key.AddSign(output);
- if (bias) key.AddSign(*bias);
+ if (bias)
+ key.AddSign(*bias);
auto it = bwds.find(key);
if (it == bwds.end()) {
auto bwd = MKLDNNConvBackward(param, data, weight, bias, output);
- it = AddToCache(&bwds, key, bwd);
+ it = AddToCache(&bwds, key, bwd);
}
return it->second;
}
-void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
const std::vector<NDArray>& inputs,
const std::vector<OpReqType>& req,
const std::vector<NDArray>& outputs) {
TmpMemMgr::Get()->Init(ctx.requested[conv::kTempSpace]);
- const std::vector<NDArray> &in_grad = outputs;
+ const std::vector<NDArray>& in_grad = outputs;
MKLDNNConvFullParam full_param;
full_param.conv_param = nnvm::get<ConvolutionParam>(attrs.parsed);
full_param.mkldnn_param.Init(std::unordered_map<std::string, std::string>());
- auto &data = inputs[conv::kData + 1];
- auto &weight = inputs[conv::kWeight + 1];
- const auto *bias = full_param.conv_param.no_bias ? nullptr : &inputs[conv::kBias + 1];
- auto &out_grad = inputs[conv::kOut];
+ auto& data = inputs[conv::kData + 1];
+ auto& weight = inputs[conv::kWeight + 1];
+ const auto* bias = full_param.conv_param.no_bias ? nullptr : &inputs[conv::kBias + 1];
+ auto& out_grad = inputs[conv::kOut];
- const ConvolutionParam &param = full_param.conv_param;
+ const ConvolutionParam& param = full_param.conv_param;
CHECK_NE(req[conv::kWeight], kWriteInplace) << "cannot write weight inplace";
- MKLDNNConvBackward &convBwd = GetConvBwd(full_param, data, weight, bias, out_grad);
- auto out_grad_mem = out_grad.GetMKLDNNDataReorder(convBwd.GetDataPd().diff_dst_desc());
+ MKLDNNConvBackward& convBwd = GetConvBwd(full_param, data, weight, bias, out_grad);
+ auto out_grad_mem = out_grad.GetMKLDNNDataReorder(convBwd.GetDataPd().diff_dst_desc());
if (req[conv::kData]) {
- auto weight_mem = GetWeights(weight, convBwd.GetDataPd().weights_desc(), param.num_group);
- auto in_grad_mem = CreateMKLDNNMem(in_grad[conv::kData], convBwd.GetDataPd().diff_src_desc(),
- req[conv::kData]);
+ auto weight_mem = GetWeights(weight, convBwd.GetDataPd().weights_desc(), param.num_group);
+ auto in_grad_mem = CreateMKLDNNMem(
+ in_grad[conv::kData], convBwd.GetDataPd().diff_src_desc(), req[conv::kData]);
MKLDNNStream::Get()->RegisterPrimArgs(convBwd.GetBwdData(),
{{MKLDNN_ARG_DIFF_DST, *out_grad_mem},
{MKLDNN_ARG_WEIGHTS, *weight_mem},
@@ -538,7 +626,7 @@ void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ct
if (req[conv::kWeight] || req[conv::kBias]) {
if (convBwd.GetDataPd().diff_dst_desc() != convBwd.GetWeightsPd().diff_dst_desc())
out_grad_mem = out_grad.GetMKLDNNDataReorder(convBwd.GetWeightsPd().diff_dst_desc());
- auto data_mem = data.GetMKLDNNDataReorder(convBwd.GetWeightsPd().src_desc());
+ auto data_mem = data.GetMKLDNNDataReorder(convBwd.GetWeightsPd().src_desc());
auto in_grad_weight = CreateMKLDNNWeightGrad(
in_grad[conv::kWeight], convBwd.GetWeightsPd().diff_weights_desc(), req[conv::kWeight]);
@@ -547,9 +635,8 @@ void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ct
{MKLDNN_ARG_DIFF_WEIGHTS, *in_grad_weight.second}};
mkldnn_output_t in_grad_bias;
if (!param.no_bias) {
- in_grad_bias = CreateMKLDNNMem(in_grad[conv::kBias],
- convBwd.GetWeightsPd().diff_bias_desc(),
- req[conv::kBias]);
+ in_grad_bias = CreateMKLDNNMem(
+ in_grad[conv::kBias], convBwd.GetWeightsPd().diff_bias_desc(), req[conv::kBias]);
net_args.insert({MKLDNN_ARG_DIFF_BIAS, *in_grad_bias.second});
}
MKLDNNStream::Get()->RegisterPrimArgs(convBwd.GetBwdWeights(), net_args);
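
GetConvFwd and GetConvBwd above share one pattern: a signature built from the parameters, shapes, and dtypes keys a thread-local map of already-constructed primitives, so each thread builds a primitive once and reuses it for identical inputs. A condensed sketch of that pattern follows; ToySignature, ToySignatureHash, ToyPrimitive and GetCachedPrimitive are illustrative stand-ins, not the actual MKLDNNConvSignature/OpHash types.

#include <cstddef>
#include <cstdint>
#include <functional>
#include <unordered_map>
#include <vector>

struct ToySignature {
  std::vector<int64_t> values;  // stand-in for the AddSign(shape/dtype/flag) calls
  void AddSign(int64_t v) { values.push_back(v); }
  bool operator==(const ToySignature& other) const { return values == other.values; }
};

struct ToySignatureHash {
  std::size_t operator()(const ToySignature& sig) const {
    std::size_t hash = 0;
    for (int64_t v : sig.values)
      hash = hash * 31 + std::hash<int64_t>()(v);  // simple combine, enough for a sketch
    return hash;
  }
};

struct ToyPrimitive {};  // would wrap a oneDNN primitive in the real code

ToyPrimitive& GetCachedPrimitive(const ToySignature& key) {
  static thread_local std::unordered_map<ToySignature, ToyPrimitive, ToySignatureHash> cache;
  auto it = cache.find(key);
  if (it == cache.end())
    it = cache.emplace(key, ToyPrimitive{}).first;  // construct once, reuse on later calls
  return it->second;
}
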
diff --git a/src/operator/nn/mkldnn/mkldnn_copy.cc b/src/operator/nn/mkldnn/mkldnn_copy.cc
index a67847f..601df3c 100644
--- a/src/operator/nn/mkldnn/mkldnn_copy.cc
+++ b/src/operator/nn/mkldnn/mkldnn_copy.cc
@@ -21,19 +21,22 @@
* \file mkldnn_copy.cc
* \brief
* \author
-*/
+ */
-#include "./mkldnn_ops-inl.h"
#include "./mkldnn_base-inl.h"
+#include "./mkldnn_ops-inl.h"
#if MXNET_USE_MKLDNN == 1
namespace mxnet {
namespace op {
-void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
- const NDArray &in_data, const OpReqType &req,
- const NDArray &out_data) {
- if (req == kNullOp || req == kWriteInplace) return;
+void MKLDNNCopy(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const NDArray& in_data,
+ const OpReqType& req,
+ const NDArray& out_data) {
+ if (req == kNullOp || req == kWriteInplace)
+ return;
TmpMemMgr::Get()->Init(ctx.requested[0]);
auto in_mem = in_data.GetMKLDNNData();
if (req == kAddTo) {
@@ -41,16 +44,16 @@ void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
// We should try to force the input memory to have the same format
// as the output. If not, we'll have to reorder memory.
auto out_mem = out_data.GetMKLDNNData();
- in_mem = in_data.GetMKLDNNData(out_mem ->get_desc());
+ in_mem = in_data.GetMKLDNNData(out_mem->get_desc());
if (in_mem == nullptr)
in_mem = in_data.GetMKLDNNDataReorder(out_mem->get_desc());
MKLDNNSum(*out_mem, *in_mem, *out_mem);
} else {
- const_cast<NDArray &>(out_data).CopyFrom(*in_mem);
+ const_cast<NDArray&>(out_data).CopyFrom(*in_mem);
}
MKLDNNStream::Get()->Submit();
}
-} // namespace op
-} // namespace mxnet
+} // namespace op
+} // namespace mxnet
#endif
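
For reference, MKLDNNCopy above dispatches on the request type: kNullOp and kWriteInplace are no-ops, kAddTo accumulates into the existing output, and anything else overwrites it. A tiny sketch of that contract on plain float buffers (CopyLike and the OpReq enum are illustrative, not MXNet types):

#include <cstddef>

enum OpReq { kNullOp, kWriteTo, kWriteInplace, kAddTo };  // loosely mirrors mxnet::OpReqType

void CopyLike(OpReq req, const float* in, float* out, std::size_t n) {
  if (req == kNullOp || req == kWriteInplace)
    return;  // output is unused or already aliases the input
  for (std::size_t i = 0; i < n; ++i)
    out[i] = (req == kAddTo) ? out[i] + in[i]  // accumulate into the existing output
                             : in[i];          // plain overwrite
}
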
diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h b/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h
index b51ec2a..b048c13 100644
--- a/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h
@@ -30,16 +30,17 @@
* (diff_bias) bias_grad <---| |<--- weight
* |______|<--- bias
*
- * "out" in this (and .cc) file will always refer to the output of Deconv FWD and
- * "out_grad" to its gradient. The corresponding MKLDNN names are in parentheses.
+ * "out" in this (and .cc) file will always refer to the output of Deconv FWD
+ * and "out_grad" to its gradient. The corresponding MKLDNN names are in
+ * parentheses.
*/
#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_DECONVOLUTION_INL_H_
#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_DECONVOLUTION_INL_H_
#if MXNET_USE_MKLDNN == 1
+#include <numeric>
#include <utility>
#include <vector>
-#include <numeric>
#include "../deconvolution-inl.h"
#include "./mkldnn_base-inl.h"
@@ -48,20 +49,19 @@
namespace mxnet {
namespace op {
-using deconv_fwd_t = mkldnn::deconvolution_forward;
+using deconv_fwd_t = mkldnn::deconvolution_forward;
using deconv_fwd_pd_t = mkldnn::deconvolution_forward::primitive_desc;
-using deconv_bwd_data_t = mkldnn::deconvolution_backward_data;
+using deconv_bwd_data_t = mkldnn::deconvolution_backward_data;
using deconv_bwd_data_pd_t = mkldnn::deconvolution_backward_data::primitive_desc;
-using deconv_bwd_weights_t = mkldnn::deconvolution_backward_weights;
+using deconv_bwd_weights_t = mkldnn::deconvolution_backward_weights;
using deconv_bwd_weights_pd_t = mkldnn::deconvolution_backward_weights::primitive_desc;
-
-
-// Swaps the logical order of dimensions that in plain format would correspond to input and output
-// channels (for example: oihw => iohw, iohw => oihw, goihw => giohw).
-inline mkldnn::memory::desc IOLogicalSwapDesc(const mkldnn::memory::desc &desc,
+// Swaps the logical order of dimensions that in plain format would correspond
+// to input and output channels (for example: oihw => iohw, iohw => oihw, goihw
+// => giohw).
+inline mkldnn::memory::desc IOLogicalSwapDesc(const mkldnn::memory::desc& desc,
const uint32_t num_group) {
std::vector<int> order(desc.data.ndims);
std::iota(std::begin(order), std::end(order), 0);
@@ -71,158 +71,172 @@ inline mkldnn::memory::desc IOLogicalSwapDesc(const mkldnn::memory::desc &desc,
}
// Applies IOLogicalSwapDesc to MKLDNN memory of arr
-inline void IOLogicalSwapMKLDNNMem(const NDArray &arr, const uint32_t num_group) {
+inline void IOLogicalSwapMKLDNNMem(const NDArray& arr, const uint32_t num_group) {
mkldnn::memory::desc desc;
if (arr.IsMKLDNNData()) {
desc = arr.GetMKLDNNData()->get_desc();
} else {
- // GetMKLDNNData won't take groups into account when creating mkldnn::memory, we need to use
- // descriptor from GetWeightDesc but with default format
- const auto &temp = GetWeightDesc(arr, num_group);
- desc = mkldnn::memory::desc(
- temp.dims(), temp.data_type(),
+ // GetMKLDNNData won't take groups into account when creating
+ // mkldnn::memory, we need to use descriptor from GetWeightDesc but with
+ // default format
+ const auto& temp = GetWeightDesc(arr, num_group);
+ desc = mkldnn::memory::desc(
+ temp.dims(),
+ temp.data_type(),
static_cast<mkldnn::memory::format_tag>(GetDefaultFormat(temp.data.ndims)));
}
- const_cast<NDArray &>(arr).UpdateMKLDNNMemDesc(IOLogicalSwapDesc(desc, num_group));
+ const_cast<NDArray&>(arr).UpdateMKLDNNMemDesc(IOLogicalSwapDesc(desc, num_group));
}
// Version of GetWeightsDesc for deconvolution (with swap)
-inline mkldnn::memory::desc GetDeconvWeightsDesc(const NDArray &weights, const uint32_t num_group) {
+inline mkldnn::memory::desc GetDeconvWeightsDesc(const NDArray& weights, const uint32_t num_group) {
return IOLogicalSwapDesc(GetWeightDesc(weights, num_group), num_group);
}
-
-
class MKLDNNDeconvFwd {
public:
struct Tensors {
- Tensors(const NDArray &data, const NDArray &weights, const NDArray *const bias,
- const NDArray &out);
- Tensors(const bool no_bias, const std::vector<NDArray> &inputs,
- const std::vector<NDArray> &outputs);
-
- const NDArray &data;
- const NDArray &weights;
- const NDArray *const bias;
- const NDArray &out;
+ Tensors(const NDArray& data,
+ const NDArray& weights,
+ const NDArray* const bias,
+ const NDArray& out);
+ Tensors(const bool no_bias,
+ const std::vector<NDArray>& inputs,
+ const std::vector<NDArray>& outputs);
+
+ const NDArray& data;
+ const NDArray& weights;
+ const NDArray* const bias;
+ const NDArray& out;
};
- static MKLDNNDeconvFwd &GetCached(const DeconvolutionParam &param, const Tensors &tensors);
- static std::shared_ptr<deconv_fwd_pd_t> CreatePrimitiveDesc(const DeconvolutionParam &param,
- const Tensors &tensors);
+ static MKLDNNDeconvFwd& GetCached(const DeconvolutionParam& param, const Tensors& tensors);
+ static std::shared_ptr<deconv_fwd_pd_t> CreatePrimitiveDesc(const DeconvolutionParam& param,
+ const Tensors& tensors);
- MKLDNNDeconvFwd(const DeconvolutionParam &param, const Tensors &tensors);
- void ControlWeightsFormat(const uint32_t num_group, const bool is_train,
- const NDArray &weights) const;
- void Execute(const uint32_t num_group, const OpReqType req, const Tensors &tensors) const;
+ MKLDNNDeconvFwd(const DeconvolutionParam& param, const Tensors& tensors);
+ void ControlWeightsFormat(const uint32_t num_group,
+ const bool is_train,
+ const NDArray& weights) const;
+ void Execute(const uint32_t num_group, const OpReqType req, const Tensors& tensors) const;
private:
- const mkldnn::memory *DataMem(const NDArray &data) const;
- const mkldnn::memory *WeightsMem(const uint32_t num_group, const NDArray &weights) const;
- const mkldnn::memory *BiasMem(const NDArray &bias) const;
+ const mkldnn::memory* DataMem(const NDArray& data) const;
+ const mkldnn::memory* WeightsMem(const uint32_t num_group, const NDArray& weights) const;
+ const mkldnn::memory* BiasMem(const NDArray& bias) const;
- mkldnn_output_t OutMem(const OpReqType req, const NDArray &out) const;
+ mkldnn_output_t OutMem(const OpReqType req, const NDArray& out) const;
private:
std::shared_ptr<deconv_fwd_t> fwd;
std::shared_ptr<deconv_fwd_pd_t> fwd_pd;
};
-
-MKLDNNDeconvFwd::Tensors::Tensors(const bool no_bias, const std::vector<NDArray> &inputs,
- const std::vector<NDArray> &outputs)
+MKLDNNDeconvFwd::Tensors::Tensors(const bool no_bias,
+ const std::vector<NDArray>& inputs,
+ const std::vector<NDArray>& outputs)
: data(inputs[deconv::kData]),
weights(inputs[deconv::kWeight]),
bias(no_bias ? nullptr : &inputs[deconv::kBias]),
out(outputs[deconv::kOut]) {}
-MKLDNNDeconvFwd::Tensors::Tensors(const NDArray &data, const NDArray &weights,
- const NDArray *const bias, const NDArray &out)
+MKLDNNDeconvFwd::Tensors::Tensors(const NDArray& data,
+ const NDArray& weights,
+ const NDArray* const bias,
+ const NDArray& out)
: data(data), weights(weights), bias(bias), out(out) {}
-MKLDNNDeconvFwd::MKLDNNDeconvFwd(const DeconvolutionParam &param, const Tensors &tensors)
+MKLDNNDeconvFwd::MKLDNNDeconvFwd(const DeconvolutionParam& param, const Tensors& tensors)
: fwd_pd(CreatePrimitiveDesc(param, tensors)) {
fwd = std::make_shared<deconv_fwd_t>(*fwd_pd);
}
-inline const mkldnn::memory *MKLDNNDeconvFwd::DataMem(const NDArray &data) const {
+inline const mkldnn::memory* MKLDNNDeconvFwd::DataMem(const NDArray& data) const {
return data.GetMKLDNNDataReorder(fwd_pd->src_desc());
}
-inline const mkldnn::memory *MKLDNNDeconvFwd::WeightsMem(const uint32_t num_group,
- const NDArray &weights) const {
+inline const mkldnn::memory* MKLDNNDeconvFwd::WeightsMem(const uint32_t num_group,
+ const NDArray& weights) const {
return GetWeights(weights, fwd_pd->weights_desc(), num_group);
}
-inline const mkldnn::memory *MKLDNNDeconvFwd::BiasMem(const NDArray &bias) const {
+inline const mkldnn::memory* MKLDNNDeconvFwd::BiasMem(const NDArray& bias) const {
return bias.GetMKLDNNData();
}
-inline mkldnn_output_t MKLDNNDeconvFwd::OutMem(const OpReqType req, const NDArray &out) const {
+inline mkldnn_output_t MKLDNNDeconvFwd::OutMem(const OpReqType req, const NDArray& out) const {
return CreateMKLDNNMem(out, fwd_pd->dst_desc(), req);
}
-
-
class MKLDNNDeconvBwd {
public:
struct ReadTensors {
- ReadTensors(const bool no_bias, const std::vector<NDArray> &inputs);
- const NDArray &data;
- const NDArray &weights;
- const NDArray *const bias;
- const NDArray &out_grad;
+ ReadTensors(const bool no_bias, const std::vector<NDArray>& inputs);
+ const NDArray& data;
+ const NDArray& weights;
+ const NDArray* const bias;
+ const NDArray& out_grad;
};
struct WriteTensors {
- WriteTensors(const bool no_bias, const std::vector<NDArray> &outputs);
- const NDArray &data_grad;
- const NDArray &weights_grad;
- const NDArray *const bias_grad;
+ WriteTensors(const bool no_bias, const std::vector<NDArray>& outputs);
+ const NDArray& data_grad;
+ const NDArray& weights_grad;
+ const NDArray* const bias_grad;
};
- static MKLDNNDeconvBwd &GetCached(const DeconvolutionParam &param,
- const ReadTensors &read_tensors);
+ static MKLDNNDeconvBwd& GetCached(const DeconvolutionParam& param,
+ const ReadTensors& read_tensors);
static std::shared_ptr<deconv_bwd_data_pd_t> CreateDataPrimitiveDesc(
- const DeconvolutionParam &param, const ReadTensors &read_tensors,
- const deconv_fwd_pd_t &fwd_pd);
+ const DeconvolutionParam& param,
+ const ReadTensors& read_tensors,
+ const deconv_fwd_pd_t& fwd_pd);
static std::shared_ptr<deconv_bwd_weights_pd_t> CreateWeightsPrimitiveDesc(
- const DeconvolutionParam &param, const ReadTensors &read_tensors,
- const deconv_fwd_pd_t &fwd_pd);
+ const DeconvolutionParam& param,
+ const ReadTensors& read_tensors,
+ const deconv_fwd_pd_t& fwd_pd);
- MKLDNNDeconvBwd(const DeconvolutionParam &param, const ReadTensors &read_tensors);
+ MKLDNNDeconvBwd(const DeconvolutionParam& param, const ReadTensors& read_tensors);
- void Execute(const uint32_t num_group, const std::vector<OpReqType> &req,
- const ReadTensors &read_tensors, const WriteTensors &write_tensors) const;
+ void Execute(const uint32_t num_group,
+ const std::vector<OpReqType>& req,
+ const ReadTensors& read_tensors,
+ const WriteTensors& write_tensors) const;
private:
- void IOSwapWeightsTensors(const uint32_t num_group, const std::vector<OpReqType> &req,
- const NDArray &weights, const NDArray &weights_grad) const;
-
- // returns the output gradient memory used to calculate the data (input) gradient,
- // which might be reused when calculating the gradient of weights
- const mkldnn::memory *ScheduleBwdData(const uint32_t num_group, const OpReqType req,
- const ReadTensors &read_tensors,
- const WriteTensors &write_tensors) const;
-
- void ScheduleBwdWeights(const uint32_t num_group, const std::vector<OpReqType> &req,
- const ReadTensors &read_tensors, const WriteTensors &write_tensors,
- const mkldnn::memory *const out_grad_mem) const;
-
- const mkldnn::memory *DataMem(const NDArray &data) const;
- const mkldnn::memory *WeightsMem(const uint32_t num_group, const NDArray &weights) const;
+ void IOSwapWeightsTensors(const uint32_t num_group,
+ const std::vector<OpReqType>& req,
+ const NDArray& weights,
+ const NDArray& weights_grad) const;
+
+ // returns the output gradient memory used to calculate the data (input)
+ // gradient, which might be reused when calculating the gradient of weights
+ const mkldnn::memory* ScheduleBwdData(const uint32_t num_group,
+ const OpReqType req,
+ const ReadTensors& read_tensors,
+ const WriteTensors& write_tensors) const;
+
+ void ScheduleBwdWeights(const uint32_t num_group,
+ const std::vector<OpReqType>& req,
+ const ReadTensors& read_tensors,
+ const WriteTensors& write_tensors,
+ const mkldnn::memory* const out_grad_mem) const;
+
+ const mkldnn::memory* DataMem(const NDArray& data) const;
+ const mkldnn::memory* WeightsMem(const uint32_t num_group, const NDArray& weights) const;
// for calculating the gradient of data (input)
- const mkldnn::memory *OutGradMem(const NDArray &out_grad) const;
+ const mkldnn::memory* OutGradMem(const NDArray& out_grad) const;
// for calculating the gradient of weights
- const mkldnn::memory *OutGradMem(const NDArray &out_grad,
- const mkldnn::memory *const out_grad_mem) const;
+ const mkldnn::memory* OutGradMem(const NDArray& out_grad,
+ const mkldnn::memory* const out_grad_mem) const;
- mkldnn_output_t DataGradMem(const OpReqType req, const NDArray &data_grad) const;
- mkldnn_output_t WeightsGradMem(const uint32_t num_group, const OpReqType req,
- const NDArray &weights_grad) const;
- mkldnn_output_t BiasGradMem(const OpReqType req, const NDArray *const bias) const;
+ mkldnn_output_t DataGradMem(const OpReqType req, const NDArray& data_grad) const;
+ mkldnn_output_t WeightsGradMem(const uint32_t num_group,
+ const OpReqType req,
+ const NDArray& weights_grad) const;
+ mkldnn_output_t BiasGradMem(const OpReqType req, const NDArray* const bias) const;
std::shared_ptr<deconv_bwd_data_pd_t> bwd_data_pd;
std::shared_ptr<deconv_bwd_weights_pd_t> bwd_weights_pd;
@@ -230,32 +244,32 @@ class MKLDNNDeconvBwd {
std::shared_ptr<deconv_bwd_weights_t> bwd_weights;
};
-
-MKLDNNDeconvBwd::ReadTensors::ReadTensors(const bool no_bias, const std::vector<NDArray> &inputs)
+MKLDNNDeconvBwd::ReadTensors::ReadTensors(const bool no_bias, const std::vector<NDArray>& inputs)
: data(inputs[deconv::kData + 1]),
weights(inputs[deconv::kWeight + 1]),
bias(no_bias ? nullptr : &inputs[deconv::kBias + 1]),
out_grad(inputs[deconv::kOut]) {}
-MKLDNNDeconvBwd::WriteTensors::WriteTensors(const bool no_bias, const std::vector<NDArray> &outputs)
+MKLDNNDeconvBwd::WriteTensors::WriteTensors(const bool no_bias, const std::vector<NDArray>& outputs)
: data_grad(outputs[deconv::kData]),
weights_grad(outputs[deconv::kWeight]),
bias_grad(no_bias ? nullptr : &outputs[deconv::kBias]) {}
-MKLDNNDeconvBwd::MKLDNNDeconvBwd(const DeconvolutionParam &param, const ReadTensors &read_tensors) {
- const auto &fwd_pd = MKLDNNDeconvFwd::CreatePrimitiveDesc(
- param, MKLDNNDeconvFwd::Tensors(read_tensors.data, read_tensors.weights, read_tensors.bias,
- read_tensors.out_grad));
- bwd_data_pd = CreateDataPrimitiveDesc(param, read_tensors, *fwd_pd);
+MKLDNNDeconvBwd::MKLDNNDeconvBwd(const DeconvolutionParam& param, const ReadTensors& read_tensors) {
+ const auto& fwd_pd = MKLDNNDeconvFwd::CreatePrimitiveDesc(
+ param,
+ MKLDNNDeconvFwd::Tensors(
+ read_tensors.data, read_tensors.weights, read_tensors.bias, read_tensors.out_grad));
+ bwd_data_pd = CreateDataPrimitiveDesc(param, read_tensors, *fwd_pd);
bwd_weights_pd = CreateWeightsPrimitiveDesc(param, read_tensors, *fwd_pd);
- bwd_data = std::make_shared<deconv_bwd_data_t>(*bwd_data_pd);
- bwd_weights = std::make_shared<deconv_bwd_weights_t>(*bwd_weights_pd);
+ bwd_data = std::make_shared<deconv_bwd_data_t>(*bwd_data_pd);
+ bwd_weights = std::make_shared<deconv_bwd_weights_t>(*bwd_weights_pd);
}
inline void MKLDNNDeconvBwd::IOSwapWeightsTensors(const uint32_t num_group,
- const std::vector<OpReqType> &req,
- const NDArray &weights,
- const NDArray &weights_grad) const {
+ const std::vector<OpReqType>& req,
+ const NDArray& weights,
+ const NDArray& weights_grad) const {
if (req[deconv::kData]) {
IOLogicalSwapMKLDNNMem(weights, num_group);
}
@@ -264,69 +278,74 @@ inline void MKLDNNDeconvBwd::IOSwapWeightsTensors(const uint32_t num_group,
}
}
-inline const mkldnn::memory *MKLDNNDeconvBwd::DataMem(const NDArray &data) const {
+inline const mkldnn::memory* MKLDNNDeconvBwd::DataMem(const NDArray& data) const {
return data.GetMKLDNNDataReorder(bwd_weights_pd->src_desc());
}
-inline const mkldnn::memory *MKLDNNDeconvBwd::WeightsMem(const uint32_t num_group,
- const NDArray &weights) const {
+inline const mkldnn::memory* MKLDNNDeconvBwd::WeightsMem(const uint32_t num_group,
+ const NDArray& weights) const {
return GetWeights(weights, bwd_data_pd->weights_desc(), num_group);
}
-inline const mkldnn::memory *MKLDNNDeconvBwd::OutGradMem(const NDArray &out_grad) const {
+inline const mkldnn::memory* MKLDNNDeconvBwd::OutGradMem(const NDArray& out_grad) const {
return out_grad.GetMKLDNNDataReorder(bwd_data_pd->diff_dst_desc());
}
-inline const mkldnn::memory *MKLDNNDeconvBwd::OutGradMem(
- const NDArray &out_grad, const mkldnn::memory *const out_grad_mem) const {
+inline const mkldnn::memory* MKLDNNDeconvBwd::OutGradMem(
+ const NDArray& out_grad,
+ const mkldnn::memory* const out_grad_mem) const {
return (out_grad_mem && out_grad_mem->get_desc() == bwd_weights_pd->diff_dst_desc())
? out_grad_mem
: out_grad.GetMKLDNNDataReorder(bwd_weights_pd->diff_dst_desc());
}
inline mkldnn_output_t MKLDNNDeconvBwd::DataGradMem(const OpReqType req,
- const NDArray &data_grad) const {
+ const NDArray& data_grad) const {
return CreateMKLDNNMem(data_grad, bwd_data_pd->diff_src_desc(), req);
}
inline mkldnn_output_t MKLDNNDeconvBwd::WeightsGradMem(const uint32_t num_group,
const OpReqType req,
- const NDArray &weights_grad) const {
- // CreateMKLDNNWeightGrad always creates a new tensor as IsDefaultFormat always fails (because
- // of the logical swap - explained in MKLDNNDeconvFwd::Execute). We try to reuse weights_grad
- // memory (which, when not swapped, is always in default format), so here we check if after a
+ const NDArray& weights_grad) const {
+ // CreateMKLDNNWeightGrad always creates a new tensor as IsDefaultFormat
+ // always fails (because of the logical swap - explained in
+ // MKLDNNDeconvFwd::Execute). We try to reuse weights_grad memory (which, when
+ // not swapped, is always in default format), so here we check if after a
// swap, weights_md will have a default format
- const auto &weights_md = bwd_weights_pd->diff_weights_desc();
+ const auto& weights_md = bwd_weights_pd->diff_weights_desc();
if (req == OpReqType::kWriteTo && IsDefaultFormat(IOLogicalSwapDesc(weights_md, num_group))) {
- return {OutDataOp::Noop, const_cast<NDArray &>(weights_grad).CreateMKLDNNData(weights_md)};
+ return {OutDataOp::Noop, const_cast<NDArray&>(weights_grad).CreateMKLDNNData(weights_md)};
}
return CreateMKLDNNWeightGrad(weights_grad, weights_md, req);
}
inline mkldnn_output_t MKLDNNDeconvBwd::BiasGradMem(const OpReqType req,
- const NDArray *const bias) const {
+ const NDArray* const bias) const {
return bias ? CreateMKLDNNMem(*bias, bwd_weights_pd->diff_bias_desc(), req)
: mkldnn_output_t(OutDataOp::Noop, nullptr);
}
-
-
// Utility class for creating operation descriptors of deconvolution primitives
class DeconvDescCreator {
public:
- DeconvDescCreator(const DeconvolutionParam &param, const NDArray &data, const NDArray &weights,
- const NDArray *const bias, const NDArray &out);
-
- // Imposes plain formats on memory descriptors with padding (so the next selected implementation
- // will pass CheckImplSizeReq). After calling this method, new primitive descriptor (with new
- // operator descriptor) should be created, which should select an implementation with matching
- // size requirements.
- // data_size, weights_size, out_size - size requirements of current implementation
- // Returns whether successfully imposed a plain format on any of the data, weights, and output
- // memory descriptors.
- bool ImposePlainWherePadding(const size_t data_size, const size_t weights_size,
+ DeconvDescCreator(const DeconvolutionParam& param,
+ const NDArray& data,
+ const NDArray& weights,
+ const NDArray* const bias,
+ const NDArray& out);
+
+ // Imposes plain formats on memory descriptors with padding (so the next
+ // selected implementation will pass CheckImplSizeReq). After calling this
+ // method, a new primitive descriptor (with a new operator descriptor) should
+ // be created, which should select an implementation with matching size
+ // requirements.
+ // data_size, weights_size, out_size - size requirements of the current
+ // implementation.
+ // Returns whether a plain format was successfully imposed on any of the
+ // data, weights, and output memory descriptors.
+ bool ImposePlainWherePadding(const size_t data_size,
+ const size_t weights_size,
const size_t out_size);
- bool CheckImplSizeReq(const size_t data_size, const size_t weights_size,
+ bool CheckImplSizeReq(const size_t data_size,
+ const size_t weights_size,
const size_t out_size) const;
deconv_fwd_t::desc CreateFwdDesc() const;
@@ -344,8 +363,8 @@ class DeconvDescCreator {
mkldnn::memory::dims dilates;
};
-
-inline bool DeconvDescCreator::CheckImplSizeReq(const size_t data_size, const size_t weights_size,
+inline bool DeconvDescCreator::CheckImplSizeReq(const size_t data_size,
+ const size_t weights_size,
const size_t out_size) const {
// MKLDNN introduced padded formats since 0.15 which require more memory
// compared to the actual size of the tensor. Currently, MKLDNN operators
@@ -357,18 +376,38 @@ inline bool DeconvDescCreator::CheckImplSizeReq(const size_t data_size, const si
inline deconv_fwd_t::desc DeconvDescCreator::CreateFwdDesc() const {
return deconv_fwd_t::desc(mkldnn::prop_kind::forward_training,
- mkldnn::algorithm::deconvolution_direct, data_md, weights_md, bias_md,
- out_md, strides, dilates, padding, padding);
+ mkldnn::algorithm::deconvolution_direct,
+ data_md,
+ weights_md,
+ bias_md,
+ out_md,
+ strides,
+ dilates,
+ padding,
+ padding);
}
inline deconv_bwd_data_t::desc DeconvDescCreator::CreateBwdDataDesc() const {
- return deconv_bwd_data_t::desc(mkldnn::algorithm::deconvolution_direct, data_md, weights_md,
- out_md, strides, dilates, padding, padding);
+ return deconv_bwd_data_t::desc(mkldnn::algorithm::deconvolution_direct,
+ data_md,
+ weights_md,
+ out_md,
+ strides,
+ dilates,
+ padding,
+ padding);
}
inline deconv_bwd_weights_t::desc DeconvDescCreator::CreateBwdWeightsDesc() const {
- return deconv_bwd_weights_t::desc(mkldnn::algorithm::deconvolution_direct, data_md, weights_md,
- bias_md, out_md, strides, dilates, padding, padding);
+ return deconv_bwd_weights_t::desc(mkldnn::algorithm::deconvolution_direct,
+ data_md,
+ weights_md,
+ bias_md,
+ out_md,
+ strides,
+ dilates,
+ padding,
+ padding);
}
} // namespace op
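
IOLogicalSwapDesc above only permutes the logical order of the input- and output-channel dimensions (oihw <-> iohw, goihw <-> giohw); the underlying data is untouched. A small self-contained sketch of that reordering on a plain dims vector (IOLogicalSwapDims is an illustrative helper, not the MXNet function, which operates on mkldnn::memory::desc):

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

std::vector<int64_t> IOLogicalSwapDims(std::vector<int64_t> dims, uint32_t num_group) {
  const std::size_t offset = (num_group > 1) ? 1 : 0;  // skip the leading group axis, if any
  if (dims.size() >= offset + 2)
    std::swap(dims[offset], dims[offset + 1]);  // swap output- and input-channel dims
  return dims;
}

int main() {
  for (int64_t d : IOLogicalSwapDims({8, 16, 3, 3}, 1))  // oihw -> iohw: 16 8 3 3
    std::cout << d << ' ';
  std::cout << '\n';
  for (int64_t d : IOLogicalSwapDims({2, 8, 16, 3, 3}, 2))  // goihw -> giohw: 2 16 8 3 3
    std::cout << d << ' ';
  std::cout << '\n';
  return 0;
}
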
diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
index 2160815..211ccd6 100644
--- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
+++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
@@ -28,29 +28,28 @@
namespace mxnet {
namespace op {
-bool SupportMKLDNNDeconv(const DeconvolutionParam &params, const NDArray &input) {
+bool SupportMKLDNNDeconv(const DeconvolutionParam& params, const NDArray& input) {
return params.kernel.ndim() >= 1 && params.kernel.ndim() <= 3 &&
input.shape().ndim() == (params.kernel.ndim() + 2) &&
(input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16);
}
-
-
-void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
- const std::vector<NDArray> &inputs,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs) {
+void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs) {
TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]);
- const auto &param = nnvm::get<DeconvolutionParam>(attrs.parsed);
+ const auto& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
const auto tensors = MKLDNNDeconvFwd::Tensors(param.no_bias, inputs, outputs);
- const auto &fwd = MKLDNNDeconvFwd::GetCached(param, tensors);
+ const auto& fwd = MKLDNNDeconvFwd::GetCached(param, tensors);
fwd.ControlWeightsFormat(param.num_group, ctx.is_train, tensors.weights);
fwd.Execute(param.num_group, req[deconv::kOut], tensors);
}
-MKLDNNDeconvFwd &MKLDNNDeconvFwd::GetCached(const DeconvolutionParam &param,
- const Tensors &tensors) {
+MKLDNNDeconvFwd& MKLDNNDeconvFwd::GetCached(const DeconvolutionParam& param,
+ const Tensors& tensors) {
using deconv_fwd_map = std::unordered_map<DeconvSignature, MKLDNNDeconvFwd, OpHash>;
#if DMLC_CXX11_THREAD_LOCAL
static thread_local deconv_fwd_map fwds;
@@ -74,18 +73,20 @@ MKLDNNDeconvFwd &MKLDNNDeconvFwd::GetCached(const DeconvolutionParam &param,
}
std::shared_ptr<deconv_fwd_pd_t> MKLDNNDeconvFwd::CreatePrimitiveDesc(
- const DeconvolutionParam &param, const Tensors &tensors) {
+ const DeconvolutionParam& param,
+ const Tensors& tensors) {
DeconvDescCreator ddc(param, tensors.data, tensors.weights, tensors.bias, tensors.out);
- const auto &engine = CpuEngine::Get()->get_engine();
- const auto pd = std::make_shared<deconv_fwd_pd_t>(ddc.CreateFwdDesc(), engine);
- const auto get_data_size = [&pd]() { return pd->src_desc().get_size(); };
+ const auto& engine = CpuEngine::Get()->get_engine();
+ const auto pd = std::make_shared<deconv_fwd_pd_t>(ddc.CreateFwdDesc(), engine);
+ const auto get_data_size = [&pd]() { return pd->src_desc().get_size(); };
const auto get_weights_size = [&pd]() { return pd->weights_desc().get_size(); };
- const auto get_out_size = [&pd]() { return pd->dst_desc().get_size(); };
+ const auto get_out_size = [&pd]() { return pd->dst_desc().get_size(); };
while (!ddc.CheckImplSizeReq(get_data_size(), get_weights_size(), get_out_size())) {
if (!pd->next_impl()) {
- // ImposePlainWherePadding fails when all memory descriptors already have plain formats
- // imposed, meaning there is no implementation with plain formats
+ // ImposePlainWherePadding fails when all memory descriptors already have
+ // plain formats imposed, meaning there is no implementation with plain
+ // formats
CHECK(ddc.ImposePlainWherePadding(get_data_size(), get_weights_size(), get_out_size()))
<< "No implementation of deconvolution forward propagation";
*pd = deconv_fwd_pd_t(ddc.CreateFwdDesc(), engine);
@@ -94,13 +95,15 @@ std::shared_ptr<deconv_fwd_pd_t> MKLDNNDeconvFwd::CreatePrimitiveDesc(
return pd;
}
-void MKLDNNDeconvFwd::ControlWeightsFormat(const uint32_t num_group, const bool is_train,
- const NDArray &weights) const {
+void MKLDNNDeconvFwd::ControlWeightsFormat(const uint32_t num_group,
+ const bool is_train,
+ const NDArray& weights) const {
if (is_train) {
// TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it
// to the default format for now.
if (weights.IsMKLDNNData()) {
- // This asks the engine to change the layout of the weights array after it's used.
+ // This asks the engine to change the layout of the weights array after
+ // it's used.
weights.Reorder2DefaultAsync();
}
} else {
@@ -117,32 +120,38 @@ void MKLDNNDeconvFwd::ControlWeightsFormat(const uint32_t num_group, const bool
}
}
-void MKLDNNDeconvFwd::Execute(const uint32_t num_group, const OpReqType req,
- const Tensors &tensors) const {
- // MXNet (correctly) assumes that deconvolution is implemented using convolution primitives.
- // For that, we would pass input tensor in place of output and output tensor in place of input
- // (for appropriate convolution primitives: deconvolution forward = convolution backward data,
+void MKLDNNDeconvFwd::Execute(const uint32_t num_group,
+ const OpReqType req,
+ const Tensors& tensors) const {
+ // MXNet (correctly) assumes that deconvolution is implemented using
+ // convolution primitives. For that, we would pass input tensor in place of
+ // output and output tensor in place of input (for appropriate convolution
+ // primitives: deconvolution forward = convolution backward data,
// deconvolution backward data = convolution forward).
// The convolution primitive expects weights tensor with the shape of
- // (primitive_out_channels, primitive_in_channels, h, w), but with swapped input and output:
- // primitive_out_channels = deconv_in_channels, primitive_in_channels = deconv_out_channels,
- // so it becomes (deconv_in_channels, deconv_out_channels, h, w) and MXNet provides such tensor.
+ // (primitive_out_channels, primitive_in_channels, h, w), but with swapped
+ // input and output: primitive_out_channels = deconv_in_channels,
+ // primitive_in_channels = deconv_out_channels, so it becomes
+ // (deconv_in_channels, deconv_out_channels, h, w) and MXNet provides such
+ // tensor.
//
- // MKLDNN deconvolution primitive also (as convolution) expects weights tensor with the shape of
- // (primitive_out_channels, primitive_in_channels, h, w), but this time we don't swap input and
- // output tensors, so:
- // primitive_out_channels = deconv_out_channels, primitive_in_channels = deconv_in_channels,
- // thus the current weights tensor won't fit (when deconv_out_channels != deconv_in_channels).
- // However, underneath deconvolution MKLDNN also uses convolution, so even though it expects the
- // weights tensor with the logical order of oihw, it wants its physical representation to
- // match the order of iohw, which is the same as current weights tensor.
+ // MKLDNN deconvolution primitive also (as convolution) expects weights tensor
+ // with the shape of (primitive_out_channels, primitive_in_channels, h, w),
+ // but this time we don't swap input and output tensors, so:
+ // primitive_out_channels = deconv_out_channels, primitive_in_channels =
+ // deconv_in_channels, thus the current weights tensor won't fit (when
+ // deconv_out_channels != deconv_in_channels). However, underneath
+ // deconvolution MKLDNN also uses convolution, so even though it expects the
+ // weights tensor with the logical order of oihw, it wants its physical
+ // representation to match the order of iohw, which is the same as current
+ // weights tensor.
//
- // So here we swap logical order of input and output dimensions for weights tensor just for
- // MKLDNN operations.
+ // So here we swap logical order of input and output dimensions for weights
+ // tensor just for MKLDNN operations.
IOLogicalSwapMKLDNNMem(tensors.weights, num_group);
{
mkldnn_args_map_t net_args;
- const auto &out_mem = OutMem(req, tensors.out);
+ const auto& out_mem = OutMem(req, tensors.out);
net_args.insert({MKLDNN_ARG_SRC, *DataMem(tensors.data)});
net_args.insert({MKLDNN_ARG_WEIGHTS, *WeightsMem(num_group, tensors.weights)});
@@ -156,28 +165,28 @@ void MKLDNNDeconvFwd::Execute(const uint32_t num_group, const OpReqType req,
CommitOutput(tensors.out, out_mem);
MKLDNNStream::Get()->Submit();
}
- IOLogicalSwapMKLDNNMem(tensors.weights, num_group); // swap back from oihw to iohw
+ IOLogicalSwapMKLDNNMem(tensors.weights,
+ num_group); // swap back from oihw to iohw
}
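
To make the channel bookkeeping in the comment above concrete, here is a hedged, worked example with made-up numbers. For a deconvolution with 8 input and 16 output channels, MXNet stores the weights as (8, 16, kH, kW). The MKLDNN deconvolution primitive wants logical dims (16, 8, kH, kW) but an iohw physical layout, which is exactly the existing buffer, so only the logical dims (and strides) are swapped and no element moves:

#include <array>
#include <cstdint>
#include <iostream>

int main() {
  const std::array<int64_t, 4> dims   = {8, 16, 3, 3};              // (deconv_in, deconv_out, kH, kW)
  const std::array<int64_t, 4> stride = {16 * 3 * 3, 3 * 3, 3, 1};  // row-major strides of that buffer
  // Logical swap: dims become (16, 8, 3, 3) and the first two strides swap with them.
  const std::array<int64_t, 4> sdims   = {dims[1], dims[0], dims[2], dims[3]};
  const std::array<int64_t, 4> sstride = {stride[1], stride[0], stride[2], stride[3]};
  // Element (o=5, i=2, h=1, w=1) after the swap lands on the same address as
  // element (i=2, o=5, h=1, w=1) before it -- the data never moves.
  const int64_t before = 2 * stride[0] + 5 * stride[1] + 1 * stride[2] + 1 * stride[3];
  const int64_t after  = 5 * sstride[0] + 2 * sstride[1] + 1 * sstride[2] + 1 * sstride[3];
  std::cout << before << " == " << after << '\n';  // prints "337 == 337"
  (void)sdims;  // swapped dims shown only for illustration
  return 0;
}
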
-
-
-void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
- const std::vector<NDArray> &inputs,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs) {
+void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs) {
CHECK_NE(req[deconv::kWeight], kWriteInplace) << "Cannot write weights inplace";
TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]);
- const auto &param = nnvm::get<DeconvolutionParam>(attrs.parsed);
- const auto read_tensors = MKLDNNDeconvBwd::ReadTensors(param.no_bias, inputs);
+ const auto& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
+ const auto read_tensors = MKLDNNDeconvBwd::ReadTensors(param.no_bias, inputs);
const auto write_tensors = MKLDNNDeconvBwd::WriteTensors(param.no_bias, outputs);
- MKLDNNDeconvBwd &bwd = MKLDNNDeconvBwd::GetCached(param, read_tensors);
+ MKLDNNDeconvBwd& bwd = MKLDNNDeconvBwd::GetCached(param, read_tensors);
bwd.Execute(param.num_group, req, read_tensors, write_tensors);
}
-MKLDNNDeconvBwd &MKLDNNDeconvBwd::GetCached(const DeconvolutionParam &param,
- const ReadTensors &read_tensors) {
+MKLDNNDeconvBwd& MKLDNNDeconvBwd::GetCached(const DeconvolutionParam& param,
+ const ReadTensors& read_tensors) {
using deconv_bwd_map = std::unordered_map<DeconvSignature, MKLDNNDeconvBwd, OpHash>;
#if DMLC_CXX11_THREAD_LOCAL
static thread_local deconv_bwd_map bwds;
@@ -201,20 +210,22 @@ MKLDNNDeconvBwd &MKLDNNDeconvBwd::GetCached(const DeconvolutionParam &param,
}
std::shared_ptr<deconv_bwd_data_pd_t> MKLDNNDeconvBwd::CreateDataPrimitiveDesc(
- const DeconvolutionParam &param, const ReadTensors &read_tensors,
- const deconv_fwd_pd_t &fwd_pd) {
- DeconvDescCreator ddc(param, read_tensors.data, read_tensors.weights, nullptr,
- read_tensors.out_grad);
- const auto &engine = CpuEngine::Get()->get_engine();
+ const DeconvolutionParam& param,
+ const ReadTensors& read_tensors,
+ const deconv_fwd_pd_t& fwd_pd) {
+ DeconvDescCreator ddc(
+ param, read_tensors.data, read_tensors.weights, nullptr, read_tensors.out_grad);
+ const auto& engine = CpuEngine::Get()->get_engine();
const auto pd = std::make_shared<deconv_bwd_data_pd_t>(ddc.CreateBwdDataDesc(), engine, fwd_pd);
- const auto get_data_size = [&pd]() { return pd->diff_src_desc().get_size(); };
+ const auto get_data_size = [&pd]() { return pd->diff_src_desc().get_size(); };
const auto get_weights_size = [&pd]() { return pd->weights_desc().get_size(); };
- const auto get_out_size = [&pd]() { return pd->diff_dst_desc().get_size(); };
+ const auto get_out_size = [&pd]() { return pd->diff_dst_desc().get_size(); };
while (!ddc.CheckImplSizeReq(get_data_size(), get_weights_size(), get_out_size())) {
if (!pd->next_impl()) {
- // ImposePlainWherePadding fails when all memory descriptors already have plain formats
- // imposed, meaning there is no implementation with plain formats
+ // ImposePlainWherePadding fails when all memory descriptors already have
+ // plain formats imposed, meaning there is no implementation with plain
+ // formats
CHECK(ddc.ImposePlainWherePadding(get_data_size(), get_weights_size(), get_out_size()))
<< "No implementation of deconvolution backward propagation";
*pd = deconv_bwd_data_pd_t(ddc.CreateBwdDataDesc(), engine, fwd_pd);
@@ -224,21 +235,23 @@ std::shared_ptr<deconv_bwd_data_pd_t> MKLDNNDeconvBwd::CreateDataPrimitiveDesc(
}
std::shared_ptr<deconv_bwd_weights_pd_t> MKLDNNDeconvBwd::CreateWeightsPrimitiveDesc(
- const DeconvolutionParam &param, const ReadTensors &read_tensors,
- const deconv_fwd_pd_t &fwd_pd) {
- DeconvDescCreator ddc(param, read_tensors.data, read_tensors.weights, read_tensors.bias,
- read_tensors.out_grad);
- const auto &engine = CpuEngine::Get()->get_engine();
+ const DeconvolutionParam& param,
+ const ReadTensors& read_tensors,
+ const deconv_fwd_pd_t& fwd_pd) {
+ DeconvDescCreator ddc(
+ param, read_tensors.data, read_tensors.weights, read_tensors.bias, read_tensors.out_grad);
+ const auto& engine = CpuEngine::Get()->get_engine();
const auto pd =
std::make_shared<deconv_bwd_weights_pd_t>(ddc.CreateBwdWeightsDesc(), engine, fwd_pd);
- const auto get_data_size = [&pd]() { return pd->src_desc().get_size(); };
+ const auto get_data_size = [&pd]() { return pd->src_desc().get_size(); };
const auto get_weights_size = [&pd]() { return pd->diff_weights_desc().get_size(); };
- const auto get_out_size = [&pd]() { return pd->diff_dst_desc().get_size(); };
+ const auto get_out_size = [&pd]() { return pd->diff_dst_desc().get_size(); };
while (!ddc.CheckImplSizeReq(get_data_size(), get_weights_size(), get_out_size())) {
if (!pd->next_impl()) {
- // ImposePlainWherePadding fails when all memory descriptors already have plain formats
- // imposed, meaning there is no implementation with plain formats
+ // ImposePlainWherePadding fails when all memory descriptors already have
+ // plain formats imposed, meaning there is no implementation with plain
+ // formats
CHECK(ddc.ImposePlainWherePadding(get_data_size(), get_weights_size(), get_out_size()))
<< "No implementation of calculating deconvolution weights gradient";
*pd = deconv_bwd_weights_pd_t(ddc.CreateBwdWeightsDesc(), engine, fwd_pd);
@@ -247,13 +260,14 @@ std::shared_ptr<deconv_bwd_weights_pd_t> MKLDNNDeconvBwd::CreateWeightsPrimitive
return pd;
}
... 14800 lines suppressed ...