Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2018/06/13 21:31:58 UTC

[GitHub] anirudh2290 closed pull request #11212: cherry-pick bug fixes in MKLDNN for v1.2.0

URL: https://github.com/apache/incubator-mxnet/pull/11212

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 05d8021c367..ed96a6c8371 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -187,8 +187,12 @@ endif()
 
 if(USE_MKL_IF_AVAILABLE)
   if(USE_MKLDNN)
+    # We need to use a generic architecture. Otherwise, MKLDNN compiled on one
+    # CPU architecture (e.g., C5) can't run on another architecture (e.g., g3).
+    set(ARCH_OPT_FLAGS "-mtune=generic")
     add_subdirectory(3rdparty/mkldnn)
     include_directories(3rdparty/mkldnn/include)
+    add_definitions(-DMXNET_USE_MKLDNN=1)
     list(APPEND mxnet_LINKER_LIBS mkldnn)
   endif()
   find_package(MKL)
@@ -197,10 +201,6 @@ if(USE_MKL_IF_AVAILABLE)
     include_directories(${MKL_INCLUDE_DIR})
     include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src/operator/mkl)
 
-    if(USE_MKLDNN)
-      add_definitions(-DMXNET_USE_MKLDNN=1)
-    endif()
-
     add_definitions(-DUSE_MKL=1)
     add_definitions(-DCUB_MKL=1)
     list(APPEND mxnet_LINKER_LIBS ${MKL_LIBRARIES})
diff --git a/Jenkinsfile b/Jenkinsfile
index 8686012164d..84116e4d85b 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -26,7 +26,7 @@ mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdpart
 mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a'
 // mxnet cmake libraries, in cmake builds we do not produce a libnnvm static library by default.
 mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
-mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so, build/3rdparty/mkldnn/src/libmkldnn.so.0'
+mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0'
 mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/nnvm/lib/libnnvm.a'
 // command to start a docker container
 docker_run = 'tests/ci_build/ci_build.sh'
@@ -107,6 +107,12 @@ def python3_ut(docker_container_name) {
   }
 }
 
+def python3_ut_mkldnn(docker_container_name) {
+  timeout(time: max_time, unit: 'MINUTES') {
+    sh "ci/build.py --platform ${docker_container_name} /work/runtime_functions.sh unittest_ubuntu_python3_cpu_mkldnn"
+  }
+}
+
 // GPU test has two parts. 1) run unittest on GPU, 2) compare the results on
 // both CPU and GPU
 // Python 2
@@ -438,7 +444,7 @@ try {
         ws('workspace/ut-python3-mkldnn-cpu') {
           init_git()
           unpack_lib('mkldnn_cpu', mx_mkldnn_lib)
-          python3_ut('ubuntu_cpu')
+          python3_ut_mkldnn('ubuntu_cpu')
         }
       }
     },
@@ -528,6 +534,17 @@ try {
         }
       }
     },
+    'Cpp: MKLDNN+GPU': {
+      node('mxnetlinux-gpu') {
+        ws('workspace/ut-cpp-mkldnn-gpu') {
+          timeout(time: max_time, unit: 'MINUTES') {
+            init_git()
+            unpack_lib('cmake_mkldnn_gpu', mx_cmake_mkldnn_lib)
+            sh "ci/build.py --nvidiadocker --platform ubuntu_gpu /work/runtime_functions.sh unittest_ubuntu_gpu_cpp"
+          }
+        }
+      }
+    },
     'R: CPU': {
       node('mxnetlinux-cpu') {
         ws('workspace/ut-r-cpu') {
diff --git a/MKL_README.md b/MKL_README.md
index 5374adb8e42..a5c63b097c5 100644
--- a/MKL_README.md
+++ b/MKL_README.md
@@ -1,19 +1,77 @@
-# Full MKL Installation
-
-## Build/Install MXNet with a full MKL installation:
-Installing and enabling the full MKL installation enables MKL support for all operators under the linalg namespace.
-
-  1. Download and install the latest full MKL version following instructions on the [intel website.](https://software.intel.com/en-us/articles/intel-mkl-111-install-guide)
-
-  2. Set USE_BLAS=mkl in make/config.mk
-
-        1.1 Set ADD_LDFLAGS=-L<path/to/mkl/lib/folder> (ex. ADD_LDFLAGS=-L/opt/intel/compilers_and_libraries_2018.0.128/linux/mkl/lib)
-
-        1.1 Set ADD_CFLAGS=-I<path/to/mkl/include/folder> (ex. ADD_CFLAGS=-L/opt/intel/compilers_and_libraries_2018.0.128/linux/mkl/include)
-
-  3. Run 'make -j ${nproc}'
-
-  4. Navigate into the python directory
-
-  5. Run 'sudo python setup.py install'
-
+## Build/Install MXNet with a full MKL installation:
+
+To make it convenient for customers, Intel introduced a new license called the [Intel® Simplified Software License](https://software.intel.com/en-us/license/intel-simplified-software-license) that allows redistribution of not only dynamic libraries but also headers, examples, and static libraries.
+
+Installing the full MKL package enables MKL support for all operators under the linalg namespace.
+
+  1. Download and install the latest full MKL version following instructions on the [intel website.](https://software.intel.com/en-us/mkl)
+
+  2. Run 'make -j $(nproc) USE_BLAS=mkl'
+
+  3. Navigate into the python directory
+
+  4. Run 'sudo python setup.py install'
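+
+For example, assuming MKL is installed where ```make``` can find it (see step 1), the full sequence from steps 2-4 looks like:
+```
+    make -j $(nproc) USE_BLAS=mkl
+    cd python
+    sudo python setup.py install
+```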
+
+
+## Build/Install MXNet with MKLDNN on Windows:
+
+To build and install MXNet yourself, install the following dependencies:
+
+1. If [Microsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/) is not already installed, download and install it. You can use the free Community edition.
+2. Download and Install [CMake](https://cmake.org/) if it is not already installed.
+3. Download and install [OpenCV](http://sourceforge.net/projects/opencvlibrary/files/opencv-win/3.0.0/opencv-3.0.0.exe/download).
+4. Unzip the OpenCV package.
+5. Set the environment variable ```OpenCV_DIR``` to point to the OpenCV build directory (```C:\opencv\build\x64\vc14``` for example). Also add the OpenCV bin directory (```C:\opencv\build\x64\vc14\bin``` for example) to the ```PATH``` variable.
+6. If you have the Intel Math Kernel Library (MKL) installed, set ```MKL_ROOT``` to point to the MKL directory that contains the ```include``` and ```lib``` folders. If you want to use the MKL BLAS, pass ```-DUSE_BLAS=mkl``` when running CMake. Typically, you can find the directory in
+```C:\Program Files (x86)\IntelSWTools\compilers_and_libraries_2018\windows\mkl```.
+7. If you don't have the Intel Math Kernel Library (MKL) installed, download and install [OpenBLAS](http://sourceforge.net/projects/openblas/files/v0.2.14/). Note that you should also download ```mingw64.dll.zip``` along with OpenBLAS and add its DLLs to the ```PATH```.
+8. Set the environment variable ```OpenBLAS_HOME``` to point to the ```OpenBLAS``` directory that contains the ```include``` and ```lib``` directories. Typically, you can find the directory in ```C:\Program Files (x86)\OpenBLAS\```.
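+
+For reference, a minimal sketch of setting these variables from a command prompt (the paths are illustrative; adjust them to your install locations):
+```CMD
+    REM setx persists variables for new command prompts; paths below are examples
+    setx OpenCV_DIR C:\opencv\build\x64\vc14
+    setx OpenBLAS_HOME "C:\Program Files (x86)\OpenBLAS"
+    setx MKL_ROOT "C:\Program Files (x86)\IntelSWTools\compilers_and_libraries_2018\windows\mkl"
+```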
+
+After you have installed all of the required dependencies, build the MXNet source code:
+
+1. Download the MXNet source code from [GitHub](https://github.com/apache/incubator-mxnet). Don't forget to pull the submodules:
+```
+    git clone https://github.com/apache/incubator-mxnet.git --recursive
+```
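+
+If you cloned without the ```--recursive``` flag, you can fetch the submodules afterwards:
+```
+    git submodule update --init --recursive
+```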
+
+2. Copy the file `3rdparty/mkldnn/config_template.vcxproj` to the incubator-mxnet root.
+
+3. Start a Visual Studio command prompt.
+
+4. Use [CMake](https://cmake.org/) to create a Visual Studio solution in ```./build``` or some other directory. Make sure to specify the architecture in the CMake command:
+```
+    mkdir build
+    cd build
+    cmake -G "Visual Studio 14 Win64" .. -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 -DCUDA_ARCH_NAME=All -DUSE_MKLDNN=1 -DCMAKE_BUILD_TYPE=Release
+```
+
+5. In Visual Studio, open the solution file, ```.sln```, and compile it.
+These commands produce a library called ```libmxnet.dll``` in the ```./build/Release/``` or ```./build/Debug``` folder,
+and ```libmkldnn.dll``` in the ```./build/3rdparty/mkldnn/src/Release/``` folder.
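+
+Alternatively, the generated solution can be built from the same command prompt with CMake's build driver:
+```
+    cmake --build . --config Release
+```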
+
+6. Make sure that all the DLL files used above (such as `libmkldnn.dll`, `libmklml.dll`, `libiomp5.dll`, `libopenblas.dll`, etc.) are added to the system ```PATH```. For convenience, you can copy them into ```\windows\system32```; otherwise you will run into `Not Found Dependencies` errors when loading MXNet.
+
+## Install MXNet for Python
+
+1. Install ```Python``` using the Windows installer available [here](https://www.python.org/downloads/release/python-2712/).
+2. Install ```Numpy``` using the Windows installer available [here](http://scipy.org/install.html).
+3. Next, install the Python interface for MXNet. You can find the Python interface package for [MXNet on GitHub](https://github.com/dmlc/mxnet/tree/master/python/mxnet).
+
+```CMD
+    cd python
+    python setup.py install
+```
+Done! We have installed MXNet with the Python interface. Run the commands below to verify that the installation was successful.
+```CMD
+    # Open Python terminal
+    python
+
+    # You should be able to import the mxnet library without any issues.
+    >>> import mxnet as mx
+    >>> a = mx.nd.ones((2, 3))
+    >>> print((a*2).asnumpy())
+    [[ 2.  2.  2.]
+     [ 2.  2.  2.]]
+```
+We just did a small tensor computation using MXNet! You are all set with MKLDNN-enabled MXNet on your Windows machine.
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 44de137b6a8..4ab85064e30 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -323,6 +323,9 @@ build_ubuntu_gpu_cmake_mkldnn() {
         /work/mxnet
 
     ninja -v
+    # libmkldnn.so.0 is a symlink. We need an actual binary file named libmkldnn.so.0.
+    cp 3rdparty/mkldnn/src/libmkldnn.so.0 3rdparty/mkldnn/src/libmkldnn.so.0.tmp
+    mv 3rdparty/mkldnn/src/libmkldnn.so.0.tmp 3rdparty/mkldnn/src/libmkldnn.so.0
 }
 
 build_ubuntu_gpu_cmake() {
@@ -375,6 +378,18 @@ unittest_ubuntu_python3_cpu() {
     nosetests-3.4 --verbose tests/python/quantization
 }
 
+unittest_ubuntu_python3_cpu_mkldnn() {
+    set -ex
+    export PYTHONPATH=./python/ 
+    # MXNET_MKLDNN_DEBUG is buggy and produces false positives
+    # https://github.com/apache/incubator-mxnet/issues/10026
+    #export MXNET_MKLDNN_DEBUG=1  # Ignored if not present
+    export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
+    nosetests-3.4 --verbose tests/python/unittest
+    nosetests-3.4 --verbose tests/python/quantization
+    nosetests-3.4 --verbose tests/python/mkl
+}
+
 unittest_ubuntu_python2_gpu() {
     set -ex
     export PYTHONPATH=./python/
diff --git a/docs/install/windows_setup.md b/docs/install/windows_setup.md
index 09a39e2c469..07027ad7457 100755
--- a/docs/install/windows_setup.md
+++ b/docs/install/windows_setup.md
@@ -34,9 +34,9 @@ To build and install MXNet yourself, you need the following dependencies. Instal
 
 After you have installed all of the required dependencies, build the MXNet source code:
 
-1. Download the MXNet source code from [GitHub](https://github.com/dmlc/mxnet). Don't forget to pull the submodules:
+1. Download the MXNet source code from [GitHub](https://github.com/apache/incubator-mxnet). Don't forget to pull the submodules:
 ```
-    git clone https://github.com/apache/incubator-mxnet.git ~/mxnet --recursive
+    git clone https://github.com/apache/incubator-mxnet.git --recursive
 ```
 2. Start a Visual Studio command prompt.
 3. Use [CMake](https://cmake.org/) to create a Visual Studio solution in ```./build``` or some other directory. Make sure to specify the architecture in the 
diff --git a/src/common/exec_utils.h b/src/common/exec_utils.h
index 29537d3dd82..b07f7d86dc5 100644
--- a/src/common/exec_utils.h
+++ b/src/common/exec_utils.h
@@ -76,8 +76,8 @@ inline bool SetupDefaultBlobsIn(const std::vector<NDArray>& src,
 }
 
 inline bool SetupDefaultBlobsOut(const std::vector<NDArray>& src,
-                                 const std::vector<OpReqType> &req,
                                  const std::vector<NDArray> *bufs,
+                                 std::vector<OpReqType> *req,
                                  std::vector<TBlob> *blobs,
                                  std::vector<NDArray> *temp_src,
                                  std::vector<NDArray> *temp_dst) {
@@ -86,9 +86,12 @@ inline bool SetupDefaultBlobsOut(const std::vector<NDArray>& src,
     auto& nd = src[i];
     bool is_default = nd.storage_type() == kDefaultStorage;
 #if MXNET_USE_MKLDNN == 1
-    // If it's writeTo, we don't need to worry whether it contains valid data.
-    if (req[i] == kWriteTo && is_default)
-      const_cast<NDArray &>(nd).InvalidateMKLDNNData();
+    if (req->at(i) == kWriteInplace && nd.IsMKLDNNData())
+      // If it's write inplace and the output array doesn't use the default
+      // layout, we'll generate a temporary output array below, which means
+      // the input array and the output array are no longer the same array.
+      // we should change the request type.
+      req->at(i) = kWriteTo;
     // We have to make sure it's default storage and default layout.
     is_default = nd.IsDefaultData();
 #endif
@@ -118,9 +121,9 @@ inline bool SetupDefaultBlobsOut(const std::vector<NDArray>& src,
  */
 inline void SetupDefaultBlobsInOut(const std::vector<NDArray> &ndinputs,
                                    const std::vector<NDArray> &ndoutputs,
-                                   const std::vector<OpReqType> &req,
                                    const std::vector<NDArray> *in_bufs,
                                    const std::vector<NDArray> *out_bufs,
+                                   std::vector<OpReqType> *req,
                                    std::vector<TBlob> *input_blobs,
                                    std::vector<TBlob> *output_blobs,
                                    std::vector<NDArray> *pre_temp_src,
@@ -133,7 +136,7 @@ inline void SetupDefaultBlobsInOut(const std::vector<NDArray> &ndinputs,
   SetupDefaultBlobsIn(ndinputs, in_bufs, input_blobs, pre_temp_src, pre_temp_dst,
                       in_temp_idx_map);
   // populate output blobs
-  SetupDefaultBlobsOut(ndoutputs, req, out_bufs, output_blobs, post_temp_dst,
+  SetupDefaultBlobsOut(ndoutputs, out_bufs, req, output_blobs, post_temp_dst,
                        post_temp_src);
   // add mutable inputs to post temp list
   for (const auto idx : mutate_idx) {
diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc
index e4d49554620..17099653d70 100644
--- a/src/executor/attach_op_execs_pass.cc
+++ b/src/executor/attach_op_execs_pass.cc
@@ -78,7 +78,8 @@ class StorageFallbackOpExecutor : public OpExecutor {
     pre_temp_src_.clear(); pre_temp_dst_.clear();
     post_temp_src_.clear(); post_temp_dst_.clear();
     in_temp_idx_map_.clear();
-    SetupDefaultBlobsInOut(in_array, out_array, req, &pre_temp_buf_, &post_temp_buf_,
+    tmp_req = req;
+    SetupDefaultBlobsInOut(in_array, out_array, &pre_temp_buf_, &post_temp_buf_, &req,
                            &in_data_, &out_data_,
                            &pre_temp_src_, &pre_temp_dst_,
                            &post_temp_src_, &post_temp_dst_,
@@ -89,8 +90,12 @@ class StorageFallbackOpExecutor : public OpExecutor {
   // storage fallback after fcompute is completed
   void PostFCompute(bool is_gpu) {
     common::CastNonDefaultStorage(post_temp_src_, post_temp_dst_, op_ctx, is_gpu);
+    req = tmp_req;
   }
 
+  // output requirement on each output array.
+  // This temporarily saves the original output requirements.
+  std::vector<OpReqType> tmp_req;
   // default storage tensor blobs for fcompute
   std::vector<TBlob> in_data_, out_data_;
   // These are NDArray buffers for cast storage.
@@ -113,6 +118,9 @@ class StatefulComputeExecutor : public StorageFallbackOpExecutor {
  public:
   void Run(RunContext rctx, bool is_gpu) override {
     op_ctx.run_ctx = rctx;
+#if MXNET_USE_MKLDNN == 1
+    InvalidateOutputs(out_array, req);
+#endif
     PreFCompute(is_gpu);
     fcompute_(state_, op_ctx, in_data_, req, out_data_);
     PostFCompute(is_gpu);
@@ -146,6 +154,9 @@ class StatefulComputeExExecutor : public OpExecutor {
  public:
   void Run(RunContext rctx, bool is_gpu) override {
     op_ctx.run_ctx = rctx;
+#if MXNET_USE_MKLDNN == 1
+    InvalidateOutputs(out_array, req);
+#endif
     fcompute_(state_, op_ctx, in_array, req, out_array);
   }
 
@@ -178,6 +189,9 @@ class FComputeExecutor : public StorageFallbackOpExecutor {
   void Run(RunContext rctx, bool is_gpu) override {
     using namespace common;
     op_ctx.run_ctx = rctx;
+#if MXNET_USE_MKLDNN == 1
+    InvalidateOutputs(out_array, req);
+#endif
     PreFCompute(is_gpu);
     fcompute_(attrs_, op_ctx, in_data_, req, out_data_);
     PostFCompute(is_gpu);
diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h
index 0d6525dce36..0956deb3a6d 100644
--- a/src/imperative/imperative_utils.h
+++ b/src/imperative/imperative_utils.h
@@ -29,6 +29,7 @@
 #include "../c_api/c_api_common.h"
 #include "../common/utils.h"
 #include "../common/exec_utils.h"
+#include "../operator/nn/mkldnn/mkldnn_base-inl.h"
 
 #ifndef MXNET_IMPERATIVE_IMPERATIVE_UTILS_H_
 #define MXNET_IMPERATIVE_IMPERATIVE_UTILS_H_
@@ -365,8 +366,12 @@ inline void PushFCompute(const FCompute& fn,
       std::vector<NDArray> pre_temp_src, pre_temp_dst, post_temp_dst, post_temp_src;
       // mapping from index in input_blobs to index in pre_temp_dst
       std::unordered_map<uint32_t, uint32_t> in_temp_idx_map;
+#if MXNET_USE_MKLDNN == 1
+      InvalidateOutputs(outputs, req);
+#endif
+      std::vector<OpReqType> tmp_req = req;
       // setup blobs
-      SetupDefaultBlobsInOut(inputs, outputs, req, nullptr, nullptr,
+      SetupDefaultBlobsInOut(inputs, outputs, nullptr, nullptr, &tmp_req,
                              &input_blobs, &output_blobs, &pre_temp_src, &pre_temp_dst,
                              &post_temp_src, &post_temp_dst, &in_temp_idx_map, mutate_idx);
       // setup context
@@ -374,7 +379,7 @@ inline void PushFCompute(const FCompute& fn,
       bool is_gpu = ctx.dev_mask() == gpu::kDevMask;
       // pre-fcompute fallback, cast to default storage type
       CastNonDefaultStorage(pre_temp_src, pre_temp_dst, opctx, is_gpu);
-      fn(attrs, opctx, input_blobs, req, output_blobs);
+      fn(attrs, opctx, input_blobs, tmp_req, output_blobs);
       // post-fcompute fallback, cast to original storage type
       CastNonDefaultStorage(post_temp_src, post_temp_dst, opctx, is_gpu);
       if (is_gpu) {
@@ -402,6 +407,9 @@ inline void PushFComputeEx(const FComputeEx& fn,
   DerefInputOutput(p_inputs, p_outputs, &inputs, &outputs);
   const auto& run = [=](RunContext rctx) {
       OpContext opctx{is_train, rctx, engine::CallbackOnComplete(), requested};
+#if MXNET_USE_MKLDNN == 1
+      InvalidateOutputs(outputs, req);
+#endif
       fn(attrs, opctx, inputs, req, outputs);
       if (ctx.dev_mask() == gpu::kDevMask && exec_type == ExecType::kSync) {
         rctx.get_stream<gpu>()->Wait();
@@ -445,6 +453,9 @@ inline void PushOperator(const OpStatePtr& state,
     const auto& run = [=](RunContext rctx,
                           engine::CallbackOnComplete on_complete) {
       OpContext opctx{is_train, rctx, on_complete, requested};
+#if MXNET_USE_MKLDNN == 1
+      InvalidateOutputs(outputs, req);
+#endif
       fcompute_ex(state, opctx, inputs, req, outputs);
       if (ctx.dev_mask() == gpu::kDevMask && exec_type == ExecType::kSync) {
         rctx.get_stream<gpu>()->Wait();
@@ -475,15 +486,19 @@ inline void PushOperator(const OpStatePtr& state,
         std::vector<NDArray> pre_temp_src, pre_temp_dst, post_temp_dst, post_temp_src;
         // mapping from index in input_blobs to index in pre_temp_dst
         std::unordered_map<uint32_t, uint32_t> in_temp_idx_map;
+#if MXNET_USE_MKLDNN == 1
+        InvalidateOutputs(outputs, req);
+#endif
+        std::vector<OpReqType> tmp_req = req;
         // populate input blobs and output blobs
-        SetupDefaultBlobsInOut(inputs, outputs, req, nullptr, nullptr,
+        SetupDefaultBlobsInOut(inputs, outputs, nullptr, nullptr, &tmp_req,
                                &input_blobs, &output_blobs, &pre_temp_src, &pre_temp_dst,
                                &post_temp_src, &post_temp_dst, &in_temp_idx_map, mutate_idx);
         // setup contexts
         bool is_gpu = rctx.get_ctx().dev_mask() == gpu::kDevMask;
         // pre-fcompute fallback
         CastNonDefaultStorage(pre_temp_src, pre_temp_dst, opctx, is_gpu);
-        fcompute(state, opctx, input_blobs, req, output_blobs);
+        fcompute(state, opctx, input_blobs, tmp_req, output_blobs);
         // post-fcompute fallback, cast to original storage type, if necessary
         CastNonDefaultStorage(post_temp_src, post_temp_dst, opctx, is_gpu);
         if (is_gpu && exec_type == ExecType::kSync) {
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index d175a13632a..fc01c75b3ff 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -200,6 +200,7 @@ NDArray NDArray::MKLDNNDataReshape(const TShape &shape) const {
     ret.ptr_->delay_alloc = false;
     ret.ptr_->static_data = true;
     ret.byte_offset_ = byte_offset_;
+    ret.reuse_ = false;
     return ret;
   }
 }
@@ -217,6 +218,7 @@ NDArray NDArray::Reshape(const TShape &shape) const {
   // Otherwise, reshape only works on the default layout.
   CHECK_EQ(storage_type(), kDefaultStorage);
   ret.shape_ = shape;
+  ret.reuse_ = false;
   return ret;
 }
 
@@ -249,6 +251,7 @@ NDArray NDArray::Slice(index_t begin, index_t end) const {
   MSHADOW_TYPE_SWITCH(ret.dtype(), DType, {
     ret.byte_offset_ += begin * length * sizeof(DType);
   });
+  ret.reuse_ = false;
   ret.shape_[0] = end - begin;
   return ret;
 }
@@ -485,8 +488,8 @@ const mkldnn::memory *NDArray::GetMKLDNNData(
 }
 
 const mkldnn::memory *NDArray::GetMKLDNNDataReorder(
-    const mkldnn::memory::primitive_desc &desc) const {
-  if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) {
+    const mkldnn::memory::primitive_desc &new_pd) const {
+  if (new_pd.get_size() != shape().Size() * GetTypeSize(dtype_)) {
     LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc";
     return nullptr;
   }
@@ -495,24 +498,41 @@ const mkldnn::memory *NDArray::GetMKLDNNDataReorder(
   const mkldnn::memory *mem = GetMKLDNNData();
   // If the memory descriptor matches, it's easy.
   MKLDNNStream *stream = MKLDNNStream::Get();
-  if (mem->get_primitive_desc() == desc) {
-    return GetMKLDNNExact(mem, desc);
+  if (mem->get_primitive_desc() == new_pd) {
+    return GetMKLDNNExact(mem, new_pd);
   }
 
-  mkldnn::memory::primitive_desc _desc = desc;
+  mkldnn::memory::primitive_desc _pd = new_pd;
+  mkldnn::memory::desc desc1 = mem->get_primitive_desc().desc();
+  mkldnn::memory::desc desc2 = _pd.desc();
   // Now we need to determine if we should reorder the memory.
   // If both use the default formats, we think we don't need to reorder.
-  mkldnn::memory::desc desc1 = mem->get_primitive_desc().desc();
-  mkldnn::memory::desc desc2 = _desc.desc();
   if (desc1.data.format == GetDefaultFormat(desc1) &&
       desc2.data.format == GetDefaultFormat(desc2)) {
-    mkldnn_mem_ptr ret(new mkldnn::memory(desc, mem->get_data_handle()));
+    mkldnn_mem_ptr ret(new mkldnn::memory(new_pd, mem->get_data_handle()));
     stream->RegisterMem(ret);
     return ret.get();
-  } else {
-    mkldnn::memory *ret = TmpMemMgr::Get()->Alloc(desc);
+  } else if (same_shape(desc1, desc2)) {
+    // If they have the same shape, we can reorder data directly.
+    mkldnn::memory *ret = TmpMemMgr::Get()->Alloc(new_pd);
     stream->RegisterPrim(mkldnn::reorder(*mem, *ret));
     return ret;
+  } else {
+    // If they have different shapes, we need to reshape the array first.
+    // Since this method will only be used inside an operator, we can call
+    // MKLDNNDataReshape to reshape an array.
+    TShape required_shape(desc2.data.ndims);
+    for (int i = 0; i < desc2.data.ndims; i++)
+      required_shape[i] = desc2.data.dims[i];
+    NDArray reshaped = MKLDNNDataReshape(required_shape);
+    const mkldnn::memory *ret = reshaped.GetMKLDNNData();
+    if (ret->get_primitive_desc() == new_pd) {
+      return GetMKLDNNExact(ret, new_pd);
+    } else {
+      mkldnn::memory *ret2 = TmpMemMgr::Get()->Alloc(new_pd);
+      stream->RegisterPrim(mkldnn::reorder(*ret, *ret2));
+      return ret2;
+    }
   }
 }
 
@@ -525,11 +545,19 @@ NDArray NDArray::Reorder2Default() const {
   if (format == ptr_->mkl_mem_->GetFormat())
     return *this;
 
-  NDArray ret(shape(), ctx(), false, dtype());
+  // create a new ndarray from the mkldnn layout
+  mkldnn::memory::desc from_desc = ptr_->mkl_mem_->GetPrimitiveDesc().desc();
+  TShape tshape(from_desc.data.ndims);
+  for (int i = 0; i < from_desc.data.ndims; i++) tshape[i] = from_desc.data.dims[i];
+  NDArray ret(tshape, ctx(), false, dtype());
   mkldnn::memory::primitive_desc def_pd = ptr_->mkl_mem_->GetPrimitiveDesc(format);
   CHECK(ret.ptr_->shandle.size >= def_pd.get_size());
   mkldnn::memory def_mem(def_pd, ret.ptr_->shandle.dptr);
   ptr_->mkl_mem_->ReorderTo(&def_mem);
+  // reshape as needed
+  ret.shape_ = shape_;
+  ret.byte_offset_ = byte_offset_;
+  ret.reuse_ = false;
   return ret;
 }
 
@@ -559,34 +587,39 @@ void NDArray::MKLDNNDataReorderAsync(const mkldnn::memory::primitive_desc &desc)
 
 const mkldnn::memory *NDArray::GetMKLDNNData() const {
   CHECK(storage_type() == kDefaultStorage);
-  // If this array uses MKLDNN layout, we have to make sure it's not a view.
-  // Otherwise, we'll have to change the layout inside the array.
-  if (IsMKLDNNData())
-    CHECK(!IsView());
-  ptr_->SetMKLMem(IsView() ? ptr_->storage_shape : shape_, dtype_);
-  MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_->GetMem());
-  if (IsView()) {
-    mkldnn::memory::primitive_desc pd = ptr_->mkl_mem_->GetPrimitiveDesc();
-    // Sliced array must use the default layout.
-    CHECK_EQ(GetDefaultFormat(pd.desc()), pd.desc().data.format);
-    void *off_addr = static_cast<char *>(ptr_->mkl_mem_->GetDataHandle())
-        + byte_offset_;
-
+  bool is_view = IsView();
+  if (IsMKLDNNData()) {
+    // If this array uses MKLDNN layout, we have to make sure it's not a view.
+    // Otherwise, we'll have to change the layout inside the array.
+    CHECK(!is_view);
+    MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_->GetMem());
+    // If this array uses MKLDNN format, we should return now. Otherwise,
+    // SetMKLMem may mess up mkl_mem_.
+    return ptr_->mkl_mem_->GetRaw();
+  } else if (is_view) {
+    // If this is a view, we can't create a MKLDNN memory for the chunk
+    // because we don't have the complete data type and shape information for
+    // the chunk.
+    void *off_addr = static_cast<char *>(ptr_->shandle.dptr) + byte_offset_;
     // Create the primitive desc for the new mkldnn memory.
     mkldnn::memory::dims dims(shape().ndim());
     for (size_t i = 0; i < dims.size(); i++)
       dims[i] = shape()[i];
     mkldnn::memory::format cpp_format = static_cast<mkldnn::memory::format>(
         GetDefaultFormat(shape().ndim()));
-    mkldnn::memory::data_type cpp_type = static_cast<mkldnn::memory::data_type>(
-        pd.desc().data.data_type);
+    mkldnn::memory::data_type cpp_type = get_mkldnn_type(dtype_);
     mkldnn::memory::desc data_md(dims, cpp_type, cpp_format);
-    mkldnn::memory::primitive_desc new_pd(data_md, pd.get_engine());
+    mkldnn::memory::primitive_desc new_pd(data_md,
+                                          CpuEngine::Get()->get_engine());
 
     std::shared_ptr<mkldnn::memory> ret(new mkldnn::memory(new_pd, off_addr));
     MKLDNNStream::Get()->RegisterMem(ret);
     return ret.get();
   } else {
+    // If this isn't a view, we can create a MKLDNN memory and store it in the
+    // chunk.
+    ptr_->SetMKLMem(shape_, dtype_);
+    MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_->GetMem());
     return ptr_->mkl_mem_->GetRaw();
   }
 }
@@ -601,20 +634,23 @@ void NDArray::CopyFrom(const mkldnn::memory &mem) {
   MKLDNNStream *stream = MKLDNNStream::Get();
   // If this array uses MKLDNN layout, we have to make sure it's not a view.
   // Otherwise, we'll have to change the layout inside the array.
-  if (IsMKLDNNData())
-    CHECK(!IsView());
-  ptr_->SetMKLMem(IsView() ? ptr_->storage_shape : shape_,
-                  dtype_);
-  stream->RegisterMem(ptr_->mkl_mem_->GetMem());
-  mkldnn::memory::desc from_desc = mem.get_primitive_desc().desc();
-  mkldnn::memory::desc this_desc = ptr_->mkl_mem_->GetPrimitiveDesc().desc();
+
+  if (IsMKLDNNData() && IsView())
+    ptr_->Reorder2Default();
+
+  const mkldnn::memory *this_mem = GetMKLDNNData();
+  mkldnn::memory::primitive_desc from_pd = mem.get_primitive_desc();
+  mkldnn::memory::desc from_desc = from_pd.desc();
+  mkldnn::memory::primitive_desc this_pd = this_mem->get_primitive_desc();
+  mkldnn::memory::desc this_desc = this_pd.desc();
   mkldnn_memory_format_t from_def_format = GetDefaultFormat(from_desc);
+  mkldnn_memory_format_t this_def_format = GetDefaultFormat(this_desc);
   if (IsView()) {
     // Sliced array must use the default layout.
     CHECK_EQ(GetDefaultFormat(this_desc), this_desc.data.format);
   }
   // It's possible that the memory and the NDArray don't have the same shape.
-  if (!same_shape(shape_, from_desc.data.dims, from_desc.data.ndims)
+  if (!same_shape(this_desc, from_desc)
       // If the source memory uses the default layout, we can reshape directly.
       && from_def_format == from_desc.data.format) {
     // In this case, we can simply create a new MKLDNN memory for the required
@@ -624,15 +660,14 @@ void NDArray::CopyFrom(const mkldnn::memory &mem) {
     auto this_dtype = static_cast<mkldnn::memory::data_type>(this_desc.data.data_type);
     auto this_format = static_cast<mkldnn::memory::format>(GetDefaultFormat(this_desc));
     mkldnn::memory::desc data_md(dims, this_dtype, this_format);
-    mkldnn::memory::primitive_desc pd(data_md, mem.get_primitive_desc().get_engine());
+    mkldnn::memory::primitive_desc pd(data_md, from_pd.get_engine());
     mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, mem.get_data_handle()));
     stream->RegisterMem(tmp_mem);
-    stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *ptr_->mkl_mem_->GetRaw()));
-  } else if (!same_shape(shape_, from_desc.data.dims, from_desc.data.ndims)) {
+    stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *this_mem));
+  } else if (!same_shape(this_desc, from_desc)) {
     // In this case, the source memory stores data in a customized layout. We
     // need to reorganize the data in memory before we can reshape.
-    mkldnn::memory::primitive_desc def_pd = GetPrimitiveDesc(mem.get_primitive_desc(),
-                                                             from_def_format);
+    mkldnn::memory::primitive_desc def_pd = GetPrimitiveDesc(from_pd, from_def_format);
     mkldnn::memory *def_mem = TmpMemMgr::Get()->Alloc(def_pd);
     stream->RegisterPrim(mkldnn::reorder(mem, *def_mem));
     // Now we can reshape it
@@ -641,45 +676,40 @@ void NDArray::CopyFrom(const mkldnn::memory &mem) {
     auto this_dtype = static_cast<mkldnn::memory::data_type>(this_desc.data.data_type);
     auto this_format = static_cast<mkldnn::memory::format>(GetDefaultFormat(this_desc));
     mkldnn::memory::desc data_md(dims, this_dtype, this_format);
-    mkldnn::memory::primitive_desc pd(data_md, mem.get_primitive_desc().get_engine());
+    mkldnn::memory::primitive_desc pd(data_md, from_pd.get_engine());
     mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, def_mem->get_data_handle()));
     stream->RegisterMem(tmp_mem);
-    stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *ptr_->mkl_mem_->GetRaw()));
-  } else if (mem.get_primitive_desc() == ptr_->mkl_mem_->GetPrimitiveDesc()) {
+    stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *this_mem));
+  } else if (from_pd == this_pd) {
     // If the layout is the same, we can just copy data.
-    stream->RegisterPrim(mkldnn::reorder(mem, *ptr_->mkl_mem_->GetRaw()));
+    stream->RegisterPrim(mkldnn::reorder(mem, *this_mem));
   } else {
-    mkldnn_memory_format_t src_def = GetDefaultFormat(mem.get_primitive_desc().desc());
-    mkldnn_memory_format_t dst_def = ptr_->mkl_mem_->GetDefaultFormat();
     // If both are not using the default layouts. There isn't much we can do,
     // other than reorder data layout directly.
-    if (dst_def != ptr_->mkl_mem_->GetFormat()
-        && src_def != mem.get_primitive_desc().desc().data.format) {
-      stream->RegisterPrim(mkldnn::reorder(mem, *ptr_->mkl_mem_->GetRaw()));
-    } else if (dst_def == ptr_->mkl_mem_->GetFormat()) {
+    if (this_def_format != this_desc.data.format
+        && from_def_format != from_desc.data.format) {
+      stream->RegisterPrim(mkldnn::reorder(mem, *this_mem));
+    } else if (this_def_format == this_desc.data.format) {
       // If the dest mem uses the default memory layout, we can simply use
       // the default format of the source memory to improve perf of reorder.
-      mkldnn::memory::primitive_desc pd = ptr_->mkl_mem_->GetPrimitiveDesc(src_def);
-      mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, ptr_->mkl_mem_->GetDataHandle()));
+      mkldnn::memory::primitive_desc pd = GetPrimitiveDesc(from_pd,
+                                                           from_def_format);
+      mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, this_mem->get_data_handle()));
       stream->RegisterMem(tmp_mem);
       stream->RegisterPrim(mkldnn::reorder(mem, *tmp_mem));
     } else {
       // If the src mem uses the default memory layout, we can use
       // the default format of the source memory to improve perf.
-      mkldnn::memory::primitive_desc pd = GetPrimitiveDesc(mem.get_primitive_desc(), dst_def);
+      mkldnn::memory::primitive_desc pd = GetPrimitiveDesc(this_pd,
+                                                           this_def_format);
       mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, mem.get_data_handle()));
       stream->RegisterMem(tmp_mem);
-      stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *ptr_->mkl_mem_->GetRaw()));
+      stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *this_mem));
     }
   }
 }
-mkldnn::memory::primitive_desc GetPrimitiveDesc(mkldnn::memory::primitive_desc pd,
-                                                mkldnn_memory_format_t format);
 
 mkldnn::memory *NDArray::CreateMKLDNNData(const mkldnn::memory::primitive_desc &desc) {
-  // This array shouldn't be a view.
-  CHECK(!IsView());
-
   if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) {
     LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc";
     return nullptr;
@@ -690,10 +720,26 @@ mkldnn::memory *NDArray::CreateMKLDNNData(const mkldnn::memory::primitive_desc &
   mkldnn_memory_format_t def_format = GetDefaultFormat(_desc.desc());
   // If the required format is a default format, we don't need to worry about the shape.
   // If the shape isn't the same, it actually implicitly reshapes data.
-  if (required_format == def_format) {
+  if (required_format == def_format && !IsView()) {
     ptr_->SetMKLMem(shape_, dtype_);
     MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_->GetMem());
     return GetMKLDNNExact(ptr_->mkl_mem_->GetRaw(), desc);
+  } else if (required_format == def_format) {
+    ptr_->CheckAndAlloc();
+    CHECK(ptr_->shandle.dptr);
+    // When this is a view and a user wants the default layout, we can simply
+    // create a new mkldnn memory that points to the right memory.
+    std::shared_ptr<mkldnn::memory> mem(new mkldnn::memory(
+            desc, static_cast<char *>(ptr_->shandle.dptr) + byte_offset_));
+    MKLDNNStream::Get()->RegisterMem(mem);
+    return mem.get();
+  } else if (IsView()) {
+    // If this is a view and a user wants to write data to it with a special
+    // MKLDNN format, we should reorder the data in the array and return NULL.
+    // In this way, the user will create a new NDArray for the special format
+    // and copy data back.
+    ptr_->Reorder2Default();
+    return nullptr;
   }
 
   if (ptr_->mkl_mem_)
@@ -1082,9 +1128,8 @@ inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext
                              to_mem->get_primitive_desc().get_size());
       memcpy(to_mem->get_data_handle(), from_mem->get_data_handle(), size);
     } else {
-      std::vector<mkldnn::primitive> net;
-      net.push_back(mkldnn::reorder(*from_mem, *to_mem));
-      mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
+      const_cast<NDArray &>(to).CopyFrom(*from_mem);
+      MKLDNNStream::Get()->Submit();
     }
   } else {
     // In this case, one of the NDArray isn't supported by MKLDNN, we need
diff --git a/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h b/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h
index 239da023668..0845eb79fd6 100644
--- a/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h
+++ b/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h
@@ -48,7 +48,7 @@ class CuDNNSoftmaxActivationOp {
   }
 
   void Forward(const OpContext &ctx, const TBlob &in_data,
-      const OpReqType &req, const TBlob &out_data) {
+               const OpReqType &req, const TBlob &out_data) {
     using namespace mshadow;
     using namespace mshadow::expr;
     Stream<gpu> *s = ctx.get_stream<gpu>();
@@ -102,14 +102,14 @@ class CuDNNSoftmaxActivationOp {
   }
 
   void Backward(const OpContext &ctx, const TBlob &out_grad,
-      const TBlob &out_data, const OpReqType &req, const TBlob &in_grad) {
+                const TBlob &out_data, const OpReqType &req,
+                const TBlob &in_grad) {
     using namespace mshadow;
     using namespace mshadow::expr;
     float alpha = 1.0f;
     float beta = 0.0f;
     Stream<gpu> *s = ctx.get_stream<gpu>();
     Tensor<gpu, 4> grad;
-    Tensor<gpu, 4> data;
     Tensor<gpu, 4> output_data;
     Tensor<gpu, 4> input_grad;
     cudnnSoftmaxMode_t softmax_mode;
@@ -141,6 +141,13 @@ class CuDNNSoftmaxActivationOp {
       softmax_mode = CUDNN_SOFTMAX_MODE_CHANNEL;
     }
     CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
+    CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_,
+                                          CUDNN_TENSOR_NCHW,
+                                          dtype_,
+                                          input_grad.shape_[0],
+                                          input_grad.shape_[1],
+                                          input_grad.shape_[2],
+                                          input_grad.shape_[3]));
     CUDNN_CALL(cudnnSoftmaxBackward(s->dnn_handle_,
                                     CUDNN_SOFTMAX_ACCURATE,
                                     softmax_mode,
diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h
index 489351ebe2c..48a029817d1 100644
--- a/src/operator/nn/mkldnn/mkldnn_base-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h
@@ -67,7 +67,8 @@ class CpuEngine {
  public:
   static CpuEngine *Get() {
    // It's thread-safe in C++11.
-    static thread_local CpuEngine myInstance;
+    // ensure same mkldnn engine is used across threads
+    static CpuEngine myInstance;
     return &myInstance;
   }
   CpuEngine(CpuEngine const &) = delete;             // Copy construct
@@ -272,12 +273,11 @@ class MKLDNNStream {
   std::vector<std::shared_ptr<const mkldnn::memory> > mem_holder;
 
  public:
-  static MKLDNNStream *Get() {
-    static thread_local MKLDNNStream stream;
-    return &stream;
-  }
+  static MKLDNNStream *Get();
 
-  void RegisterPrim(const mkldnn::primitive &prim) { net.push_back(prim); }
+  void RegisterPrim(const mkldnn::primitive &prim) {
+    net.push_back(prim);
+  }
 
   void RegisterMem(std::shared_ptr<const mkldnn::memory> mem) {
     mem_holder.push_back(mem);
@@ -287,10 +287,21 @@ class MKLDNNStream {
     return !net.empty();
   }
 
-  void Submit() {
-    if (!net.empty())
+  /*
+   * After submitting mkldnn operations for execution, we need to
+   * clean up memory held by the stream. However, sometimes users
+   * might want to separate mkldnn execution and memory cleanup.
+   */
+  void Submit(bool cleanup = true) {
+    if (!net.empty()) {
       mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
-    net.clear();
+      net.clear();
+    }
+    if (cleanup)
+      Cleanup();
+  }
+
+  void Cleanup() {
     mem_holder.clear();
     TmpMemMgr::Get()->Reset();
   }
@@ -348,6 +359,16 @@ inline bool same_shape(const TShape &shape, const mkldnn_dims_t dims, int ndims)
   return true;
 }
 
+inline bool same_shape(const mkldnn::memory::desc &desc1,
+                       const mkldnn::memory::desc &desc2) {
+  if (desc1.data.ndims != desc2.data.ndims)
+    return false;
+  for (int i = 0; i < desc1.data.ndims; i++)
+    if (desc1.data.dims[i] != desc2.data.dims[i])
+      return false;
+  return true;
+}
+
 inline bool same_shape(const TShape &shape, int dtype,
                        const mkldnn::memory::desc &desc) {
   return same_shape(shape, desc.data.dims, desc.data.ndims)
diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/mkldnn/mkldnn_base.cc
index 684abd24685..9fa93a11b89 100644
--- a/src/operator/nn/mkldnn/mkldnn_base.cc
+++ b/src/operator/nn/mkldnn/mkldnn_base.cc
@@ -25,6 +25,11 @@
 
 namespace mxnet {
 
+MKLDNNStream *MKLDNNStream::Get() {
+  static thread_local MKLDNNStream stream;
+  return &stream;
+}
+
 void *AlignMem(void *mem, size_t size, size_t alignment, size_t *space) {
   if (size > *space)
     return nullptr;
@@ -57,8 +62,11 @@ mkldnn::memory *TmpMemMgr::Alloc(const mkldnn::memory::primitive_desc &pd) {
     this->curr_mem = static_cast<char *>(mem) + pd.get_size();
     return ret.get();
   } else {
-    LOG(WARNING) << "Allocate " << pd.get_size()
-        << " bytes with malloc directly";
+    // If curr_mem has been initialized and we still reach here, it means
+    // the currently allocated memory isn't enough.
+    if (this->curr_mem)
+      LOG(WARNING) << "Allocate " << pd.get_size()
+          << " bytes with malloc directly";
     mkldnn_mem_ptr ret(new mkldnn::memory(pd));
     MKLDNNStream::Get()->RegisterMem(ret);
     return ret.get();
@@ -282,10 +290,7 @@ void FallBackCompute(FCompute fn, const nnvm::NodeAttrs &attrs,
     } else {
       if (in_bufs.empty())
         in_bufs.reserve(inputs.size());
-      in_bufs.emplace_back(inputs[i].shape(), inputs[i].ctx(),
-                           false, inputs[i].dtype());
-      const mkldnn::memory *mem = inputs[i].GetMKLDNNData();
-      in_bufs.back().CopyFrom(*mem);
+      in_bufs.push_back(inputs[i].Reorder2Default());
       in_blobs[i] = in_bufs.back().data();
     }
   }
@@ -293,10 +298,15 @@ void FallBackCompute(FCompute fn, const nnvm::NodeAttrs &attrs,
 
   std::vector<TBlob> out_blobs(outputs.size());
   for (size_t i = 0; i < out_blobs.size(); i++) {
-    if (req[i] == kWriteTo)
-      const_cast<NDArray &>(outputs[i]).InvalidateMKLDNNData();
-    CHECK(outputs[i].IsDefaultData());
-    out_blobs[i] = outputs[i].data();
+    NDArray output = outputs[i];
+    // ensure output does not use mkldnn mem.
+    // for inplace, we already converted & copied input above.
+    if ((req[i] == kWriteTo) || (req[i] == kWriteInplace))
+      const_cast<NDArray &>(output).InvalidateMKLDNNData();
+    else if (req[i] == kAddTo)
+      output = outputs[i].Reorder2Default();
+    CHECK(output.IsDefaultData());
+    out_blobs[i] = output.data();
   }
   fn(attrs, ctx, in_blobs, req, out_blobs);
 }
diff --git a/src/operator/nn/mkldnn/mkldnn_copy.cc b/src/operator/nn/mkldnn/mkldnn_copy.cc
index 71d540c969c..9596739016a 100644
--- a/src/operator/nn/mkldnn/mkldnn_copy.cc
+++ b/src/operator/nn/mkldnn/mkldnn_copy.cc
@@ -35,7 +35,13 @@ void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
                 const NDArray &in_data, const OpReqType &req,
                 const NDArray &out_data) {
   TmpMemMgr::Get()->Init(ctx.requested[0]);
-  auto in_mem = in_data.GetMKLDNNData();
+
+  // If the input data is a view of an MKLDNN array, we should create a new
+  // NDArray with reordered data.
+  NDArray data = in_data;
+  if (data.IsMKLDNNData() && data.IsView())
+    data = data.Reorder2Default();
+  auto in_mem = data.GetMKLDNNData();
   if (req == kAddTo) {
     TmpMemMgr::Get()->Init(ctx.requested[0]);
     // We should try and force the output memory has the same format
diff --git a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h
index 2097d57ba92..4b6235ec446 100644
--- a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h
@@ -92,12 +92,18 @@ inline bool SupportMKLDNNPooling(const PoolingParam &param,
 
   if (param.pooling_convention == pool_enum::kValid)
     return true;
+  else
+    return false;
 
+// need to support pooling convention full
+// https://issues.apache.org/jira/browse/MXNET-33
+#if 0
   if (((dshape[2] + 2 * param.pad[0] - param.kernel[0]) % param.stride[0] == 0) &&
       ((dshape[3] + 2 * param.pad[1] - param.kernel[1]) % param.stride[1] == 0))
     return true;
   else
     return false;
+#endif
 }
 
 inline bool MKLDNNRequireWorkspace(const PoolingParam &param) {
diff --git a/src/operator/nn/mkldnn/mkldnn_sum.cc b/src/operator/nn/mkldnn/mkldnn_sum.cc
index ccad068e423..e8fec502f79 100644
--- a/src/operator/nn/mkldnn/mkldnn_sum.cc
+++ b/src/operator/nn/mkldnn/mkldnn_sum.cc
@@ -59,8 +59,15 @@ void MKLDNNSumForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
   std::vector<float> scales(inputs.size(), 1);
   in_prims.reserve(inputs.size());
   bool pd_same = true;
+  std::vector<NDArray> in_bufs(inputs.size());
   for (size_t i = 0; i < inputs.size(); i++) {
-    auto in_mem = inputs[i].GetMKLDNNData();
+    const mkldnn::memory *in_mem;
+    if (inputs[i].IsMKLDNNData() && inputs[i].IsView()) {
+      in_bufs[i] = inputs[i].Reorder2Default();
+      in_mem = in_bufs[i].GetMKLDNNData();
+    } else {
+      in_mem = inputs[i].GetMKLDNNData();
+    }
     in_prims.push_back(*in_mem);
     in_pds[i] = in_mem->get_primitive_desc();
   }
@@ -68,9 +75,16 @@ void MKLDNNSumForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
   mkldnn::sum::primitive_desc pdesc(scales, in_pds);
   pd_same = pd_same && (pdesc.dst_primitive_desc() == in_pds[0]);
   auto out_mem = const_cast<NDArray&>(out_data).CreateMKLDNNData(pdesc.dst_primitive_desc());
-  bool addr_same = out_mem->get_data_handle() == inputs[0].GetMKLDNNData()->get_data_handle();
-  if ((req == kWriteTo) ||
-      (req == kWriteInplace && pd_same && addr_same)) {
+  bool addr_same = false;
+  const void *first_data_handle;
+  if (in_bufs[0].is_none())
+    first_data_handle = inputs[0].GetMKLDNNData()->get_data_handle();
+  else
+    first_data_handle = in_bufs[0].GetMKLDNNData()->get_data_handle();
+  if (out_mem)
+    addr_same = out_mem->get_data_handle() == first_data_handle;
+  if (((req == kWriteTo) || (req == kWriteInplace && pd_same && addr_same))
+      && out_mem) {
     // do sum computation directly on output NDArray
     MKLDNNStream *stream = MKLDNNStream::Get();
     stream->RegisterPrim(mkldnn::sum(pdesc, in_prims, *out_mem));
diff --git a/src/operator/tensor/elemwise_binary_op_basic.cc b/src/operator/tensor/elemwise_binary_op_basic.cc
index d73edc72352..00469b03f12 100644
--- a/src/operator/tensor/elemwise_binary_op_basic.cc
+++ b/src/operator/tensor/elemwise_binary_op_basic.cc
@@ -43,16 +43,8 @@ static void ElemwiseAddEx(const nnvm::NodeAttrs& attrs,
     return;
   } else if (inputs[0].storage_type() == kDefaultStorage
              && inputs[1].storage_type() == kDefaultStorage) {
-    // This happens if inputs are supposed to be in MKLDNN format
-    // but MKLDNN doesn't support the data type or the shape. We're
-    // forced to convert it to the default format.
-    std::vector<TBlob> in_blobs(2);
-    std::vector<TBlob> out_blobs(1);
-    in_blobs[0] = inputs[0].data();
-    in_blobs[1] = inputs[1].data();
-    out_blobs[0] = outputs[0].data();
-    ElemwiseBinaryOp::Compute<cpu, op::mshadow_op::plus>(attrs, ctx, in_blobs,
-                                                         req, out_blobs);
+    FallBackCompute(ElemwiseBinaryOp::Compute<cpu, op::mshadow_op::plus>,
+                    attrs, ctx, inputs, req, outputs);
     return;
   }
 #endif
diff --git a/tests/cpp/include/test_core_op.h b/tests/cpp/include/test_core_op.h
index 7dc05fda2cc..c39373b1b79 100644
--- a/tests/cpp/include/test_core_op.h
+++ b/tests/cpp/include/test_core_op.h
@@ -410,7 +410,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer<DType>
           if (bwd_node_ptr) {
             CHECK_EQ(bwd_node_ptr->inputs.size(), num_inputs);
             input_types.resize(bwd_node_ptr->inputs.size(), -1);
-            for (size_t i = 0; i < num_inputs; ++i) {
+            for (int i = 0; i < num_inputs; ++i) {
               const int map_key = bwd_node_ptr->inputs[i].index;
               CHECK(index2array.find(map_key) != index2array.end());
               const int dtype = index2array[map_key]->dtype();
@@ -421,7 +421,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer<DType>
               output_types.emplace_back(dtype);
             }
           } else {
-            for (size_t x = 0; x < num_inputs; ++x) {
+            for (int x = 0; x < num_inputs; ++x) {
               input_types.emplace_back(default_dtype());
             }
             for (const auto &fwd_inp : backward_for_op->inputs()) {
@@ -431,10 +431,10 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer<DType>
           }
         } else {
           CHECK(false);  // above always true?
-          for (size_t x = 0; x < num_inputs; ++x) {
+          for (int x = 0; x < num_inputs; ++x) {
             input_types.emplace_back(default_dtype());
           }
-          for (size_t x = 0; x < inferred_num_outputs; ++x) {
+          for (int x = 0; x < inferred_num_outputs; ++x) {
             output_types.emplace_back(default_dtype());
           }
         }
@@ -455,7 +455,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer<DType>
             if (bwd_node_ptr) {
               input_shapes.clear();
               CHECK_EQ(bwd_node_ptr->inputs.size(), num_inputs);
-              for (size_t i = 0; i < num_inputs; ++i) {
+              for (int i = 0; i < num_inputs; ++i) {
                 const int map_key = bwd_node_ptr->inputs[i].index;
                 CHECK(index2array.find(map_key) != index2array.end());
                 const nnvm::TShape &shp = index2array[map_key]->shape();
diff --git a/tests/cpp/operator/mkldnn.cc b/tests/cpp/operator/mkldnn.cc
index c3e03df195e..5db4256c2ec 100644
--- a/tests/cpp/operator/mkldnn.cc
+++ b/tests/cpp/operator/mkldnn.cc
@@ -26,8 +26,11 @@
 #if MXNET_USE_MKLDNN == 1
 
 #include "gtest/gtest.h"
+#include "mxnet/imperative.h"
 #include "../../src/operator/nn/mkldnn/mkldnn_base-inl.h"
 
+using namespace mxnet;
+
 #if __GNUC__ >= 5
 bool test_mem_align(void *mem, size_t size, size_t alignment, size_t space) {
   void *ret1, *ret2;
@@ -77,4 +80,566 @@ TEST(MKLDNN_UTIL_FUNC, AlignMem) {
   LOG(INFO) << "Skipped for GCC " << __GNUC__ << "." << __GNUC_MINOR__;
 #endif
 }
+
+// Init arrays with the default layout.
+static void InitArray(NDArray *arr, bool is_rand = false) {
+  const TBlob &blob = arr->data();
+  mshadow::default_real_t *data = blob.dptr<mshadow::default_real_t>();
+  size_t size = blob.Size();
+  if (is_rand) {
+    for (size_t i = 0; i < size; i++)
+      data[i] = std::rand();
+  } else {
+    for (size_t i = 0; i < size; i++)
+      data[i] = i;
+  }
+}
+
+// Init arrays with the specified layout.
+static void InitMKLDNNArray(NDArray *arr, const mkldnn::memory::primitive_desc &pd,
+                            bool is_rand = false) {
+  const TBlob &blob = arr->data();
+  mshadow::default_real_t *data = blob.dptr<mshadow::default_real_t>();
+  size_t size = blob.Size();
+  if (is_rand) {
+    for (size_t i = 0; i < size; i++)
+      data[i] = std::rand();
+  } else {
+    for (size_t i = 0; i < size; i++)
+      data[i] = i;
+  }
+  arr->MKLDNNDataReorderAsync(pd);
+  arr->WaitToRead();
+}
+
+static void VerifyDefMem(const mkldnn::memory &mem) {
+  mkldnn::memory::primitive_desc pd = mem.get_primitive_desc();
+  mshadow::default_real_t *data
+      = static_cast<mshadow::default_real_t *>(mem.get_data_handle());
+  size_t size = pd.get_size() / sizeof(mshadow::default_real_t);
+  size_t num_same = 0;
+  for (size_t i = 0; i < size; i++)
+    num_same += data[i] == static_cast<mshadow::default_real_t>(i);
+  EXPECT_EQ(num_same, size);
+}
+
+static void VerifyMem(const mkldnn::memory &mem) {
+  mkldnn::memory::primitive_desc pd = mem.get_primitive_desc();
+
+  if (pd.desc().data.format == GetDefaultFormat(pd.desc())) {
+    VerifyDefMem(mem);
+  } else {
+    mkldnn::memory::dims dims(pd.desc().data.ndims);
+    for (size_t i = 0; i < dims.size(); i++)
+      dims[i] = pd.desc().data.dims[i];
+    mkldnn::memory::desc desc{dims,
+                              static_cast<mkldnn::memory::data_type>(pd.desc().data.data_type),
+                              static_cast<mkldnn::memory::format>(GetDefaultFormat(pd.desc()))};
+    mkldnn::memory::primitive_desc new_pd(desc, CpuEngine::Get()->get_engine());
+    mkldnn::memory new_mem(new_pd);
+
+    std::vector<mkldnn::primitive> net;
+    net.push_back(mkldnn::reorder(mem, new_mem));
+    mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
+    VerifyDefMem(new_mem);
+  }
+}
+
+static mkldnn::memory::primitive_desc GetMemPD(const TShape s, int dtype,
+                                               mkldnn::memory::format format) {
+  mkldnn::memory::dims dims(s.ndim());
+  for (size_t i = 0; i < dims.size(); i++)
+    dims[i] = s[i];
+  mkldnn::memory::desc desc{dims, get_mkldnn_type(dtype), format};
+  return mkldnn::memory::primitive_desc(desc, CpuEngine::Get()->get_engine());
+}
+
+// This function gets special MKLDNN formats without knowing the specific
+// hardware configuration. It may miss some formats that are specific to
+// certain array shapes, but it covers at least one special format for each
+// of the base formats: nchw, oihw, goihw.
+// To test the logic of the code in NDArray, these formats should be enough.
+static std::vector<mkldnn::memory::format> GetMKLDNNFormat(size_t num_dims, int dtype) {
+  if (num_dims == 4) {
+    mkldnn::memory::dims data_dims{1, 3, 224, 224};
+    mkldnn::memory::desc data_md{data_dims, get_mkldnn_type(dtype),
+                                 mkldnn::memory::format::any};
+    mkldnn::memory::dims weight_dims{96, 3, 11, 11};
+    mkldnn::memory::desc weight_md{weight_dims, get_mkldnn_type(dtype),
+                                   mkldnn::memory::format::any};
+    mkldnn::memory::dims output_dims{1, 96, 54, 54};
+    mkldnn::memory::desc out_md{output_dims, get_mkldnn_type(dtype),
+                                mkldnn::memory::format::any};
+    mkldnn::memory::dims strides{4, 4};
+    mkldnn::memory::dims padding{0, 0};
+
+    mkldnn::convolution_forward::desc desc(mkldnn::prop_kind::forward_training,
+                                           mkldnn::algorithm::convolution_direct,
+                                           data_md, weight_md, out_md, strides,
+                                           padding, padding, mkldnn::padding_kind::zero);
+    mkldnn::convolution_forward::primitive_desc pd(desc, CpuEngine::Get()->get_engine());
+    std::vector<mkldnn::memory::format> ret(2);
+    ret[0] = static_cast<mkldnn::memory::format>(pd.dst_primitive_desc().desc().data.format);
+    ret[1] = static_cast<mkldnn::memory::format>(pd.weights_primitive_desc().desc().data.format);
+    printf("format: %d, %d\n", ret[0], ret[1]);
+    return ret;
+  } else if (num_dims == 5) {
+    mkldnn::memory::dims data_dims{1, 32, 112, 112};
+    mkldnn::memory::desc data_md{data_dims, get_mkldnn_type(dtype),
+                                 mkldnn::memory::format::any};
+    mkldnn::memory::dims weight_dims{32, 1, 1, 3, 3};
+    mkldnn::memory::desc weight_md{weight_dims, get_mkldnn_type(dtype),
+                                   mkldnn::memory::format::any};
+    mkldnn::memory::dims output_dims{1, 32, 112, 112};
+    mkldnn::memory::desc out_md{output_dims, get_mkldnn_type(dtype),
+                                mkldnn::memory::format::any};
+    mkldnn::memory::dims strides{1, 1};
+    mkldnn::memory::dims padding{1, 1};
+
+    mkldnn::convolution_forward::desc desc(mkldnn::prop_kind::forward_training,
+                                           mkldnn::algorithm::convolution_direct,
+                                           data_md, weight_md, out_md, strides,
+                                           padding, padding, mkldnn::padding_kind::zero);
+    mkldnn::convolution_forward::primitive_desc pd(desc, CpuEngine::Get()->get_engine());
+    std::vector<mkldnn::memory::format> ret(1);
+    ret[0] = static_cast<mkldnn::memory::format>(pd.weights_primitive_desc().desc().data.format);
+    printf("format: %d\n", ret[0]);
+    return ret;
+  } else {
+    return std::vector<mkldnn::memory::format>();
+  }
+}
+
+struct TestArrayShapes {
+  std::vector<nnvm::TShape> shapes;
+  std::vector<mkldnn::memory::primitive_desc> pds;
+};
+
+static TestArrayShapes GetTestArrayShapes() {
+  int dtype = mshadow::DataType<mshadow::default_real_t>::kFlag;
+  std::vector<TShape> shapes;
+  std::vector<mkldnn::memory::primitive_desc> pds;
+  {
+    // 1D
+    TShape s(1);
+    s[0] = 279936;
+    shapes.push_back(s);
+    pds.push_back(GetMemPD(s, dtype, mkldnn::memory::format::x));
+    s[0] = 34848;
+    shapes.push_back(s);
+    pds.push_back(GetMemPD(s, dtype, mkldnn::memory::format::x));
+  }
+  {
+    // 2D
+    TShape s(2);
+    s[0] = 96;
+    s[1] = 2916;
+    shapes.push_back(s);
+    pds.push_back(GetMemPD(s, dtype, mkldnn::memory::format::nc));
+    s[0] = 96;
+    s[1] = 363;
+    shapes.push_back(s);
+    pds.push_back(GetMemPD(s, dtype, mkldnn::memory::format::nc));
+  }
+  {
+    // 4D
+    TShape s1(4);
+    s1[0] = 10; s1[1] = 96; s1[2] = 54; s1[3] = 54;
+    shapes.push_back(s1);
+    pds.push_back(GetMemPD(s1, dtype, mkldnn::memory::format::nchw));
+
+    TShape s2(4);
+    s2[0] = 96; s2[1] = 3; s2[2] = 11; s2[3] = 11;
+    shapes.push_back(s2);
+    pds.push_back(GetMemPD(s2, dtype, mkldnn::memory::format::oihw));
+
+    std::vector<mkldnn::memory::format> formats = GetMKLDNNFormat(4, dtype);
+    pds.push_back(GetMemPD(s1, dtype, formats[0]));
+    pds.push_back(GetMemPD(s2, dtype, formats[1]));
+  }
+  {
+    // 5D
+    TShape s(5);
+    s[0] = 96; s[1] = 1; s[2] = 3; s[3] = 11; s[4] = 11;
+    shapes.push_back(s);
+    pds.push_back(GetMemPD(s, dtype, mkldnn::memory::format::goihw));
+
+    std::vector<mkldnn::memory::format> formats = GetMKLDNNFormat(5, dtype);
+    pds.push_back(GetMemPD(s, dtype, formats[0]));
+  }
+
+  TestArrayShapes ret;
+  ret.shapes = shapes;
+  ret.pds = pds;
+  return ret;
+}
+
+TEST(MKLDNN_NDArray, GetDataReorder) {
+  TestArrayShapes tas = GetTestArrayShapes();
+  std::vector<TShape> shapes = tas.shapes;
+  std::vector<mkldnn::memory::primitive_desc> pds = tas.pds;
+
+  // Reorder from the default to any other layout.
+  for (auto s : shapes) {
+    NDArray arr(s, Context());
+    InitArray(&arr);
+    for (auto pd : pds) {
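+      // Only reorder to layouts whose element count matches this shape.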
+      if (s.Size() == pd.get_size() / sizeof(mshadow::default_real_t)) {
+        const mkldnn::memory *mem = arr.GetMKLDNNDataReorder(pd);
+        printf("reorder from (");
+        for (size_t i = 0; i < s.ndim(); i++)
+          printf("%ld, ", s[i]);
+        printf(") to (");
+        for (int i = 0; i < pd.desc().data.ndims; i++)
+          printf("%d, ", pd.desc().data.dims[i]);
+        printf("), format: %d\n", pd.desc().data.format);
+        MKLDNNStream::Get()->Submit(false);
+        VerifyMem(*mem);
+        MKLDNNStream::Get()->Cleanup();
+      }
+    }
+  }
+
+  // Reorder from a special layout to another layout.
+  for (auto s : shapes) {
+    for (auto from_pd : pds) {
+      if (from_pd.get_size() / sizeof(mshadow::default_real_t) == s.Size()) {
+        NDArray arr(s, Context());
+        // The dimensions of the NDArray may not match those of the MKLDNN
+        // memory inside it.
+        printf("Init array (");
+        for (size_t i = 0; i < s.ndim(); i++)
+          printf("%ld, ", s[i]);
+        printf(") with MKLDNN memory (");
+        for (int i = 0; i < from_pd.desc().data.ndims; i++)
+          printf("%d, ", from_pd.desc().data.dims[i]);
+        printf("), format: %d\n", from_pd.desc().data.format);
+        InitMKLDNNArray(&arr, from_pd);
+        for (auto to_pd : pds) {
+          if (to_pd.get_size() / sizeof(mshadow::default_real_t) == s.Size()) {
+            const mkldnn::memory *mem = arr.GetMKLDNNDataReorder(to_pd);
+            printf("reorder from (");
+            for (size_t i = 0; i < s.ndim(); i++)
+              printf("%ld, ", s[i]);
+            printf("), format: %d to (",
+                   arr.GetMKLDNNData()->get_primitive_desc().desc().data.format);
+            for (int i = 0; i < to_pd.desc().data.ndims; i++)
+              printf("%d, ", to_pd.desc().data.dims[i]);
+            printf("), format: %d\n", to_pd.desc().data.format);
+            MKLDNNStream::Get()->Submit(false);
+            VerifyMem(*mem);
+            MKLDNNStream::Get()->Cleanup();
+          }
+        }
+      }
+    }
+  }
+}
+
+struct OpAttrs {
+  nnvm::NodeAttrs attrs;
+  std::vector<DispatchMode> dispatches;
+};
+
+OpAttrs GetCopyOp() {
+  OpAttrs attrs;
+  attrs.attrs.op = Op::Get("_copy");
+  attrs.dispatches.resize(2);
+  attrs.dispatches[0] = DispatchMode::kFCompute;
+  attrs.dispatches[1] = DispatchMode::kFComputeEx;
+  return attrs;
+}
+
+OpAttrs GetLeakyReluOp() {
+  OpAttrs attrs;
+  attrs.attrs.op = Op::Get("LeakyReLU");
+  attrs.dispatches.resize(1);
+  attrs.dispatches[0] = DispatchMode::kFCompute;
+  return attrs;
+}
+
+OpAttrs GetSumOp() {
+  OpAttrs attrs;
+  attrs.attrs.op = Op::Get("elemwise_add");
+  attrs.dispatches.resize(2);
+  attrs.dispatches[0] = DispatchMode::kFCompute;
+  attrs.dispatches[1] = DispatchMode::kFComputeEx;
+  return attrs;
+}
+
+/*
+ * We want to get a few types of NDArrays for testing:
+ * 1. Normal NDArray
+ * 2. Normal NDArray with MKLDNN layout (output from an MKLDNN operator)
+ * 3. Normal NDArray with MKLDNN layout whose MKLDNN memory may have different
+ *    dimensions from the NDArray (result of MKLDNNDataReorderAsync). However,
+ *    this type of NDArray only exists for weight arrays, so we probably
+ *    shouldn't pass them to all operators.
+ *    In inference mode, the MKLDNN memory in the weight array will be
+ *    reordered to 5 dimensions.
+ * 4. Reshaped/sliced NDArray
+ * 5. Reshaped/sliced NDArray with MKLDNN layout (reshaped/sliced from a
+ *    normal NDArray with MKLDNN layout)
+ * 6. Reshaped/sliced NDArray with MKLDNN layout whose MKLDNN memory may have
+ *    different dimensions from the NDArray (result of MKLDNNDataReorderAsync).
+ *    Again, this type of NDArray only exists for weight arrays, so we probably
+ *    shouldn't pass them to all operators.
+ *    In inference mode, the MKLDNN memory in the weight array will be
+ *    reordered to 5 dimensions.
+ */
+std::vector<NDArray> GetTestInputArrays() {
+  TestArrayShapes tas = GetTestArrayShapes();
+  std::vector<nnvm::TShape> shapes = tas.shapes;
+  std::vector<mkldnn::memory::primitive_desc> pds = tas.pds;
+
+  std::vector<NDArray> in_arrs;
+  for (auto shape : shapes) {
+    in_arrs.emplace_back(shape, Context());
+    InitArray(&in_arrs.back());
+    for (auto pd : pds) {
+      if (shape.Size() != pd.get_size() / sizeof(mshadow::default_real_t))
+        continue;
+
+      in_arrs.emplace_back(shape, Context());
+      InitMKLDNNArray(&in_arrs.back(), pd);
+
+      // Get a sliced version.
+      NDArray arr(shape, Context());
+      InitMKLDNNArray(&arr, pd);
+      arr = arr.Slice(1, arr.shape()[0] - 1);
+      in_arrs.emplace_back(arr);
+    }
+  }
+  return in_arrs;
+}
+
+/*
+ * We want to get a few types of NDArrays for testing:
+ * 1. Normal NDArray
+ * 2. Normal NDArray with MKLDNN layout (output from an MKLDNN operator)
+ * 3. Normal NDArray with MKLDNN layout whose MKLDNN memory may have different
+ *    dimensions from the NDArray (result of MKLDNNDataReorderAsync). However,
+ *    this type of NDArray only exists for weight arrays, so we probably
+ *    shouldn't pass them to all operators.
+ *    In inference mode, the MKLDNN memory in the weight array will be
+ *    reordered to 5 dimensions.
+ * 4. Reshaped/sliced NDArray
+ * 5. Reused NDArray (this is created by the MXNet executor). This type of
+ *    NDArray can only be used as an output array.
+ * 6. Reused NDArray converted from an array with a different data type.
+ * 7. Reused reshaped/sliced NDArray.
+ * 8. Reused NDArray with MKLDNN layout.
+ * 9. Reused NDArray with MKLDNN layout of different dimensions.
+ */
+std::vector<NDArray> GetTestOutputArrays(const TShape &shape,
+                                         const std::vector<mkldnn::memory::primitive_desc> &pds) {
+  std::vector<NDArray> in_arrs;
+  // Type 1.
+  in_arrs.emplace_back(shape, Context());
+  InitArray(&in_arrs.back(), true);
+
+  // Type 4.
+  TShape tmp_shape = shape;
+  tmp_shape[0] = shape[0] * 2;
+  NDArray arr0(tmp_shape, Context());
+  InitArray(&arr0, true);
+  in_arrs.emplace_back(arr0.Slice(1, shape[0] + 1));
+
+  // Type 5.
+  // Get a reused version.
+  nnvm::TShape s(1);
+  s[0] = shape.Size();
+  NDArray arr1(s, Context());
+  arr1 = arr1.AsArray(shape, arr1.dtype());
+  InitArray(&arr1, true);
+  in_arrs.emplace_back(arr1);
+
+  // Type 6.
+  s[0] = shape.Size() * GetTypeSize(mshadow::default_type_flag);
+  NDArray arr2(s, Context(), true, mshadow::kUint8);
+  arr2 = arr2.AsArray(shape, mshadow::default_type_flag);
+  InitArray(&arr2, true);
+  in_arrs.emplace_back(arr2);
+
+  // Type 7
+  s[0] = shape.Size() * GetTypeSize(mshadow::default_type_flag) * 2;
+  NDArray arr3(s, Context(), true, mshadow::kUint8);
+  tmp_shape[0] = shape[0] * 2;
+  arr3 = arr3.AsArray(tmp_shape, mshadow::default_type_flag);
+  InitArray(&arr3, true);
+  in_arrs.emplace_back(arr3.Slice(1, shape[0] + 1));
+
+  for (auto pd : pds) {
+    if (shape.Size() != pd.get_size() / sizeof(mshadow::default_real_t))
+      continue;
+
+    // Type 2, 3.
+    in_arrs.emplace_back(shape, Context());
+    InitMKLDNNArray(&in_arrs.back(), pd, true);
+
+    // Type 8, 9.
+    // Get a reused version.
+    nnvm::TShape s(1);
+    s[0] = shape.Size();
+    NDArray arr = NDArray(s, Context());
+    arr = arr.AsArray(shape, arr.dtype());
+    InitMKLDNNArray(&arr, pd, true);
+    in_arrs.emplace_back(arr);
+  }
+  return in_arrs;
+}
+
+using VerifyFunc = std::function<void (const std::vector<NDArray *> &in_arrs, const NDArray &arr)>;
+
+void VerifyCopyResult(const std::vector<NDArray *> &in_arrs, const NDArray &arr) {
+  NDArray tmp1 = in_arrs[0]->Reorder2Default();
+  NDArray tmp2 = arr.Reorder2Default();
+  EXPECT_EQ(tmp1.shape().Size(), tmp2.shape().Size());
+  TBlob d1 = tmp1.data();
+  TBlob d2 = tmp2.data();
+  EXPECT_EQ(memcmp(d1.dptr_, d2.dptr_,
+                   tmp1.shape().Size() * sizeof(mshadow::default_real_t)), 0);
+}
+
+void VerifySumResult(const std::vector<NDArray *> &in_arrs, const NDArray &arr) {
+  NDArray in1 = in_arrs[0]->Reorder2Default();
+  NDArray in2 = in_arrs[1]->Reorder2Default();
+  NDArray out = arr.Reorder2Default();
+  EXPECT_EQ(in1.shape().Size(), in2.shape().Size());
+  EXPECT_EQ(in1.shape().Size(), out.shape().Size());
+
+  mshadow::default_real_t *d1 = in1.data().dptr<mshadow::default_real_t>();
+  mshadow::default_real_t *d2 = in2.data().dptr<mshadow::default_real_t>();
+  mshadow::default_real_t *o = out.data().dptr<mshadow::default_real_t>();
+  for (size_t i = 0; i < in1.shape().Size(); i++)
+    EXPECT_EQ(d1[i] + d2[i], o[i]);
+}
+
+TEST(MKLDNN_NDArray, CopyFrom) {
+  TestArrayShapes tas = GetTestArrayShapes();
+  std::vector<mkldnn::memory::primitive_desc> pds = tas.pds;
+
+  std::vector<NDArray> in_arrs = GetTestInputArrays();
+  for (auto in_arr : in_arrs) {
+    std::vector<NDArray> out_arrs = GetTestOutputArrays(in_arr.shape(), pds);
+    for (auto out_arr : out_arrs) {
+      if (in_arr.IsMKLDNNData() && in_arr.IsView())
+        in_arr = in_arr.Reorder2Default();
+      const mkldnn::memory *mem = in_arr.GetMKLDNNData();
+      out_arr.CopyFrom(*mem);
+      MKLDNNStream::Get()->Submit();
+      std::vector<NDArray *> inputs(1);
+      inputs[0] = &in_arr;
+      VerifyCopyResult(inputs, out_arr);
+    }
+  }
+}
+
+void TestUnaryOp(const OpAttrs &attrs, VerifyFunc verify_fn) {
+  std::vector<NDArray*> inputs(1);
+  std::vector<NDArray*> outputs(1);
+  std::vector<OpReqType> req(1);
+  std::vector<DispatchMode> dispatches = attrs.dispatches;
+
+  TestArrayShapes tas = GetTestArrayShapes();
+  std::vector<mkldnn::memory::primitive_desc> pds = tas.pds;
+
+  std::vector<NDArray> in_arrs = GetTestInputArrays();
+  for (auto in_arr : in_arrs) {
+    for (auto dispatch : dispatches) {
+      std::vector<NDArray> out_arrs = GetTestOutputArrays(in_arr.shape(), pds);
+      for (auto out_arr : out_arrs) {
+        req[0] = kWriteTo;
+        inputs[0] = &in_arr;
+        outputs[0] = &out_arr;
+        Imperative::Get()->InvokeOp(Context(), attrs.attrs, inputs,
+                                    outputs, req, dispatch, mxnet::OpStatePtr());
+        out_arr.WaitToRead();
+        verify_fn(inputs, out_arr);
+      }
+    }
+  }
+
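+  // Repeat with kWriteInplace, where the input array also serves as the output.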
+  for (auto dispatch : dispatches) {
+    in_arrs = GetTestInputArrays();
+    for (auto arr : in_arrs) {
+      // If the array is a view, we shouldn't write data to it.
+      if (arr.IsView())
+        continue;
+
+      NDArray orig = arr.Copy(arr.ctx());
+      req[0] = kWriteInplace;
+      inputs[0] = &arr;
+      outputs[0] = &arr;
+      Imperative::Get()->InvokeOp(Context(), attrs.attrs, inputs, outputs, req,
+                                  dispatch, mxnet::OpStatePtr());
+      arr.WaitToRead();
+      inputs[0] = &orig;
+      verify_fn(inputs, arr);
+    }
+  }
+}
+
+void TestBinaryOp(const OpAttrs &attrs, VerifyFunc verify_fn) {
+  std::vector<NDArray*> inputs(2);
+  std::vector<NDArray*> outputs(1);
+  std::vector<OpReqType> req(1);
+  std::vector<DispatchMode> dispatches = attrs.dispatches;
+
+  TestArrayShapes tas = GetTestArrayShapes();
+  std::vector<mkldnn::memory::primitive_desc> pds = tas.pds;
+
+  std::vector<NDArray> in_arrs = GetTestInputArrays();
+  for (auto in_arr1 : in_arrs) {
+    for (auto dispatch : dispatches) {
+      std::vector<NDArray> out_arrs = GetTestOutputArrays(in_arr1.shape(), pds);
+      for (auto out_arr : out_arrs) {
+        req[0] = kWriteTo;
+        inputs[0] = &in_arr1;
+        inputs[1] = &in_arr1;
+        outputs[0] = &out_arr;
+        Imperative::Get()->InvokeOp(Context(), attrs.attrs, inputs,
+                                    outputs, req, dispatch, mxnet::OpStatePtr());
+        out_arr.WaitToRead();
+        verify_fn(inputs, out_arr);
+      }
+    }
+  }
+
+  for (auto dispatch : dispatches) {
+    in_arrs = GetTestInputArrays();
+    for (auto arr : in_arrs) {
+      // If the array is a view, we shouldn't write data to it.
+      if (arr.IsView())
+        continue;
+
+      NDArray orig = arr.Copy(arr.ctx());
+      req[0] = kWriteInplace;
+      inputs[0] = &arr;
+      inputs[1] = &arr;
+      outputs[0] = &arr;
+      Imperative::Get()->InvokeOp(Context(), attrs.attrs, inputs, outputs, req,
+                                  dispatch, mxnet::OpStatePtr());
+      arr.WaitToRead();
+      std::vector<NDArray *> orig_inputs(2);
+      orig_inputs[0] = &orig;
+      orig_inputs[1] = &orig;
+      verify_fn(orig_inputs, arr);
+    }
+  }
+}
+
+TEST(IMPERATIVE, UnaryOp) {
+  OpAttrs attrs = GetCopyOp();
+  TestUnaryOp(attrs, VerifyCopyResult);
+}
+
+TEST(IMPERATIVE, BinaryOp) {
+  OpAttrs attrs = GetSumOp();
+  TestBinaryOp(attrs, VerifySumResult);
+}
+
 #endif
diff --git a/tests/python/gpu/test_gluon_model_zoo_gpu.py b/tests/python/gpu/test_gluon_model_zoo_gpu.py
index 378a822d193..273ad3d69ca 100644
--- a/tests/python/gpu/test_gluon_model_zoo_gpu.py
+++ b/tests/python/gpu/test_gluon_model_zoo_gpu.py
@@ -81,15 +81,16 @@ def test_inference():
             gpu_param = gpu_params.get(k)
             gpu_param.set_data(cpu_param.data().as_in_context(mx.gpu()))
 
-        # Run inference.
-        with autograd.record(train_mode=False):
-            cpu_out = cpu_model(mx.nd.array(data, ctx=mx.cpu()))
-            gpu_out = gpu_model(gpu_data)
-        out = cpu_out.asnumpy()
-        max_val = np.max(np.abs(out))
-        gpu_max_val = np.max(np.abs(gpu_out.asnumpy()))
-        eprint(model_name + ": CPU " + str(max_val) + ", GPU " + str(gpu_max_val))
-        assert_almost_equal(out / max_val, gpu_out.asnumpy() / max_val, rtol=1e-3, atol=1e-3)
+        for i in range(5):
+            # Run inference.
+            with autograd.record(train_mode=False):
+                cpu_out = cpu_model(mx.nd.array(data, ctx=mx.cpu()))
+                gpu_out = gpu_model(gpu_data)
+            out = cpu_out.asnumpy()
+            max_val = np.max(np.abs(out))
+            gpu_max_val = np.max(np.abs(gpu_out.asnumpy()))
+            eprint(model_name + ": CPU " + str(max_val) + ", GPU " + str(gpu_max_val))
+            assert_almost_equal(out / max_val, gpu_out.asnumpy() / max_val, rtol=1e-3, atol=1e-3)
 
 def get_nn_model(name):
     if "densenet" in name:
diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index 08c749e597e..f06c3c3d601 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -1834,6 +1834,27 @@ def test_batchnorm_backwards_notrain():
                 loss=y.square().sum()
             loss.backward(train_mode=False)
 
+
+@with_seed()
+def test_softmax_activation():
+    gpu_a = mx.nd.array([[3., 0.5, -0.5, 2., 7.],
+        [2., -.4, 7.,   3., 0.2]], ctx=mx.gpu(0))
+    cpu_a = mx.nd.array([[3., 0.5, -0.5, 2., 7.],
+        [2., -.4, 7.,   3., 0.2]], ctx=mx.cpu())
+
+    cpu_a.attach_grad()
+    gpu_a.attach_grad()
+    with mx.autograd.record():
+        gpu_y = mx.nd.SoftmaxActivation(data = gpu_a)
+        cpu_y = mx.nd.SoftmaxActivation(data = cpu_a)
+        assert_almost_equal(cpu_y.asnumpy(), gpu_y.asnumpy(), atol = 1e-3, rtol = 1e-3)
+
+        gpu_y.backward()
+        cpu_y.backward()
+        assert_almost_equal(cpu_a.grad.asnumpy(), gpu_a.grad.asnumpy(),
+                atol = 1e-3, rtol = 1e-3)
+
+
 if __name__ == '__main__':
     import nose
     nose.runmodule()
diff --git a/tests/python/mkl/data/test_mkldnn_test_mkldnn_model_model1.json b/tests/python/mkl/data/test_mkldnn_test_mkldnn_model_model1.json
new file mode 100644
index 00000000000..ba822f57848
--- /dev/null
+++ b/tests/python/mkl/data/test_mkldnn_test_mkldnn_model_model1.json
@@ -0,0 +1,770 @@
+{
+  "nodes": [
+    {
+      "op": "null", 
+      "name": "data", 
+      "inputs": []
+    }, 
+    {
+      "op": "null", 
+      "name": "conv1_1_weight", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "64", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "null", 
+      "name": "conv1_1_bias", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "64", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "Convolution", 
+      "name": "conv1_1", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "64", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": [[0, 0, 0], [1, 0, 0], [2, 0, 0]]
+    }, 
+    {
+      "op": "Activation", 
+      "name": "relu1_1", 
+      "attrs": {"act_type": "relu"}, 
+      "inputs": [[3, 0, 0]]
+    }, 
+    {
+      "op": "null", 
+      "name": "conv1_2_weight", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "64", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "null", 
+      "name": "conv1_2_bias", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "64", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "Convolution", 
+      "name": "conv1_2", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "64", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": [[4, 0, 0], [5, 0, 0], [6, 0, 0]]
+    }, 
+    {
+      "op": "Activation", 
+      "name": "relu1_2", 
+      "attrs": {"act_type": "relu"}, 
+      "inputs": [[7, 0, 0]]
+    }, 
+    {
+      "op": "Pooling", 
+      "name": "pool1", 
+      "attrs": {
+        "kernel": "(2, 2)", 
+        "pool_type": "max", 
+        "stride": "(2, 2)"
+      }, 
+      "inputs": [[8, 0, 0]]
+    }, 
+    {
+      "op": "null", 
+      "name": "conv2_1_weight", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "128", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "null", 
+      "name": "conv2_1_bias", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "128", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "Convolution", 
+      "name": "conv2_1", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "128", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": [[9, 0, 0], [10, 0, 0], [11, 0, 0]]
+    }, 
+    {
+      "op": "Activation", 
+      "name": "relu2_1", 
+      "attrs": {"act_type": "relu"}, 
+      "inputs": [[12, 0, 0]]
+    }, 
+    {
+      "op": "null", 
+      "name": "conv2_2_weight", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "128", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "null", 
+      "name": "conv2_2_bias", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "128", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "Convolution", 
+      "name": "conv2_2", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "128", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": [[13, 0, 0], [14, 0, 0], [15, 0, 0]]
+    }, 
+    {
+      "op": "Activation", 
+      "name": "relu2_2", 
+      "attrs": {"act_type": "relu"}, 
+      "inputs": [[16, 0, 0]]
+    }, 
+    {
+      "op": "Pooling", 
+      "name": "pool2", 
+      "attrs": {
+        "kernel": "(2, 2)", 
+        "pool_type": "max", 
+        "stride": "(2, 2)"
+      }, 
+      "inputs": [[17, 0, 0]]
+    }, 
+    {
+      "op": "null", 
+      "name": "conv3_1_weight", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "256", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "null", 
+      "name": "conv3_1_bias", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "256", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "Convolution", 
+      "name": "conv3_1", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "256", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": [[18, 0, 0], [19, 0, 0], [20, 0, 0]]
+    }, 
+    {
+      "op": "Activation", 
+      "name": "relu3_1", 
+      "attrs": {"act_type": "relu"}, 
+      "inputs": [[21, 0, 0]]
+    }, 
+    {
+      "op": "null", 
+      "name": "conv3_2_weight", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "256", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "null", 
+      "name": "conv3_2_bias", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "256", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "Convolution", 
+      "name": "conv3_2", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "256", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": [[22, 0, 0], [23, 0, 0], [24, 0, 0]]
+    }, 
+    {
+      "op": "Activation", 
+      "name": "relu3_2", 
+      "attrs": {"act_type": "relu"}, 
+      "inputs": [[25, 0, 0]]
+    }, 
+    {
+      "op": "null", 
+      "name": "conv3_3_weight", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "256", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "null", 
+      "name": "conv3_3_bias", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "256", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "Convolution", 
+      "name": "conv3_3", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "256", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": [[26, 0, 0], [27, 0, 0], [28, 0, 0]]
+    }, 
+    {
+      "op": "Activation", 
+      "name": "relu3_3", 
+      "attrs": {"act_type": "relu"}, 
+      "inputs": [[29, 0, 0]]
+    }, 
+    {
+      "op": "Pooling", 
+      "name": "pool3", 
+      "attrs": {
+        "kernel": "(2, 2)", 
+        "pool_type": "max", 
+        "pooling_convention": "full", 
+        "stride": "(2, 2)"
+      }, 
+      "inputs": [[30, 0, 0]]
+    }, 
+    {
+      "op": "null", 
+      "name": "conv4_1_weight", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "512", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "null", 
+      "name": "conv4_1_bias", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "512", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "Convolution", 
+      "name": "conv4_1", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "512", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": [[31, 0, 0], [32, 0, 0], [33, 0, 0]]
+    }, 
+    {
+      "op": "Activation", 
+      "name": "relu4_1", 
+      "attrs": {"act_type": "relu"}, 
+      "inputs": [[34, 0, 0]]
+    }, 
+    {
+      "op": "null", 
+      "name": "conv4_2_weight", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "512", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "null", 
+      "name": "conv4_2_bias", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "512", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "Convolution", 
+      "name": "conv4_2", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "512", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": [[35, 0, 0], [36, 0, 0], [37, 0, 0]]
+    }, 
+    {
+      "op": "Activation", 
+      "name": "relu4_2", 
+      "attrs": {"act_type": "relu"}, 
+      "inputs": [[38, 0, 0]]
+    }, 
+    {
+      "op": "null", 
+      "name": "conv4_3_weight", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "512", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "null", 
+      "name": "conv4_3_bias", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "512", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "Convolution", 
+      "name": "conv4_3", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "512", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": [[39, 0, 0], [40, 0, 0], [41, 0, 0]]
+    }, 
+    {
+      "op": "Activation", 
+      "name": "relu4_3", 
+      "attrs": {"act_type": "relu"}, 
+      "inputs": [[42, 0, 0]]
+    }, 
+    {
+      "op": "Pooling", 
+      "name": "pool4", 
+      "attrs": {
+        "kernel": "(2, 2)", 
+        "pool_type": "max", 
+        "stride": "(2, 2)"
+      }, 
+      "inputs": [[43, 0, 0]]
+    }, 
+    {
+      "op": "null", 
+      "name": "conv5_1_weight", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "512", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "null", 
+      "name": "conv5_1_bias", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "512", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "Convolution", 
+      "name": "conv5_1", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "512", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": [[44, 0, 0], [45, 0, 0], [46, 0, 0]]
+    }, 
+    {
+      "op": "Activation", 
+      "name": "relu5_1", 
+      "attrs": {"act_type": "relu"}, 
+      "inputs": [[47, 0, 0]]
+    }, 
+    {
+      "op": "null", 
+      "name": "conv5_2_weight", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "512", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "null", 
+      "name": "conv5_2_bias", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "512", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "Convolution", 
+      "name": "conv5_2", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "512", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": [[48, 0, 0], [49, 0, 0], [50, 0, 0]]
+    }, 
+    {
+      "op": "Activation", 
+      "name": "relu5_2", 
+      "attrs": {"act_type": "relu"}, 
+      "inputs": [[51, 0, 0]]
+    }, 
+    {
+      "op": "null", 
+      "name": "conv5_3_weight", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "512", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "null", 
+      "name": "conv5_3_bias", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "512", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "Convolution", 
+      "name": "conv5_3", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "num_filter": "512", 
+        "pad": "(1, 1)"
+      }, 
+      "inputs": [[52, 0, 0], [53, 0, 0], [54, 0, 0]]
+    }, 
+    {
+      "op": "Activation", 
+      "name": "relu5_3", 
+      "attrs": {"act_type": "relu"}, 
+      "inputs": [[55, 0, 0]]
+    }, 
+    {
+      "op": "Pooling", 
+      "name": "pool5", 
+      "attrs": {
+        "kernel": "(3, 3)", 
+        "pad": "(1, 1)", 
+        "pool_type": "max", 
+        "stride": "(1, 1)"
+      }, 
+      "inputs": [[56, 0, 0]]
+    }, 
+    {
+      "op": "null", 
+      "name": "fc6_weight", 
+      "attrs": {
+        "dilate": "(6, 6)", 
+        "kernel": "(3, 3)", 
+        "num_filter": "1024", 
+        "pad": "(6, 6)"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "null", 
+      "name": "fc6_bias", 
+      "attrs": {
+        "dilate": "(6, 6)", 
+        "kernel": "(3, 3)", 
+        "num_filter": "1024", 
+        "pad": "(6, 6)"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "Convolution", 
+      "name": "fc6", 
+      "attrs": {
+        "dilate": "(6, 6)", 
+        "kernel": "(3, 3)", 
+        "num_filter": "1024", 
+        "pad": "(6, 6)"
+      }, 
+      "inputs": [[57, 0, 0], [58, 0, 0], [59, 0, 0]]
+    }, 
+    {
+      "op": "Activation", 
+      "name": "relu6", 
+      "attrs": {"act_type": "relu"}, 
+      "inputs": [[60, 0, 0]]
+    }, 
+    {
+      "op": "null", 
+      "name": "fc7_weight", 
+      "attrs": {
+        "kernel": "(1, 1)", 
+        "num_filter": "1024", 
+        "pad": "(0, 0)"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "null", 
+      "name": "fc7_bias", 
+      "attrs": {
+        "kernel": "(1, 1)", 
+        "num_filter": "1024", 
+        "pad": "(0, 0)"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "Convolution", 
+      "name": "fc7", 
+      "attrs": {
+        "kernel": "(1, 1)", 
+        "num_filter": "1024", 
+        "pad": "(0, 0)"
+      }, 
+      "inputs": [[61, 0, 0], [62, 0, 0], [63, 0, 0]]
+    }, 
+    {
+      "op": "Activation", 
+      "name": "relu7", 
+      "attrs": {"act_type": "relu"}, 
+      "inputs": [[64, 0, 0]]
+    }, 
+    {
+      "op": "Pooling", 
+      "name": "global_pool", 
+      "attrs": {
+        "global_pool": "True", 
+        "kernel": "(7, 7)", 
+        "pool_type": "avg"
+      }, 
+      "inputs": [[65, 0, 0]]
+    }, 
+    {
+      "op": "null", 
+      "name": "fc8_weight", 
+      "attrs": {
+        "kernel": "(1, 1)", 
+        "num_filter": "1000"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "null", 
+      "name": "fc8_bias", 
+      "attrs": {
+        "kernel": "(1, 1)", 
+        "num_filter": "1000"
+      }, 
+      "inputs": []
+    }, 
+    {
+      "op": "Convolution", 
+      "name": "fc8", 
+      "attrs": {
+        "kernel": "(1, 1)", 
+        "num_filter": "1000"
+      }, 
+      "inputs": [[66, 0, 0], [67, 0, 0], [68, 0, 0]]
+    }, 
+    {
+      "op": "Flatten", 
+      "name": "flatten0", 
+      "inputs": [[69, 0, 0]]
+    }, 
+    {
+      "op": "null", 
+      "name": "softmax_label", 
+      "inputs": []
+    }, 
+    {
+      "op": "SoftmaxOutput", 
+      "name": "softmax", 
+      "inputs": [[70, 0, 0], [71, 0, 0]]
+    }
+  ], 
+  "arg_nodes": [
+    0, 
+    1, 
+    2, 
+    5, 
+    6, 
+    10, 
+    11, 
+    14, 
+    15, 
+    19, 
+    20, 
+    23, 
+    24, 
+    27, 
+    28, 
+    32, 
+    33, 
+    36, 
+    37, 
+    40, 
+    41, 
+    45, 
+    46, 
+    49, 
+    50, 
+    53, 
+    54, 
+    58, 
+    59, 
+    62, 
+    63, 
+    67, 
+    68, 
+    71
+  ], 
+  "node_row_ptr": [
+    0, 
+    1, 
+    2, 
+    3, 
+    4, 
+    5, 
+    6, 
+    7, 
+    8, 
+    9, 
+    11, 
+    12, 
+    13, 
+    14, 
+    15, 
+    16, 
+    17, 
+    18, 
+    19, 
+    21, 
+    22, 
+    23, 
+    24, 
+    25, 
+    26, 
+    27, 
+    28, 
+    29, 
+    30, 
+    31, 
+    32, 
+    33, 
+    35, 
+    36, 
+    37, 
+    38, 
+    39, 
+    40, 
+    41, 
+    42, 
+    43, 
+    44, 
+    45, 
+    46, 
+    47, 
+    49, 
+    50, 
+    51, 
+    52, 
+    53, 
+    54, 
+    55, 
+    56, 
+    57, 
+    58, 
+    59, 
+    60, 
+    61, 
+    63, 
+    64, 
+    65, 
+    66, 
+    67, 
+    68, 
+    69, 
+    70, 
+    71, 
+    72, 
+    73, 
+    74, 
+    75, 
+    76, 
+    77, 
+    78
+  ], 
+  "heads": [[72, 0, 0]], 
+  "attrs": {"mxnet_version": ["int", 10200]}
+}
diff --git a/tests/python/mkl/test_mkldnn.py b/tests/python/mkl/test_mkldnn.py
new file mode 100644
index 00000000000..2caf7af7eb4
--- /dev/null
+++ b/tests/python/mkl/test_mkldnn.py
@@ -0,0 +1,217 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+MKL-DNN related test cases
+"""
+import sys
+import os
+import numpy as np
+import mxnet as mx
+from mxnet.test_utils import assert_almost_equal
+from mxnet import gluon
+from mxnet.gluon import nn
+curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+sys.path.append(os.path.join(curr_path, '../unittest/'))
+from common import with_seed
+
+
+def test_mkldnn_model():
+    model = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data",
+                         "test_mkldnn_test_mkldnn_model_model1.json")
+    shape = (32, 3, 300, 300)
+    ctx = mx.cpu()
+
+    sym = mx.sym.load(model)
+    args = sym.list_arguments()
+    shapes = sym.infer_shape(data=shape)
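+    # infer_shape returns (arg_shapes, out_shapes, aux_shapes); only the
+    # argument shapes are needed here.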
+
+    def get_tensors(args, shapes, ctx):
+        return {x: mx.nd.ones(y, ctx) for x, y in zip(args, shapes)}
+
+    inputs = get_tensors(args, shapes[0], ctx)
+    grads = get_tensors(args, shapes[0], ctx)
+
+    try:
+        exe = sym.bind(ctx, inputs, args_grad=grads)
+        for _ in range(2):
+            exe.forward(is_train=True)
+            for y in exe.outputs:
+                y.wait_to_read()
+            exe.backward()
+            for y in exe.grad_arrays:
+                y.wait_to_read()
+    except:  # pylint: disable=bare-except
+        assert 0, "test_mkldnn_model exception in bind and execution"
+
+def test_mkldnn_ndarray_slice():
+    ctx = mx.cpu()
+    net = gluon.nn.HybridSequential()
+    with net.name_scope():
+        net.add(gluon.nn.Conv2D(channels=32, kernel_size=3, activation=None))
+    net.collect_params().initialize(ctx=ctx)
+    x = mx.nd.array(np.ones([32, 3, 224, 224]), ctx)
+    y = net(x)
+
+    # trigger computation on ndarray slice
+    assert_almost_equal(y[0].asnumpy()[0, 0, 0], 0.3376348)
+
+def test_mkldnn_engine_threading():
+    net = gluon.nn.HybridSequential()
+    with net.name_scope():
+        net.add(gluon.nn.Conv2D(channels=32, kernel_size=3, activation=None))
+    net.collect_params().initialize(ctx=mx.cpu())
+    class Dummy(gluon.data.Dataset):
+        def __len__(self):
+            return 2
+        def __getitem__(self, key):
+            return key, np.ones((3, 224, 224)), np.ones((10, ))
+
+    loader = gluon.data.DataLoader(Dummy(), batch_size=2, num_workers=1)
+
+    X = (32, 3, 32, 32)
+    # trigger mkldnn execution thread
+    y = net(mx.nd.array(np.ones(X))).asnumpy()
+
+    # Use the Gluon DataLoader to run the network from a different
+    # execution thread.
+    for _ in loader:
+        y = net(mx.nd.array(np.ones(X))).asnumpy()
+        # output should be 0.016711406 (the non-MKLDNN mode output)
+        assert_almost_equal(y[0, 0, 0, 0], 0.016711406)
+        break
+
+
+@with_seed()
+def test_reshape_before_conv():
+    class Net(gluon.HybridBlock):
+        """
+        test Net
+        """
+        def __init__(self, **kwargs):
+            super(Net, self).__init__(**kwargs)
+            with self.name_scope():
+                self.conv0 = nn.Conv2D(10, (3, 3))
+                self.conv1 = nn.Conv2D(5, (3, 3))
+
+        def hybrid_forward(self, F, x, *args, **kwargs):
+            x_reshape = x.reshape((0, 0, 20, 5))
+            y = self.conv0(x_reshape)
+            y_reshape = y.reshape((0, 0, 9, 6))
+            out = self.conv1(y_reshape)
+            return out
+    x = mx.nd.random.uniform(shape=(2, 4, 10, 10))
+    x.attach_grad()
+    net = Net()
+    net.collect_params().initialize()
+    with mx.autograd.record():
+        out1 = net(x)
+    out1.backward()
+    dx1 = x.grad
+    net.hybridize()
+    with mx.autograd.record():
+        out2 = net(x)
+    out2.backward()
+    mx.test_utils.assert_almost_equal(dx1.asnumpy(), x.grad.asnumpy(), rtol=1e-5, atol=1e-6)
+    mx.test_utils.assert_almost_equal(out1.asnumpy(), out2.asnumpy(), rtol=1e-5, atol=1e-6)
+
+
+@with_seed()
+def test_slice_before_conv():
+    class Net(gluon.HybridBlock):
+        """
+        test Net
+        """
+        def __init__(self, **kwargs):
+            super(Net, self).__init__(**kwargs)
+            with self.name_scope():
+                self.conv0 = nn.Conv2D(4, (3, 3))
+                self.conv1 = nn.Conv2D(4, (3, 3))
+
+        def hybrid_forward(self, F, x, *args, **kwargs):
+            x_slice = x.slice(begin=(0, 0, 0, 0), end=(2, 4, 10, 10))
+            y = self.conv0(x_slice)
+            y_slice = y.slice(begin=(1, 0, 2, 2), end=(2, 1, 7, 7))
+            out = self.conv1(y_slice)
+            return out
+    x = mx.nd.random.uniform(shape=(2, 10, 10, 10))
+    x.attach_grad()
+    net = Net()
+    net.collect_params().initialize()
+    with mx.autograd.record():
+        out1 = net(x)
+    out1.backward()
+    dx1 = x.grad
+    net.hybridize()
+    with mx.autograd.record():
+        out2 = net(x)
+    out2.backward()
+    mx.test_utils.assert_almost_equal(dx1.asnumpy(), x.grad.asnumpy(), rtol=1e-5, atol=1e-6)
+    mx.test_utils.assert_almost_equal(out1.asnumpy(), out2.asnumpy(), rtol=1e-5, atol=1e-6)
+
+
+@with_seed()
+def test_slice_reshape_before_conv():
+    class Net(gluon.HybridBlock):
+        """
+        test Net
+        """
+        def __init__(self, **kwargs):
+            super(Net, self).__init__(**kwargs)
+            with self.name_scope():
+                self.conv0 = nn.Conv2D(4, (3, 3))
+                self.conv1 = nn.Conv2D(4, (3, 3))
+
+        def hybrid_forward(self, F, x, *args, **kwargs):
+            x_slice = x.slice(begin=(0, 0, 0, 0), end=(2, 4, 8, 9))
+            y = self.conv0(x_slice)
+            y_reshape = y.reshape((0, 0, 14, 3))
+            out = self.conv1(y_reshape)
+            return out
+    x = mx.nd.random.uniform(shape=(2, 10, 10, 10))
+    x.attach_grad()
+    net = Net()
+    net.collect_params().initialize()
+    with mx.autograd.record():
+        out1 = net(x)
+    out1.backward()
+    dx1 = x.grad
+    net.hybridize()
+    with mx.autograd.record():
+        out2 = net(x)
+    out2.backward()
+    mx.test_utils.assert_almost_equal(dx1.asnumpy(), x.grad.asnumpy(), rtol=1e-5, atol=1e-6)
+    mx.test_utils.assert_almost_equal(out1.asnumpy(), out2.asnumpy(), rtol=1e-5, atol=1e-6)
+
+
+def test_mkldnn_sum_inplace_with_cpu_layout():
+    x_shape = (32, 3, 224, 224)
+    x_npy = np.ones(x_shape)
+    y_shape = (32, 32, 222, 222)
+    y_npy = np.ones(y_shape)
+    x = mx.sym.Variable("x")
+    y = mx.sym.Variable("y")
+    z = mx.symbol.Convolution(data=x, num_filter=32, kernel=(3, 3))
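+    # The convolution output carries an MKLDNN layout; summing it with the
+    # default-layout y exercises the in-place sum path.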
+    z = mx.sym.add_n(z, y)
+    exe = z.simple_bind(ctx=mx.cpu(), x=x_shape, y=y_shape)
+    out = exe.forward(is_train=False, x=x_npy, y=y_npy)[0]
+    assert_almost_equal(out[0].asnumpy()[0, 0, 0], 1.0)
+
+
+if __name__ == '__main__':
+    import nose
+    nose.runmodule()
diff --git a/tests/python/mkl/test_mkldnn_install.py b/tests/python/mkl/test_mkldnn_install.py
new file mode 100644
index 00000000000..c2f26df72f2
--- /dev/null
+++ b/tests/python/mkl/test_mkldnn_install.py
@@ -0,0 +1,56 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+MKL-DNN related test cases
+"""
+
+import sys
+import os
+import logging
+
+
+def test_mkldnn_install():
+    """
+    This test will verify that MXNet is built/installed correctly when
+    compiled with Intel MKL-DNN library. The method will try to import
+    the mxnet module and see if the mkldnn library is mapped to this
+    process's address space.
+    """
+    logging.basicConfig(level=logging.INFO)
+
+    if not sys.platform.startswith('linux'):
+        logging.info("Bypass mkldnn install test for non-Linux OS")
+        return
+
+    try:
+        #pylint: disable=unused-variable
+        import mxnet as mx
+    except (ImportError, OSError) as e:
+        assert 0, "Import mxnet error: %s. Please double check your build/" \
+            "install steps or environment variable settings" % str(e)
+
+    pid = os.getpid()
+    rc = os.system("cat /proc/" + str(pid) +
+                   "/maps | grep libmkldnn > /dev/null")
+
+    if rc == 0:
+        logging.info("MXNet is built/installed correctly with MKL-DNN")
+    else:
+        assert 0, "MXNet is built/installed incorrectly with MKL-DNN, please " \
+            "double check your build/install steps or environment " \
+            "variable settings"


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services